dataset update
This commit is contained in:
parent
4730204816
commit
136eddef07
4
hft.py
4
hft.py
|
|
@@ -2,7 +2,7 @@ import os
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
||||||
from datasets import load_dataset
|
from datasets import Dataset
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import re
|
import re
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
@@ -112,7 +112,7 @@ model = CustomModel.from_pretrained(model_name)
|
||||||
# Przygotowanie datasetu
|
# Przygotowanie datasetu
|
||||||
catalog_path = "file_catalog.json"
|
catalog_path = "file_catalog.json"
|
||||||
data = prepare_dataset("files", catalog_path)
|
data = prepare_dataset("files", catalog_path)
|
||||||
dataset = load_dataset("dict", data=data)
|
dataset = Dataset.from_list(data)
|
||||||
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
|
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
|
||||||
|
|
||||||
# Konfiguracja treningu
|
# Konfiguracja treningu
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue