Build the training dataset with Dataset.from_list instead of load_dataset.

Dataset.from_list returns a single Dataset rather than a split-keyed
DatasetDict, so the map() call now takes the column names from the dataset
itself instead of from dataset["train"].
NOTE(review): later code that indexes tokenized_dataset["train"] (not visible
in this patch) would need the same adjustment - confirm.
NOTE(review): the post-image blob id on the index line predates the
tokenized_dataset amendment below.

diff --git a/hft.py b/hft.py
index e0796bd..9431524 100644
--- a/hft.py
+++ b/hft.py
@@ -2,7 +2,7 @@ import os
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
-from datasets import load_dataset
+from datasets import Dataset
 from PIL import Image
 import re
 import pytesseract
@@ -112,7 +112,8 @@ model = CustomModel.from_pretrained(model_name)
 # Przygotowanie datasetu
 catalog_path = "file_catalog.json"
 data = prepare_dataset("files", catalog_path)
-dataset = load_dataset("dict", data=data)
-tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
+dataset = Dataset.from_list(data)
+# Dataset.from_list yields one Dataset with no "train" split; read its columns directly.
+tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
 
 # Konfiguracja treningu