From 136eddef079f6c412c0ec8bc11d433fe1e009d87 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Tue, 25 Feb 2025 12:25:02 +0100
Subject: [PATCH] dataset update

---
NOTE(review): Dataset.from_list() builds a single Dataset rather than a
split-keyed DatasetDict, so the unchanged context line below that still
reads dataset["train"].column_names looks like it will fail once this
patch is applied. It presumably needs to become dataset.column_names in
a follow-up change; confirm against the rest of hft.py, including any
later ["train"] lookups on tokenized_dataset.

 hft.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hft.py b/hft.py
index e0796bd..9431524 100644
--- a/hft.py
+++ b/hft.py
@@ -2,7 +2,7 @@ import os
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
-from datasets import load_dataset
+from datasets import Dataset
 from PIL import Image
 import re
 import pytesseract
@@ -112,7 +112,7 @@ model = CustomModel.from_pretrained(model_name)
 # Przygotowanie datasetu
 catalog_path = "file_catalog.json"
 data = prepare_dataset("files", catalog_path)
-dataset = load_dataset("dict", data=data)
+dataset = Dataset.from_list(data)
 tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
 
 # Konfiguracja treningu