dataset update

This commit is contained in:
l.gabrysiak 2025-02-25 12:25:02 +01:00
parent 4730204816
commit 136eddef07
1 changed file with 2 additions and 2 deletions

4
hft.py
View File

@ -2,7 +2,7 @@ import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from datasets import Dataset
from PIL import Image
import re
import pytesseract
@ -112,7 +112,7 @@ model = CustomModel.from_pretrained(model_name)
# Prepare the dataset: collect records via prepare_dataset() using the
# "files" argument and the JSON catalog path, then tokenize them.
catalog_path = "file_catalog.json"
data = prepare_dataset("files", catalog_path)
# NOTE(review): this excerpt appears to be a rendered diff with the +/- markers
# stripped; the two assignments to `dataset` below look like the removed line
# followed by its replacement — confirm against the repository.
dataset = load_dataset("dict", data=data)
dataset = Dataset.from_list(data)
# Tokenize in batches and drop the original columns from the result.
# NOTE(review): `Dataset.from_list` presumably yields a single `Dataset`
# rather than a split-keyed `DatasetDict`; if so, `dataset["train"]` would not
# resolve to a split here — confirm, and consider `dataset.column_names`.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
# Training configuration