dataset update

This commit is contained in:
l.gabrysiak 2025-02-25 12:25:02 +01:00
parent 4730204816
commit 136eddef07
1 changed file with 2 additions and 2 deletions

4
hft.py
View File

@@ -2,7 +2,7 @@ import os
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset from datasets import Dataset
from PIL import Image from PIL import Image
import re import re
import pytesseract import pytesseract
@@ -112,7 +112,7 @@ model = CustomModel.from_pretrained(model_name)
# Przygotowanie datasetu # Przygotowanie datasetu
catalog_path = "file_catalog.json" catalog_path = "file_catalog.json"
data = prepare_dataset("files", catalog_path) data = prepare_dataset("files", catalog_path)
dataset = load_dataset("dict", data=data) dataset = Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
# Konfiguracja treningu # Konfiguracja treningu