diff --git a/.DS_Store b/.DS_Store
index b2cf37d..48b9a8d 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/hft.py b/hft.py
index e3deafe..be2a268 100644
--- a/hft.py
+++ b/hft.py
@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
@@ -11,7 +11,6 @@ import pytesseract
 from PIL import Image
 from collections import defaultdict
 from huggingface_hub import login
-from transformers import DataCollatorForLanguageModeling
 
 # Configuration
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
@@ -174,20 +173,23 @@ class CustomModel(nn.Module):
     def generate(self, *args, **kwargs):
         return self.base_model.generate(*args, **kwargs)
 
-class CustomTrainer(Trainer):
-    def compute_loss(self, model, inputs, return_outputs=False):
-        source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, source_idx=source_idx)
-        return (outputs.loss, outputs) if return_outputs else outputs.loss
-
 class CustomDataCollator(DataCollatorForLanguageModeling):
     def torch_call(self, examples):
-        batch = super().torch_call(examples)
+        # Process the basic fields
+        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
+        attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
+        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
 
-        # Add source_idx to the batch
+        batch = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels
+        }
+
+        # Add source_idx if present
         if "source_idx" in examples[0]:
-            source_idx = [ex["source_idx"] for ex in examples]
-            batch["source_idx"] = torch.tensor(source_idx, dtype=torch.long)
+            source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
+            batch["source_idx"] = source_idx
 
         return batch
 
@@ -205,7 +207,8 @@ def main():
         print("\nNo data for training!")
         return
 
-    # Prepare the dataset
+    dataset = Dataset.from_list(data)
+
     def tokenize_function(examples):
         tokenized = tokenizer(
             examples["text"],
@@ -218,13 +221,11 @@ def main():
             "input_ids": tokenized["input_ids"].squeeze(),
             "attention_mask": tokenized["attention_mask"].squeeze(),
             "labels": tokenized["input_ids"].squeeze().clone(),
-            "source_idx": examples["source_idx"]
+            "source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
         }
 
-    dataset = Dataset.from_list(data)
     tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
 
-    # Model and training
     model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
     model.source_mapper = source_mapper
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
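
Below is a minimal, self-contained sketch (not part of the diff) of how the new CustomDataCollator carries the extra source_idx field through to the batch alongside the standard language-modeling fields. The tokenizer checkpoint ("gpt2") and the toy pre-tokenized examples are illustrative assumptions only, not taken from the repository.

# Standalone sketch; the "gpt2" checkpoint and the toy examples are assumptions.
import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling


class CustomDataCollator(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        # Stack the already-tokenized fields into batch tensors
        batch = {
            "input_ids": torch.stack([torch.tensor(ex["input_ids"]) for ex in examples]),
            "attention_mask": torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples]),
            "labels": torch.stack([torch.tensor(ex["labels"]) for ex in examples]),
        }
        # Forward the per-example source index to the model, if present
        if "source_idx" in examples[0]:
            batch["source_idx"] = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
        return batch


tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

collator = CustomDataCollator(tokenizer=tokenizer, mlm=False)
examples = [
    {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [1, 2, 3], "source_idx": 0},
    {"input_ids": [4, 5, 6], "attention_mask": [1, 1, 1], "labels": [4, 5, 6], "source_idx": 1},
]
batch = collator(examples)
print(batch["input_ids"].shape)  # torch.Size([2, 3])
print(batch["source_idx"])       # tensor([0, 1])

Because the fields arrive from tokenize_function as fixed-length tensors, plain stacking is enough here, and the collated keys are passed straight into the model's forward; dropping the CustomTrainer.compute_loss override therefore works as long as CustomModel.forward accepts source_idx directly.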