This commit is contained in:
l.gabrysiak 2025-02-25 22:00:00 +01:00
parent bf034eaf8f
commit eb69fe1633
2 changed files with 17 additions and 16 deletions

BIN
.DS_Store vendored

Binary file not shown.

33
hft.py
View File

@@ -1,7 +1,7 @@
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import re
import json
@@ -11,7 +11,6 @@ import pytesseract
from PIL import Image
from collections import defaultdict
from huggingface_hub import login
from transformers import DataCollatorForLanguageModeling
# Konfiguracja
os.environ['TORCH_USE_CUDA_DSA'] = '1'
@@ -174,20 +173,23 @@ class CustomModel(nn.Module):
def generate(self, *args, **kwargs):
    """Delegate text generation to the wrapped base model.

    Forwards all positional and keyword arguments unchanged to
    ``self.base_model.generate`` and returns its result.
    """
    return self.base_model.generate(*args, **kwargs)
class CustomTrainer(Trainer):
    """Trainer that forwards the custom ``source_idx`` batch field to the model.

    The dataset attaches a per-example ``source_idx`` tensor that the stock
    HF ``Trainer``/model call path does not know about, so it is popped out
    of ``inputs`` and passed explicitly.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments newer transformers versions pass
        # (e.g. ``num_items_in_batch`` since v4.46) — without it this method
        # raises TypeError when called by the Trainer loop.
        source_idx = inputs.pop("source_idx", None)
        outputs = model(**inputs, source_idx=source_idx)
        # Mirror the base Trainer contract: (loss, outputs) or just the loss.
        return (outputs.loss, outputs) if return_outputs else outputs.loss
class CustomDataCollator(DataCollatorForLanguageModeling):
    """Collator for pre-tokenized examples that carries ``source_idx`` through.

    The dataset already provides ``input_ids``, ``attention_mask`` and
    ``labels`` per example, so this collator only stacks them into batch
    tensors (instead of delegating to the parent's masking logic) and
    preserves the optional ``source_idx`` field.
    """

    def torch_call(self, examples):
        # Stack the pre-tokenized per-example fields into batch tensors.
        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
        attention_mask = torch.stack(
            [torch.tensor(ex["attention_mask"]) for ex in examples]
        )
        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
        # Carry the custom source_idx feature through if the dataset provides it.
        if "source_idx" in examples[0]:
            batch["source_idx"] = torch.stack(
                [torch.tensor(ex["source_idx"]) for ex in examples]
            )
        return batch
@@ -205,7 +207,8 @@ def main():
print("\nBrak danych do treningu!")
return
# Przygotowanie datasetu
dataset = Dataset.from_list(data)
def tokenize_function(examples):
tokenized = tokenizer(
examples["text"],
@@ -218,13 +221,11 @@ def main():
"input_ids": tokenized["input_ids"].squeeze(),
"attention_mask": tokenized["attention_mask"].squeeze(),
"labels": tokenized["input_ids"].squeeze().clone(),
"source_idx": examples["source_idx"]
"source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
}
dataset = Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
# Model i trening
model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
model.source_mapper = source_mapper
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")