This commit is contained in:
l.gabrysiak 2025-02-25 22:00:00 +01:00
parent bf034eaf8f
commit eb69fe1633
2 changed files with 17 additions and 16 deletions

BIN
.DS_Store vendored

Binary file not shown.

33
hft.py
View File

@ -1,7 +1,7 @@
import os import os
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset from datasets import Dataset
import re import re
import json import json
@ -11,7 +11,6 @@ import pytesseract
from PIL import Image from PIL import Image
from collections import defaultdict from collections import defaultdict
from huggingface_hub import login from huggingface_hub import login
from transformers import DataCollatorForLanguageModeling
# Konfiguracja # Konfiguracja
os.environ['TORCH_USE_CUDA_DSA'] = '1' os.environ['TORCH_USE_CUDA_DSA'] = '1'
@ -174,20 +173,23 @@ class CustomModel(nn.Module):
def generate(self, *args, **kwargs): def generate(self, *args, **kwargs):
return self.base_model.generate(*args, **kwargs) return self.base_model.generate(*args, **kwargs)
class CustomTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
source_idx = inputs.pop("source_idx", None)
outputs = model(**inputs, source_idx=source_idx)
return (outputs.loss, outputs) if return_outputs else outputs.loss
class CustomDataCollator(DataCollatorForLanguageModeling): class CustomDataCollator(DataCollatorForLanguageModeling):
def torch_call(self, examples): def torch_call(self, examples):
batch = super().torch_call(examples) # Przetwórz podstawowe pola
input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
# Dodanie source_idx do batcha batch = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels
}
# Dodaj source_idx jeśli istnieje
if "source_idx" in examples[0]: if "source_idx" in examples[0]:
source_idx = [ex["source_idx"] for ex in examples] source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
batch["source_idx"] = torch.tensor(source_idx, dtype=torch.long) batch["source_idx"] = source_idx
return batch return batch
@ -205,7 +207,8 @@ def main():
print("\nBrak danych do treningu!") print("\nBrak danych do treningu!")
return return
# Przygotowanie datasetu dataset = Dataset.from_list(data)
def tokenize_function(examples): def tokenize_function(examples):
tokenized = tokenizer( tokenized = tokenizer(
examples["text"], examples["text"],
@ -218,13 +221,11 @@ def main():
"input_ids": tokenized["input_ids"].squeeze(), "input_ids": tokenized["input_ids"].squeeze(),
"attention_mask": tokenized["attention_mask"].squeeze(), "attention_mask": tokenized["attention_mask"].squeeze(),
"labels": tokenized["input_ids"].squeeze().clone(), "labels": tokenized["input_ids"].squeeze().clone(),
"source_idx": examples["source_idx"] "source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
} }
dataset = Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16) tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
# Model i trening
model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config) model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
model.source_mapper = source_mapper model.source_mapper = source_mapper
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")