From eb69fe163369b2a99fe32486c7e69bcb6c3c06d0 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Tue, 25 Feb 2025 22:00:00 +0100
Subject: [PATCH] mod

---
 .DS_Store | Bin 6148 -> 6148 bytes
 hft.py    | 33 +++++++++++++++++----------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index b2cf37d250f33e6f3721afac6800921711fd979d..48b9a8d1f51ca29d7565af1b0aa70ea425ba16d8 100644
GIT binary patch
delta 194
zcmZoMXfc=|#>B)qu~2NHo+2aL#DLw41(+Ba*(UQaKB&)0Hw;eB&n;j80iL-}fg~$K
z8bc;S4nrzKac;hgOHxjL5>S-GKK)*$;r3&W=qgeOs3^!lw&4I!JCY5XUo*~O+04$t
e&jEDdW=5v(%#-;=EIAk%7=Va@VRL}U7G?m{X)bjD

delta 68
zcmZoMXfc=|#>B`mu~2NHo+2aD#DLwC4MbQb^E2JuoWneuWn)7F<7Rdaeh#3T&5g|8
WnJ4p$SaL7`0V4wg)8+t?EzAItFA-e;

diff --git a/hft.py b/hft.py
index e3deafe..be2a268 100644
--- a/hft.py
+++ b/hft.py
@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
@@ -11,7 +11,6 @@
 import pytesseract
 from PIL import Image
 from collections import defaultdict
 from huggingface_hub import login
-from transformers import DataCollatorForLanguageModeling
 # Configuration
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
@@ -174,20 +173,23 @@ class CustomModel(nn.Module):
     def generate(self, *args, **kwargs):
         return self.base_model.generate(*args, **kwargs)
 
-class CustomTrainer(Trainer):
-    def compute_loss(self, model, inputs, return_outputs=False):
-        source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, source_idx=source_idx)
-        return (outputs.loss, outputs) if return_outputs else outputs.loss
-
 class CustomDataCollator(DataCollatorForLanguageModeling):
     def torch_call(self, examples):
-        batch = super().torch_call(examples)
+        # Process the basic fields
+        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
+        attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
+        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
 
-        # Add source_idx to the batch
+        batch = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels
+        }
+
+        # Add source_idx if present
         if "source_idx" in examples[0]:
-            source_idx = [ex["source_idx"] for ex in examples]
-            batch["source_idx"] = torch.tensor(source_idx, dtype=torch.long)
+            source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
+            batch["source_idx"] = source_idx
 
         return batch
 
@@ -205,7 +207,8 @@ def main():
         print("\nNo training data!")
         return
 
-    # Prepare the dataset
+    dataset = Dataset.from_list(data)
+
     def tokenize_function(examples):
         tokenized = tokenizer(
             examples["text"],
@@ -218,13 +221,11 @@
             "input_ids": tokenized["input_ids"].squeeze(),
             "attention_mask": tokenized["attention_mask"].squeeze(),
             "labels": tokenized["input_ids"].squeeze().clone(),
-            "source_idx": examples["source_idx"]
+            "source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
         }
 
-    dataset = Dataset.from_list(data)
     tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
 
-    # Model and training
     model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
     model.source_mapper = source_mapper
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
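
Usage note: the patch removes CustomTrainer and its compute_loss override, so training now has to go through the stock Trainer, with source_idx reaching the model through the batch built by CustomDataCollator. Below is a minimal wiring sketch, not part of the patch: it assumes CustomModel.forward accepts source_idx and returns an output with .loss when labels are present (as the deleted compute_loss implied), and the TrainingArguments values and output_dir are placeholders.

# Minimal sketch (assumptions as noted), continuing inside main() of hft.py
training_args = TrainingArguments(
    output_dir="./out",              # placeholder path
    per_device_train_batch_size=2,   # placeholder value
    num_train_epochs=1,              # placeholder value
    remove_unused_columns=False,     # keep source_idx in the batches
)

trainer = Trainer(
    model=model,                      # the CustomModel built earlier in main()
    args=training_args,
    train_dataset=tokenized_dataset,  # the dataset tokenized earlier in main()
    data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False),
)
trainer.train()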