mod
This commit is contained in:
parent
bf034eaf8f
commit
eb69fe1633
33
hft.py
33
hft.py
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
|
||||
from datasets import Dataset
|
||||
import re
|
||||
import json
|
||||
|
|
@ -11,7 +11,6 @@ import pytesseract
|
|||
from PIL import Image
|
||||
from collections import defaultdict
|
||||
from huggingface_hub import login
|
||||
from transformers import DataCollatorForLanguageModeling
|
||||
|
||||
# Konfiguracja
|
||||
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
||||
|
|
@ -174,20 +173,23 @@ class CustomModel(nn.Module):
|
|||
def generate(self, *args, **kwargs):
    """Delegate generation to the wrapped base model.

    All positional and keyword arguments are forwarded unchanged to
    ``self.base_model.generate`` and its result is returned as-is.
    """
    wrapped = self.base_model
    return wrapped.generate(*args, **kwargs)
|
||||
|
||||
class CustomTrainer(Trainer):
    """Trainer that forwards the optional ``source_idx`` batch field to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the loss reported by the model.

        Args:
            model: Callable model; it is invoked with the remaining batch
                fields as keyword arguments plus ``source_idx``.
            inputs: Batch mapping. ``source_idx`` is popped from it (the
                mapping is mutated); ``None`` is used when the key is absent.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted only so that Trainer versions which
                pass this keyword do not raise ``TypeError``; it is not used.

        Returns:
            ``outputs.loss``, or the tuple ``(outputs.loss, outputs)`` when
            ``return_outputs`` is true.
        """
        # Pop first so ``source_idx`` is not passed twice through **inputs.
        source_idx = inputs.pop("source_idx", None)
        outputs = model(**inputs, source_idx=source_idx)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss
|
||||
|
||||
class CustomDataCollator(DataCollatorForLanguageModeling):
    """Collator that stacks pre-tokenized examples and carries ``source_idx`` along."""

    def torch_call(self, examples):
        """Build a batch dict of stacked tensors from a list of examples.

        Args:
            examples: Sequence of mappings, each holding ``input_ids``,
                ``attention_mask`` and ``labels``, and optionally
                ``source_idx``.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``; it
            also contains ``source_idx`` (dtype ``torch.long``) when the first
            example has that key.
        """
        # NOTE(review): torch.stack needs every example to have the same
        # length -- assumes tokenization already padded/truncated; confirm.

        def _stack(field):
            # as_tensor accepts lists and existing tensors alike and avoids
            # the copy-construct warning torch.tensor() emits for tensors.
            return torch.stack([torch.as_tensor(ex[field]) for ex in examples])

        # Basic fields. The parent's torch_call is intentionally not invoked:
        # its result was discarded by the batch built here.
        # NOTE(review): labels are used exactly as provided; padding positions
        # are not replaced with -100 here -- confirm this is intended.
        batch = {
            "input_ids": _stack("input_ids"),
            "attention_mask": _stack("attention_mask"),
            "labels": _stack("labels"),
        }

        # Add source_idx only if the examples carry it.
        if "source_idx" in examples[0]:
            batch["source_idx"] = _stack("source_idx").long()

        return batch
|
||||
|
||||
|
|
@ -205,7 +207,8 @@ def main():
|
|||
print("\nBrak danych do treningu!")
|
||||
return
|
||||
|
||||
# Przygotowanie datasetu
|
||||
dataset = Dataset.from_list(data)
|
||||
|
||||
def tokenize_function(examples):
|
||||
tokenized = tokenizer(
|
||||
examples["text"],
|
||||
|
|
@ -218,13 +221,11 @@ def main():
|
|||
"input_ids": tokenized["input_ids"].squeeze(),
|
||||
"attention_mask": tokenized["attention_mask"].squeeze(),
|
||||
"labels": tokenized["input_ids"].squeeze().clone(),
|
||||
"source_idx": examples["source_idx"]
|
||||
"source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
|
||||
}
|
||||
|
||||
dataset = Dataset.from_list(data)
|
||||
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
|
||||
|
||||
# Model i trening
|
||||
model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
|
||||
model.source_mapper = source_mapper
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
|
|
|||
Loading…
Reference in New Issue