mod
This commit is contained in:
parent
bf034eaf8f
commit
eb69fe1633
33
hft.py
33
hft.py
|
|
@ -1,7 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
|
@ -11,7 +11,6 @@ import pytesseract
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from huggingface_hub import login
|
from huggingface_hub import login
|
||||||
from transformers import DataCollatorForLanguageModeling
|
|
||||||
|
|
||||||
# Konfiguracja
|
# Konfiguracja
|
||||||
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
||||||
|
|
@ -174,20 +173,23 @@ class CustomModel(nn.Module):
|
||||||
def generate(self, *args, **kwargs):
|
def generate(self, *args, **kwargs):
|
||||||
return self.base_model.generate(*args, **kwargs)
|
return self.base_model.generate(*args, **kwargs)
|
||||||
|
|
||||||
class CustomTrainer(Trainer):
|
|
||||||
def compute_loss(self, model, inputs, return_outputs=False):
|
|
||||||
source_idx = inputs.pop("source_idx", None)
|
|
||||||
outputs = model(**inputs, source_idx=source_idx)
|
|
||||||
return (outputs.loss, outputs) if return_outputs else outputs.loss
|
|
||||||
|
|
||||||
class CustomDataCollator(DataCollatorForLanguageModeling):
|
class CustomDataCollator(DataCollatorForLanguageModeling):
|
||||||
def torch_call(self, examples):
|
def torch_call(self, examples):
|
||||||
batch = super().torch_call(examples)
|
# Przetwórz podstawowe pola
|
||||||
|
input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
|
||||||
|
attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
|
||||||
|
labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
|
||||||
|
|
||||||
# Dodanie source_idx do batcha
|
batch = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"attention_mask": attention_mask,
|
||||||
|
"labels": labels
|
||||||
|
}
|
||||||
|
|
||||||
|
# Dodaj source_idx jeśli istnieje
|
||||||
if "source_idx" in examples[0]:
|
if "source_idx" in examples[0]:
|
||||||
source_idx = [ex["source_idx"] for ex in examples]
|
source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
|
||||||
batch["source_idx"] = torch.tensor(source_idx, dtype=torch.long)
|
batch["source_idx"] = source_idx
|
||||||
|
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
|
|
@ -205,7 +207,8 @@ def main():
|
||||||
print("\nBrak danych do treningu!")
|
print("\nBrak danych do treningu!")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Przygotowanie datasetu
|
dataset = Dataset.from_list(data)
|
||||||
|
|
||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
tokenized = tokenizer(
|
tokenized = tokenizer(
|
||||||
examples["text"],
|
examples["text"],
|
||||||
|
|
@ -218,13 +221,11 @@ def main():
|
||||||
"input_ids": tokenized["input_ids"].squeeze(),
|
"input_ids": tokenized["input_ids"].squeeze(),
|
||||||
"attention_mask": tokenized["attention_mask"].squeeze(),
|
"attention_mask": tokenized["attention_mask"].squeeze(),
|
||||||
"labels": tokenized["input_ids"].squeeze().clone(),
|
"labels": tokenized["input_ids"].squeeze().clone(),
|
||||||
"source_idx": examples["source_idx"]
|
"source_idx": torch.tensor(examples["source_idx"], dtype=torch.long)
|
||||||
}
|
}
|
||||||
|
|
||||||
dataset = Dataset.from_list(data)
|
|
||||||
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
|
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
|
||||||
|
|
||||||
# Model i trening
|
|
||||||
model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
|
model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
|
||||||
model.source_mapper = source_mapper
|
model.source_mapper = source_mapper
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue