diff --git a/hft.py b/hft.py
index d1e432a..266ccf3 100644
--- a/hft.py
+++ b/hft.py
@@ -1,8 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-#from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
-from transformers import GPTNeoForCausalLM, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM # Import change
+from transformers import GPTNeoForCausalLM, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
 from datasets import Dataset
 from PIL import Image
 import re
@@ -14,13 +13,12 @@
 from torch.cuda.amp import autocast
 from collections import defaultdict
 from huggingface_hub import login
-import torch
 
 torch.cuda.empty_cache()
+# Log in to the Hugging Face Hub
 login(token="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")
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-# New class for managing sources
 class SourceMapper:
     def __init__(self):
         self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
@@ -78,7 +76,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
         doc_type = identify_legal_document(file, file_catalog)
 
         if doc_type != "Opracowanie własne":
-            articles = re.split(r'(Art\.\s+\d+[\.\s])', text)
+            articles = re.split(r'(Art\.?\s+\d+[\.\s])', text)
             for i in range(1, len(articles), 2):
                 article_number = articles[i].strip()
                 article_content = articles[i+1].strip() if i+1 < len(articles) else ""
@@ -94,7 +92,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
             for chunk in chunks:
                 data.append({
                     "text": chunk,
-                    "source_idx": -1 # No source
+                    "source_idx": -1
                 })
 
     return data
@@ -114,13 +112,10 @@ def custom_collate_fn(batch):
     input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
     attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
     labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
-
-    # Add a default source_idx if it does not exist
     source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
-
     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}
 
-class CustomModel(GPTNeoForCausalLM): # Base class change
+class CustomModel(GPTNeoForCausalLM):
     def __init__(self, config):
         super().__init__(config)
         self.source_embedding = nn.Embedding(
@@ -130,48 +125,42 @@ class CustomModel(GPTNeoForCausalLM): # Base class change
         )
     def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
-        outputs = super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            labels=labels,
-            **kwargs
-        )
-
-        if source_idx is not None:
-            source_embeds = self.source_embedding(source_idx).unsqueeze(1)
-            outputs.logits += source_embeds
-
+        with autocast():
+            outputs = super().forward(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels,
+                **kwargs
+            )
+            if source_idx is not None:
+                source_embeds = self.source_embedding(source_idx).unsqueeze(1)
+                outputs.logits += source_embeds
         return outputs
 
 class CustomTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
         labels = inputs.pop("labels")
-        source_idx = inputs.pop("source_idx")
-        outputs = model(**inputs, labels=labels, source_idx=source_idx)
+        with autocast():
+            source_idx = inputs.pop("source_idx")
+            outputs = model(**inputs, labels=labels, source_idx=source_idx)
         return (outputs.loss, outputs) if return_outputs else outputs.loss
 
-# Initialize components
 source_mapper = SourceMapper()
 
-model_name = "EleutherAI/gpt-neo-2.7B" #"google/gemma-2-2b"
+model_name = "EleutherAI/gpt-neo-2.7B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
 
-# Prepare the data
-catalog_path = "file_catalog.json"
-data = prepare_dataset("files", catalog_path, source_mapper)
+data = prepare_dataset("files", "file_catalog.json", source_mapper)
 dataset = Dataset.from_list(data)
 tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=32)
 
-# Initialize the model
 config = AutoModelForCausalLM.from_pretrained(model_name).config
-#model = CustomModel.from_pretrained(model_name, config=config)
 model = CustomModel.from_pretrained(model_name)
 model.config.gradient_checkpointing = True
 model.config.use_cache = False
 model.resize_token_embeddings(len(tokenizer))
 model.gradient_checkpointing_enable()
 
-# Training configuration
 training_args = TrainingArguments(
     output_dir="./results",
     num_train_epochs=3,
@@ -182,18 +171,16 @@ training_args = TrainingArguments(
     save_strategy="steps",
     save_steps=1000,
     report_to="none",
-    gradient_checkpointing=True,
-    per_device_train_batch_size=4, # batch size for training
-    per_device_eval_batch_size=4, # batch size for evaluation
-    logging_dir='./logs' # log directory
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    logging_dir='./logs'
 )
 
-# Training
 trainer = CustomTrainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_dataset,
-    data_collator=custom_collate_fn # Use the custom collate_fn
+    data_collator=custom_collate_fn
)
 
 trainer.train()