This commit is contained in:
l.gabrysiak 2025-02-25 21:54:33 +01:00
parent d5049b651c
commit 9afa461252
1 changed file with 65 additions and 107 deletions

hft.py (+65 −107)

@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
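
The only change in this hunk pulls in DataCollatorForLanguageModeling. With mlm=False the collator pads each batch, copies input_ids into labels, and masks the padded positions with -100 so the loss ignores them. A minimal sketch of that behavior, assuming gpt2 as a stand-in checkpoint (not the model this script actually uses):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in checkpoint (assumption)
tokenizer.pad_token = tokenizer.eos_token          # gpt2 ships without a pad token
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

batch = collator([tokenizer("short text"), tokenizer("a somewhat longer example")])
print(batch["labels"][0])  # padded positions are -100, the rest mirrors input_ids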
@@ -102,34 +102,30 @@ def prepare_dataset(directory, catalog_path, source_mapper):
             print(f"Found {len(articles)} fragments")
-            # Generate a larger number of examples
             for i in range(0, len(articles)-1, 2):
-                for chunk_size in [256, 512, 1024]:  # different chunk sizes
-                    article_number = articles[i]
-                    article_content = articles[i+1]
+                article_number = articles[i]
+                article_content = articles[i+1]
                 if len(article_content) < 50:
                     continue
-                    chunks = [article_content[j:j+chunk_size] for j in range(0, len(article_content), chunk_size//2)]
-                    chunks = [c for c in chunks if len(c) > 100]
-                    for chunk in chunks:
-                        source = f"{doc_type}, {article_number}"
-                        source_mapper.add_source(source)
-                        data.append({
-                            "text": f"{article_number} {chunk}",
-                            "source_idx": source_mapper.get_idx(source)
-                        })
+                source = f"{doc_type}, {article_number}"
+                source_mapper.add_source(source)
+                data.append({
+                    "text": f"{article_number} {article_content}",
+                    "source_idx": source_mapper.get_idx(source)
+                })
         else:
             clean_text = re.sub(r'\s+', ' ', text).strip()
-            for chunk_size in [256, 512, 768]:  # three different sizes
-                chunks = [clean_text[i:i+chunk_size] for i in range(0, len(clean_text), chunk_size//2)]
-                chunks = [c for c in chunks if c.strip()]
-                for chunk in chunks:
-                    data.append({
-                        "text": chunk,
-                        "source_idx": -1
-                    })
-            print(f"Added {len(chunks)*3} chunks")
+            chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
+            chunks = [c for c in chunks if c.strip()]
+            for chunk in chunks:
+                data.append({
+                    "text": chunk,
+                    "source_idx": -1
+                })
+            print(f"Added {len(chunks)} chunks")
     except Exception as e:
         print(f"Error while processing file: {str(e)}")
@@ -151,83 +147,37 @@ class CustomModel(nn.Module):
         self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
         self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)

         # Fine-tune only part of the model
         for param in self.base_model.parameters():
             param.requires_grad = False
         for param in self.base_model.get_output_embeddings().parameters():
             param.requires_grad = True
         for param in self.base_model.get_input_embeddings().parameters():
             param.requires_grad = True

     def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
         if source_idx is not None:
             valid_indices = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)
             source_embeds = self.source_embedding(valid_indices).unsqueeze(1)
             inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
-            return self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
-        return self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
+            return self.base_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                labels=labels,
+                **kwargs
+            )
+        return self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            **kwargs
+        )

     def generate(self, *args, **kwargs):
         return self.base_model.generate(*args, **kwargs)
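
CustomModel conditions the language model on a document source by adding one learned vector per source to every token embedding. A standalone sketch of that mechanism; the dimensions here (hidden size 8, 100 sources) are toy values, not the script's:

import torch
import torch.nn as nn

hidden_size, num_sources = 8, 100                 # toy values for illustration
source_embedding = nn.Embedding(num_sources, hidden_size)
token_embeds = torch.randn(2, 5, hidden_size)     # (batch, seq_len, hidden)

source_idx = torch.tensor([3, -1])                # -1 marks "no source"
valid = torch.clamp(source_idx, 0, num_sources - 1)
inputs_embeds = token_embeds + source_embedding(valid).unsqueeze(1)  # broadcast over seq_len
print(inputs_embeds.shape)                        # torch.Size([2, 5, 8])

Note that the clamp maps the -1 "no source" marker onto index 0, so sourceless examples share an embedding with the first real source; a dedicated padding index would keep them apart.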
 class CustomTrainer(Trainer):
     def __init__(self, *args, **kwargs):
         self.tokenizer = kwargs.pop('tokenizer', None)
         super().__init__(*args, **kwargs)

-    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
-        labels = inputs.pop("labels")
+    def compute_loss(self, model, inputs, return_outputs=False):
         source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, labels=labels, source_idx=source_idx)
+        outputs = model(**inputs, source_idx=source_idx)
         return (outputs.loss, outputs) if return_outputs else outputs.loss
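
The override exists because Trainer forwards every remaining tensor in inputs straight into model(**inputs), and a forward signature without a source_idx parameter would reject the extra column; popping it first (together with remove_unused_columns=False later in this diff) is what lets it reach the custom model. The failure mode in miniature, with a hypothetical stand-in forward:

def forward(input_ids=None, labels=None):          # stand-in for a model's forward
    return {"loss": 0.0}

inputs = {"input_ids": [1, 2], "labels": [1, 2], "source_idx": 7}
source_idx = inputs.pop("source_idx", None)        # consume the custom column first
forward(**inputs)                                  # fine: only known kwargs remain
# forward(**inputs, source_idx=7)                  # TypeError: unexpected keyword argument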
-    def evaluate(self):
-        questions = [
-            "What are the employee's rights under Art. 1?",
-            "Who counts as an employee under Art. 2?",
-            "What are the employer's obligations under Art. 3?"
-        ]
-        print("\n" + "="*50 + "\nEVALUATION\n" + "="*50)
-        for q in questions:
-            result = self.generate_answer(q)
-            print(f"\nQUESTION: {q}")
-            print(f"ANSWER: {result['answer'][:500]}")
-            print(f"SOURCES: {', '.join(result['sources'])}")
-            print("-"*80)
-        return {"loss": 0.0}
-
-    def generate_answer(self, question):
-        inputs = self.tokenizer(
-            f"[PYTANIE] {question} [KONTEKST]",
-            return_tensors="pt",
-            truncation=True,
-            max_length=512
-        ).to(self.model.base_model.device)
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=200,
-                temperature=0.5,
-                top_p=0.9,
-                repetition_penalty=2.0,
-                num_beams=3,
-                no_repeat_ngram_size=3
-            )
-        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        answer = answer.split("[KONTEKST]")[-1].strip()
-        sources = set()
-        for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
-            article_ref = match.group(0).strip()
-            for idx, source in self.model.source_mapper.idx_to_source.items():
-                if article_ref.lower() in source.lower():
-                    sources.add(source)
-        return {"answer": answer, "sources": list(sources)}

 def main():
     source_mapper = SourceMapper()
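
The deleted generate_answer mapped generated answers back to sources with a regex over article references. The pattern, checked standalone (the sample sentence is made up):

import re

answer = "Under Art. 22 and art.10 an employee is entitled to leave."
refs = [m.group(0) for m in re.finditer(r'(?i)art\.?\s*\d+', answer)]
print(refs)  # ['Art. 22', 'art.10']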
@@ -245,49 +195,57 @@ def main():
     dataset = Dataset.from_list(data)

-    def tokenize(examples):
-        return tokenizer(
+    def tokenize_function(examples):
+        tokenized = tokenizer(
             examples["text"],
             truncation=True,
             padding="max_length",
             max_length=512,
             return_tensors="pt"
         )
-    tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=16)
+        return {
+            "input_ids": tokenized["input_ids"][0],
+            "attention_mask": tokenized["attention_mask"][0],
+            "labels": tokenized["input_ids"][0].clone(),
+            "source_idx": examples["source_idx"]
+        }
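
tokenize_function indexes [0] because return_tensors="pt" always yields tensors with a leading batch dimension, even for the single example that non-batched mapping passes in. A quick check, again with gpt2 as a stand-in tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in checkpoint (assumption)
tok.pad_token = tok.eos_token                # needed for padding="max_length"
enc = tok("one example", return_tensors="pt", padding="max_length", max_length=8)
print(enc["input_ids"].shape)                # torch.Size([1, 8]) -> hence the [0]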
-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=8,
-        per_device_train_batch_size=4,
-        gradient_accumulation_steps=8,
-        learning_rate=5e-6,
-        weight_decay=0.01,
-        warmup_ratio=0.1,
-        fp16=torch.cuda.is_available(),
-        logging_steps=50,
-        save_strategy="epoch",
-        eval_strategy="no",
-        report_to="none",
-        remove_unused_columns=False
-    )
+    tokenized_dataset = dataset.map(tokenize_function, batched=False)

+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False
+    )

     model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
     model.source_mapper = source_mapper
-    model.to("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)

+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=3,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
+        fp16=torch.cuda.is_available(),
+        logging_steps=10,
+        save_strategy="steps",
+        save_steps=1000,
+        report_to="none",
+        remove_unused_columns=False
+    )
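
Worth noting on the hyperparameter swap: with gradient accumulation, the effective batch per optimizer step is per_device_train_batch_size × gradient_accumulation_steps (times the device count), so on a single GPU this commit shrinks it from 32 to 8 while raising the learning rate from 5e-6 to 2e-5:

# effective examples per optimizer step, single device
old_effective = 4 * 8   # previous TrainingArguments: 32
new_effective = 2 * 4   # this commit: 8
print(old_effective, new_effective)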
     trainer = CustomTrainer(
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator,
         tokenizer=tokenizer
     )
print("\nRozpoczęcie treningu...")
trainer.train()
print("\nKońcowa ewaluacja...")
trainer.evaluate()
if __name__ == "__main__":
main()