This commit is contained in:
l.gabrysiak 2025-02-25 21:54:33 +01:00
parent d5049b651c
commit 9afa461252
1 changed file with 65 additions and 107 deletions

hft.py

@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
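
A note on the new import: DataCollatorForLanguageModeling with mlm=False is the stock Hugging Face collator for causal-LM training; it pads each batch and derives labels from input_ids, masking pad positions with -100 so the loss ignores them. A minimal sketch of that behaviour, assuming the nano-mistral tokenizer ships without a pad token:

# Minimal sketch: what the newly imported collator produces for causal LM.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("crumb/nano-mistral")
if tokenizer.pad_token is None:                # assumption: no pad token set
    tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch = collator([tokenizer("Art. 1."), tokenizer("Art. 2. Dłuższy przykład.")])
print(batch["labels"])  # input_ids copied over, -100 wherever padding was added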
@@ -102,26 +102,22 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                 print(f"Znaleziono {len(articles)} fragmentów")
-                # Generate a larger number of examples
                 for i in range(0, len(articles)-1, 2):
-                    for chunk_size in [256, 512, 1024]:  # Different chunk sizes
                     article_number = articles[i]
                     article_content = articles[i+1]
-                        chunks = [article_content[j:j+chunk_size] for j in range(0, len(article_content), chunk_size//2)]
-                        chunks = [c for c in chunks if len(c) > 100]
-                        for chunk in chunks:
+                    if len(article_content) < 50:
+                        continue
                     source = f"{doc_type}, {article_number}"
                     source_mapper.add_source(source)
                     data.append({
-                        "text": f"{article_number} {chunk}",
+                        "text": f"{article_number} {article_content}",
                         "source_idx": source_mapper.get_idx(source)
                     })
             else:
                 clean_text = re.sub(r'\s+', ' ', text).strip()
-                for chunk_size in [256, 512, 768]:  # Three different sizes
-                    chunks = [clean_text[i:i+chunk_size] for i in range(0, len(clean_text), chunk_size//2)]
+                chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
                 chunks = [c for c in chunks if c.strip()]
                 for chunk in chunks:
@@ -129,7 +125,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                         "text": chunk,
                         "source_idx": -1
                     })
-                print(f"Dodano {len(chunks)*3} chunków")
+                print(f"Dodano {len(chunks)} chunków")
         except Exception as e:
             print(f"Błąd podczas przetwarzania pliku: {str(e)}")
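
Reviewer note on the two hunks above: the old code generated overlapping chunks at three sizes (stride chunk_size//2) and printed len(chunks)*3 accordingly; the new code keeps each article whole, skips fragments under 50 characters, and cuts free text into fixed non-overlapping 512-character windows, so the count no longer needs the *3. A minimal sketch of the simplified free-text path, with chunk_free_text as a hypothetical helper name:

# Hypothetical helper mirroring the new free-text logic: fixed,
# non-overlapping 512-char windows, blank chunks dropped.
def chunk_free_text(clean_text: str, size: int = 512) -> list[str]:
    chunks = [clean_text[i:i + size] for i in range(0, len(clean_text), size)]
    return [c for c in chunks if c.strip()]

print(len(chunk_free_text("a" * 1100)))  # 3 windows: 512, 512 and 76 chars

Non-overlapping windows also mean each character is trained on once per epoch, instead of roughly twice per chunk size under the old half-stride scheme.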
@@ -151,84 +147,38 @@ class CustomModel(nn.Module):
         self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
         self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)

-        # Fine-tune part of the model
         for param in self.base_model.parameters():
             param.requires_grad = False
         for param in self.base_model.get_output_embeddings().parameters():
             param.requires_grad = True
-        for param in self.base_model.get_input_embeddings().parameters():
-            param.requires_grad = True

     def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
         if source_idx is not None:
             valid_indices = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)
             source_embeds = self.source_embedding(valid_indices).unsqueeze(1)
             inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
-            return self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
-        return self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
+            return self.base_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                labels=labels,
+                **kwargs
+            )
+        return self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            **kwargs
+        )

     def generate(self, *args, **kwargs):
         return self.base_model.generate(*args, **kwargs)

 class CustomTrainer(Trainer):
-    def __init__(self, *args, **kwargs):
-        self.tokenizer = kwargs.pop('tokenizer', None)
-        super().__init__(*args, **kwargs)
-
-    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
-        labels = inputs.pop("labels")
+    def compute_loss(self, model, inputs, return_outputs=False):
         source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, labels=labels, source_idx=source_idx)
+        outputs = model(**inputs, source_idx=source_idx)
         return (outputs.loss, outputs) if return_outputs else outputs.loss
-
-    def evaluate(self):
-        questions = [
-            "Jakie są prawa pracownika według art. 1?",
-            "Kto jest pracownikiem według art. 2?",
-            "Jakie są obowiązki pracodawcy według art. 3?"
-        ]
-        print("\n" + "="*50 + "\nEWALUACJA\n" + "="*50)
-        for q in questions:
-            result = self.generate_answer(q)
-            print(f"\nPYTANIE: {q}")
-            print(f"ODPOWIEDŹ: {result['answer'][:500]}")
-            print(f"ŹRÓDŁA: {', '.join(result['sources'])}")
-            print("-"*80)
-        return {"loss": 0.0}
-
-    def generate_answer(self, question):
-        inputs = self.tokenizer(
-            f"[PYTANIE] {question} [KONTEKST]",
-            return_tensors="pt",
-            truncation=True,
-            max_length=512
-        ).to(self.model.base_model.device)
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=200,
-                temperature=0.5,
-                top_p=0.9,
-                repetition_penalty=2.0,
-                num_beams=3,
-                no_repeat_ngram_size=3
-            )
-        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        answer = answer.split("[KONTEKST]")[-1].strip()
-        sources = set()
-        for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
-            article_ref = match.group(0).strip()
-            for idx, source in self.model.source_mapper.idx_to_source.items():
-                if article_ref.lower() in source.lower():
-                    sources.add(source)
-        return {"answer": answer, "sources": list(sources)}

 def main():
     source_mapper = SourceMapper()
     model_name = "crumb/nano-mistral"
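
The forward pass above conditions generation on a document source by adding one learned vector per source to every token embedding, with clamp keeping indices inside the 10000-slot table. A standalone sketch of that broadcast, where the hidden size H is an assumption rather than a value read from the model config:

# Sketch: a per-example source vector broadcast-added across the sequence.
import torch
import torch.nn as nn

H = 768                                      # assumption: model hidden size
source_embedding = nn.Embedding(10000, H)
token_embeds = torch.randn(2, 16, H)         # (batch, seq_len, hidden)
source_idx = torch.tensor([3, 7])            # one source id per example

source_embeds = source_embedding(source_idx).unsqueeze(1)  # (2, 1, H)
inputs_embeds = token_embeds + source_embeds               # broadcast over seq_len
print(inputs_embeds.shape)                   # torch.Size([2, 16, 768])

One caveat on the slimmed-down compute_loss: recent transformers releases pass an extra num_items_in_batch argument to compute_loss, so dropping the old **kwargs catch-all may break on upgrade.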
@@ -245,49 +195,57 @@ def main():
     dataset = Dataset.from_list(data)

-    def tokenize(examples):
-        return tokenizer(
+    def tokenize_function(examples):
+        tokenized = tokenizer(
             examples["text"],
             truncation=True,
             padding="max_length",
             max_length=512,
             return_tensors="pt"
         )
+        return {
+            "input_ids": tokenized["input_ids"][0],
+            "attention_mask": tokenized["attention_mask"][0],
+            "labels": tokenized["input_ids"][0].clone(),
+            "source_idx": examples["source_idx"]
+        }

-    tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=16)
+    tokenized_dataset = dataset.map(tokenize_function, batched=False)

-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=8,
-        per_device_train_batch_size=4,
-        gradient_accumulation_steps=8,
-        learning_rate=5e-6,
-        weight_decay=0.01,
-        warmup_ratio=0.1,
-        fp16=torch.cuda.is_available(),
-        logging_steps=50,
-        save_strategy="epoch",
-        eval_strategy="no",
-        report_to="none",
-        remove_unused_columns=False
-    )
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False
+    )

     model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
     model.source_mapper = source_mapper
-    model.to("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=3,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
+        fp16=torch.cuda.is_available(),
+        logging_steps=10,
+        save_strategy="steps",
+        save_steps=1000,
+        report_to="none",
+        remove_unused_columns=False
+    )

     trainer = CustomTrainer(
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator,
         tokenizer=tokenizer
     )

     print("\nRozpoczęcie treningu...")
     trainer.train()
-
-    print("\nKońcowa ewaluacja...")
-    trainer.evaluate()

 if __name__ == "__main__":
     main()
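
A closing note on the reworked input pipeline: with batched=False, tokenize_function sees one example at a time, and since return_tensors="pt" always yields a (1, max_length) tensor, the [0] strips the batch dimension before storage; labels start as a clone of input_ids, as causal-LM training expects. The training setup also shrinks the effective batch size from 4*8=32 to 2*4=8 while raising the learning rate from 5e-6 to 2e-5. A quick sketch of the shapes, again assuming a pad token has to be supplied:

# Why tokenize_function indexes [0]: the tokenizer returns a batch of one.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("crumb/nano-mistral")
if tokenizer.pad_token is None:              # assumption: no pad token set
    tokenizer.pad_token = tokenizer.eos_token

enc = tokenizer("Art. 1. Przykładowy tekst.", truncation=True,
                padding="max_length", max_length=512, return_tensors="pt")
print(enc["input_ids"].shape)     # torch.Size([1, 512])
print(enc["input_ids"][0].shape)  # torch.Size([512]) -> what gets stored

Worth knowing: with mlm=False the collator rebuilds labels from input_ids at batching time (padding masked to -100), so the clone created in tokenize_function is effectively superseded once the collator runs.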