diff --git a/allegro.py b/allegro.py
index e0cac0f..5f221f1 100644
--- a/allegro.py
+++ b/allegro.py
@@ -1,119 +1,9 @@
-import os
-import re
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
-from datasets import Dataset
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
-# Configuration
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-MODEL_NAME = "allegro/herbert-base-cased"
-SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]
-TEXT_FILE_PATH = "./docs/kodekspracy.txt"  # Change this to the correct path
+model = AutoModelForSeq2SeqLM.from_pretrained("allegro/multislav-5lang")
+tokenizer = AutoTokenizer.from_pretrained("allegro/multislav-5lang")
 
-def prepare_dataset_from_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as f:
-        text = f.read()
+model.save_pretrained("./models/ably")
+tokenizer.save_pretrained("./models/ably")
 
-    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
-
-    formatted_articles = []
-    for article in articles:
-        article = ' '.join(article.strip().split())
-
-        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
-        if art_match:
-            art_number = art_match.group(1)
-            art_text = art_match.group(2)
-
-            paragraphs = re.split(r'(§\s*\d+\.)', art_text)
-            if len(paragraphs) > 1:
-                formatted_paragraphs = []
-                for i in range(1, len(paragraphs), 2):
-                    para_num = paragraphs[i].strip()
-                    para_text = paragraphs[i+1].strip()
-                    formatted_paragraphs.append(f"{para_num} {para_text}")
-                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
-            else:
-                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
-
-            formatted_articles.append({"text": formatted})
-
-            questions = [
-                f"Zacytuj artykuł {art_number} Kodeksu pracy.",
-                f"Co mówi artykuł {art_number} Kodeksu pracy?",
-                f"Podaj treść artykułu {art_number} Kodeksu pracy."
-            ]
-            for question in questions:
-                formatted_articles.append({"text": f"{question}\n{formatted}"})
-
-    return formatted_articles
-
-def main():
-    # Initialize the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
-
-    print(f"Pad token: {tokenizer.pad_token}")
-    print(f"Pad token ID: {tokenizer.pad_token_id}")
-
-    # Prepare the data
-    data = prepare_dataset_from_file(TEXT_FILE_PATH)
-    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})
-
-    # Tokenization
-    def tokenize_function(examples):
-        tokenized = tokenizer(
-            examples["text"],
-            truncation=True,
-            padding="max_length",
-            max_length=512,
-            return_tensors="pt"
-        )
-        tokenized["labels"] = tokenized["input_ids"].clone()
-        return tokenized
-
-    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
-
-    # Model and data collator
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer))
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False
-    )
-
-    # Training configuration
-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=32,
-        per_device_train_batch_size=2,
-        learning_rate=1e-5,
-        logging_steps=10,
-        weight_decay=0.01,
-        report_to="none",
-        save_strategy="steps",
-        save_steps=500,
-        evaluation_strategy="steps",
-        eval_steps=500,
-        load_best_model_at_end=True,
-    )
-
-    # Trainer
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset,
-        eval_dataset=tokenized_dataset,
-        data_collator=data_collator
-    )
-
-    print("Starting training...")
-    trainer.train()
-    trainer.save_model("./trained_model/allegro")
-    tokenizer.save_pretrained("./trained_model/allegro")
-
-if __name__ == "__main__":
-    main()
+print("✅ Model downloaded and saved!")
\ No newline at end of file
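
A minimal smoke test of the checkpoint the new script writes, for anyone reviewing this change. This sketch is not part of the diff: it assumes the MultiSlav many-to-many models select the output language with a Marian-style target prefix token such as `>>pol<<` (verify the exact token set on the allegro/multislav-5lang model card), and reuses the `./models/ably` directory saved above.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the checkpoint that the new script saved locally.
model = AutoModelForSeq2SeqLM.from_pretrained("./models/ably")
tokenizer = AutoTokenizer.from_pretrained("./models/ably")

# Assumption: the target language is chosen with a Marian-style prefix
# token; ">>pol<<" should request Polish output.
text = ">>pol<< This model translates between several Slavic languages."

inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

Note that a wrong or missing prefix token will still generate text, so check that the output is actually in the expected language rather than just checking that the script runs.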