import os
import re
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
MODEL_NAME = "gpt2"
SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]
TEXT_FILE_PATH = "./docs/kodekspracy.txt"  # Change to the correct path


def prepare_dataset_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Extract individual articles with a regular expression
    articles = re.findall(
        r"Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)",
        text,
        flags=re.DOTALL,
    )

    formatted_articles = []
    for article in articles:
        # Collapse redundant whitespace
        article = " ".join(article.strip().split())

        # Split out the article number and its body
        art_match = re.match(r"Art\.\s*(\d+[a-z]*)\.?\s*(.*)", article, re.DOTALL)
        if art_match:
            art_number = art_match.group(1)
            art_text = art_match.group(2)

            # Split into paragraphs (§ 1., § 2., ...) if any are present
            paragraphs = re.split(r"(§\s*\d+\.)", art_text)
            if len(paragraphs) > 1:
                formatted_paragraphs = []
                for i in range(1, len(paragraphs), 2):
                    para_num = paragraphs[i].strip()
                    para_text = paragraphs[i + 1].strip()
                    formatted_paragraphs.append(f"{para_num} {para_text}")
                formatted = (
                    f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n"
                    + "\n".join(formatted_paragraphs)
                )
            else:
                formatted = (
                    f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
                )

            formatted_articles.append({"text": formatted})

    return formatted_articles


def main():
    # Initialize the tokenizer and register the citation markers as special tokens
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

    # Prepare the data
    data = prepare_dataset_from_file(TEXT_FILE_PATH)
    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})

    # Tokenization
    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=256,  # increased to fit longer articles
            return_tensors="pt",
        )
        # For causal LM the labels are the input ids; the collator below
        # rebuilds them anyway and masks padding positions with -100.
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Model and data collator
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    # Grow the embedding matrix to cover the newly added special tokens
    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=5,  # increased number of epochs
        per_device_train_batch_size=2,
        learning_rate=5e-5,
        logging_steps=10,
        report_to="none",
        save_strategy="no",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    trainer.save_model("./trained_model/gpt")
    # Save the tokenizer as well, so the added special tokens are available
    # when the fine-tuned model is loaded later.
    tokenizer.save_pretrained("./trained_model/gpt")


if __name__ == "__main__":
    main()
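

# ---------------------------------------------------------------------------
# Optional usage sketch (an assumption, not part of the original training
# flow): load the fine-tuned model and tokenizer saved above and generate a
# continuation for a prompt in the same citation format as the training data.
# The sampling parameters below are illustrative defaults, not tuned values.
# ---------------------------------------------------------------------------
def generate_sample(prompt="[CITATION_START] Kodeks Pracy, Art. 1 [CITATION_END]"):
    model_dir = "./trained_model/gpt"  # path used by trainer.save_model() above
    gen_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    gen_model = AutoModelForCausalLM.from_pretrained(model_dir)
    gen_model.eval()

    inputs = gen_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = gen_model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
    # Keep special tokens visible so the citation markers can be inspected
    print(gen_tokenizer.decode(output_ids[0], skip_special_tokens=False))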