import os

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
MODEL_NAME = "gpt2"
SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]


def prepare_simple_dataset():
    return [
        {"text": "[CITATION_START] Kodeks Pracy, Art. 1 [CITATION_END] Tekst artykułu..."},
        {"text": "[CITATION_START] Kodeks Pracy, Art. 2 [CITATION_END] Inny tekst..."},
    ]


def main():
    # Initialize the tokenizer and register the citation markers as special tokens
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

    # Prepare the data
    data = prepare_simple_dataset()
    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})

    # Tokenization (lists, not tensors: the data collator handles batching later)
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Model: resize the embedding matrix so the new special tokens get their own vectors
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))

    # Causal-LM collator: copies input_ids into labels so the Trainer can compute a loss
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        remove_unused_columns=True,
        logging_steps=1,
        report_to="none",  # disable external reporting
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()


if __name__ == "__main__":
    main()
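
# --- Hedged usage sketch (assumption, not part of the training run above) ---
# A minimal example of how the fine-tuned model could be queried afterwards,
# assuming the weights and tokenizer were saved to a directory such as
# "./results/final" (e.g. via trainer.save_model("./results/final") and
# tokenizer.save_pretrained("./results/final")). The path and save step are
# hypothetical; by default the Trainer only writes numbered checkpoint-<step>
# directories and does not save the tokenizer there.
#
# from transformers import AutoTokenizer, AutoModelForCausalLM
#
# tok = AutoTokenizer.from_pretrained("./results/final")
# lm = AutoModelForCausalLM.from_pretrained("./results/final")
# prompt = "[CITATION_START] Kodeks Pracy,"
# inputs = tok(prompt, return_tensors="pt")
# out = lm.generate(**inputs, max_new_tokens=40, pad_token_id=tok.eos_token_id)
# print(tok.decode(out[0], skip_special_tokens=False))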