import os

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
MODEL_NAME = "gpt2"
SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]


def prepare_simple_dataset():
    # Minimal toy corpus: Polish Labour Code references wrapped in citation markers
    return [
        {"text": "[CITATION_START] Kodeks Pracy, Art. 1 [CITATION_END] Tekst artykułu..."},
        {"text": "[CITATION_START] Kodeks Pracy, Art. 2 [CITATION_END] Inny tekst..."},
    ]


def main():
    # Initialize the tokenizer and register the citation markers as special tokens
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

    # Prepare the data
    data = prepare_simple_dataset()
    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})

    # Tokenization: labels are not set here because DataCollatorForLanguageModeling
    # (mlm=False) copies input_ids into labels and masks padding positions itself
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Model and data collator
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    # Grow the embedding matrix to cover the newly added special tokens
    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        remove_unused_columns=True,  # drops the raw "text" column before batching
        logging_steps=1,
        report_to="none",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()


if __name__ == "__main__":
    main()