From 735b5fe6239cab079924ac3f9f2c6ae04ce0ac78 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Wed, 26 Feb 2025 11:42:17 +0100
Subject: [PATCH] mod allegro

---
 allegro.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/allegro.py b/allegro.py
index 39e8018..89c185b 100644
--- a/allegro.py
+++ b/allegro.py
@@ -58,7 +58,7 @@ def main():
     # Inicjalizacja tokenizera
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token = tokenizer.eos_token # Dodaj tę linię
 
     # Przygotowanie danych
     data = prepare_dataset_from_file(TEXT_FILE_PATH)
@@ -70,17 +70,18 @@ def main():
             examples["text"],
             truncation=True,
             padding="max_length",
-            max_length=1024, # Zwiększono dla dłuższych artykułów
+            max_length=512, # Zmniejszono z 1024 na 512
             return_tensors="pt"
         )
         tokenized["labels"] = tokenized["input_ids"].clone()
         return tokenized
 
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
 
     # Model i data collator
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+    model.resize_token_embeddings(len(tokenizer)) # Dodaj tę linię
+    model.config.pad_token_id = tokenizer.pad_token_id # Dodaj tę linię
 
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,