mod allegro

This commit is contained in:
l.gabrysiak 2025-02-26 11:42:17 +01:00
parent 545700ad40
commit 735b5fe623
1 changed file with 5 additions and 4 deletions

View File

@@ -58,7 +58,7 @@ def main():
# Inicjalizacja tokenizera
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token = tokenizer.eos_token # Dodaj tę linię
# Przygotowanie danych
data = prepare_dataset_from_file(TEXT_FILE_PATH)
@@ -70,17 +70,18 @@ def main():
examples["text"],
truncation=True,
padding="max_length",
max_length=1024, # Zwiększono dla dłuższych artykułów
max_length=512, # Zwiększono dla dłuższych artykułów
return_tensors="pt"
)
tokenized["labels"] = tokenized["input_ids"].clone()
return tokenized
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
# Model i data collator
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
model.resize_token_embeddings(len(tokenizer)) # Dodaj tę linię
model.config.pad_token_id = tokenizer.pad_token_id # Dodaj tę linię
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,