mod allegro

This commit is contained in:
l.gabrysiak 2025-02-26 11:42:17 +01:00
parent 545700ad40
commit 735b5fe623
1 changed file with 5 additions and 4 deletions

View File

@@ -58,7 +58,7 @@ def main():
     # Inicjalizacja tokenizera
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token = tokenizer.eos_token  # Dodaj tę linię

     # Przygotowanie danych
     data = prepare_dataset_from_file(TEXT_FILE_PATH)
@@ -70,17 +70,18 @@ def main():
             examples["text"],
             truncation=True,
             padding="max_length",
-            max_length=1024,  # Zwiększono dla dłuższych artykułów
+            max_length=512,  # Zwiększono dla dłuższych artykułów
             return_tensors="pt"
         )
         tokenized["labels"] = tokenized["input_ids"].clone()
         return tokenized

-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

     # Model i data collator
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+    model.resize_token_embeddings(len(tokenizer))  # Dodaj tę linię
+    model.config.pad_token_id = tokenizer.pad_token_id  # Dodaj tę linię

     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,