mod allegro
This commit is contained in:
parent 545700ad40
commit 735b5fe623
@@ -58,7 +58,7 @@ def main():
     # Tokenizer initialization
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token = tokenizer.eos_token  # Add this line
 
     # Data preparation
     data = prepare_dataset_from_file(TEXT_FILE_PATH)
@@ -70,17 +70,18 @@ def main():
             examples["text"],
             truncation=True,
             padding="max_length",
-            max_length=1024,  # Increased for longer articles
+            max_length=512,  # Increased for longer articles
             return_tensors="pt"
         )
         tokenized["labels"] = tokenized["input_ids"].clone()
         return tokenized
 
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
 
     # Model and data collator
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+    model.resize_token_embeddings(len(tokenizer))  # Add this line
+    model.config.pad_token_id = tokenizer.pad_token_id  # Add this line
 
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,