mod allegro

parent 545700ad40, commit 735b5fe623
@@ -58,7 +58,7 @@ def main():
     # Initialize the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token = tokenizer.eos_token  # Add this line
 
     # Prepare the data
     data = prepare_dataset_from_file(TEXT_FILE_PATH)
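The pad-token assignment is the substantive line in this hunk: GPT-style causal-LM checkpoints usually ship without a pad token, so the padding="max_length" call in the next hunk would fail without it. A minimal sketch of the effect, using gpt2 as a stand-in for the MODEL_NAME this script defines outside the diff:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in for MODEL_NAME
print(tok.pad_token)                          # None: the checkpoint defines no pad token
tok.pad_token = tok.eos_token                 # same fix as in the hunk above
batch = tok(
    ["short text", "a slightly longer text"],
    padding="max_length",
    max_length=16,
    return_tensors="pt",
)
print(batch["input_ids"].shape)               # torch.Size([2, 16]), padded with the EOS id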
@@ -70,17 +70,18 @@ def main():
             examples["text"],
             truncation=True,
             padding="max_length",
-            max_length=1024,  # Increased for longer articles
+            max_length=512,  # Increased for longer articles
             return_tensors="pt"
         )
         tokenized["labels"] = tokenized["input_ids"].clone()
         return tokenized
 
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
 
     # Model and data collator
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+    model.resize_token_embeddings(len(tokenizer))  # Add this line
+    model.config.pad_token_id = tokenizer.pad_token_id  # Add this line
 
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
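Pulled out of diff form, the new side of this hunk fits into a setup roughly like the sketch below. This is a reconstruction under assumptions, not the actual script: MODEL_NAME, SPECIAL_TOKENS, and the in-memory dataset are hypothetical stand-ins for values defined elsewhere (the real data comes from prepare_dataset_from_file(TEXT_FILE_PATH)), and mlm=False on the data collator is assumed because the arguments after tokenizer=tokenizer fall outside the hunk.

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)

MODEL_NAME = "gpt2"                           # stand-in for the real checkpoint
SPECIAL_TOKENS = ["<doc>", "</doc>"]          # hypothetical; the real list is defined elsewhere

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
tokenizer.pad_token = tokenizer.eos_token

# Stand-in for data = prepare_dataset_from_file(TEXT_FILE_PATH)
dataset = Dataset.from_dict({"text": ["<doc> first article </doc>", "<doc> second article </doc>"]})

def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,          # the new side of the diff lowers this from 1024
        return_tensors="pt",
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

# remove_columns drops the raw "text" column so only tensor-able fields reach the collator
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))        # embedding matrix must grow to cover the added special tokens
model.config.pad_token_id = tokenizer.pad_token_id   # keep the model config consistent with the tokenizer

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,                                       # assumed: causal LM objective (mlm defaults to True)
)

One side effect worth knowing with this combination: DataCollatorForLanguageModeling rebuilds labels from input_ids and masks pad positions with -100, and since pad_token here is eos_token, genuine EOS positions are masked too, so the labels copied inside tokenize_function are effectively overwritten by the collator.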