mod allegro

This commit is contained in:
l.gabrysiak 2025-02-28 22:09:57 +01:00
parent 57ca071282
commit 544d14bcc2
1 changed files with 22 additions and 17 deletions

View File

@ -6,19 +6,24 @@ model_name = "allegro/multislav-5lang"
# Load the pretrained checkpoint named by model_name, together with its tokenizer.
model = MarianForCausalLM.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Save both the model and the tokenizer into ./models/ably.
model.save_pretrained("./models/ably")
tokenizer.save_pretrained("./models/ably")
# NOTE(review): the message below reads "Model has been trained and saved!" in
# Polish, but no training call precedes this point in the visible code, so what
# gets saved here looks like the checkpoint exactly as loaded. Confirm the wording.
print("✅ Model został wytrenowany i zapisany!")
# Load the data (example: translation from Romanian to English)
dataset = load_dataset("wmt16", "ro-en")
#dataset = load_dataset("wmt16", "ro-en")
def tokenize_function(examples):
    """Tokenize a batch of translation pairs and attach loss labels.

    Args:
        examples: A batch whose ``'translation'`` entry is a list of dicts,
            each holding an ``'en'`` and a ``'ro'`` string.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a copy of
        ``'input_ids'`` in which every padding position is replaced by -100.
    """
    pairs = examples['translation']
    # The English text is passed as the first sequence and the Romanian text
    # as its paired second sequence.
    tokenized = tokenizer(
        [pair['en'] for pair in pairs],
        [pair['ro'] for pair in pairs],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    # Every row is padded to max_length, so a plain copy of input_ids would
    # make the padding tokens count as training targets. -100 is the label
    # value the loss ignores, so padding positions are masked out with it.
    pad_id = tokenizer.pad_token_id
    tokenized['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in tokenized['input_ids']
    ]
    return tokenized
#def tokenize_function(examples):
# # Tokenizacja
# tokenized = tokenizer([example['en'] for example in examples['translation']],
# [example['ro'] for example in examples['translation']],
# truncation=True, padding='max_length', max_length=128)
# # Ustawienie labels
# tokenized['labels'] = tokenized['input_ids'].copy()
# return tokenized
# Apply tokenize_function to the dataset in batches (batched=True), so the
# function receives several examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
#tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Configure the trainer
training_args = TrainingArguments(
@ -31,12 +36,12 @@ training_args = TrainingArguments(
weight_decay=0.01,
)
# Build the Trainer from the model and training_args, training on the "train"
# split of tokenized_datasets and evaluating on its "test" split.
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
)
#trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=tokenized_datasets["train"],
# eval_dataset=tokenized_datasets["test"],
#)
# Train the model
trainer.train()
#trainer.train()