mod allegro

l.gabrysiak 2025-02-28 22:09:57 +01:00
parent 57ca071282
commit 544d14bcc2
1 changed file with 22 additions and 17 deletions


@@ -6,19 +6,24 @@ model_name = "allegro/multislav-5lang"
 model = MarianForCausalLM.from_pretrained(model_name)
 tokenizer = MarianTokenizer.from_pretrained(model_name)
+model.save_pretrained("./models/ably")
+tokenizer.save_pretrained("./models/ably")
+print("✅ Model has been trained and saved!")
 # Load the data (example: translation from Romanian to English)
-dataset = load_dataset("wmt16", "ro-en")
-def tokenize_function(examples):
-    # Tokenization
-    tokenized = tokenizer([example['en'] for example in examples['translation']],
-                          [example['ro'] for example in examples['translation']],
-                          truncation=True, padding='max_length', max_length=128)
-    # Set the labels
-    tokenized['labels'] = tokenized['input_ids'].copy()
-    return tokenized
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
+#dataset = load_dataset("wmt16", "ro-en")
+#def tokenize_function(examples):
+#    # Tokenization
+#    tokenized = tokenizer([example['en'] for example in examples['translation']],
+#                          [example['ro'] for example in examples['translation']],
+#                          truncation=True, padding='max_length', max_length=128)
+#    # Set the labels
+#    tokenized['labels'] = tokenized['input_ids'].copy()
+#    return tokenized
+#tokenized_datasets = dataset.map(tokenize_function, batched=True)
 # Configure the trainer
 training_args = TrainingArguments(
@@ -31,12 +36,12 @@ training_args = TrainingArguments(
     weight_decay=0.01,
 )
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_datasets["train"],
-    eval_dataset=tokenized_datasets["test"],
-)
+#trainer = Trainer(
+#    model=model,
+#    args=training_args,
+#    train_dataset=tokenized_datasets["train"],
+#    eval_dataset=tokenized_datasets["test"],
+#)
 # Train the model
-trainer.train()
+#trainer.train()
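
After this change the only live code downloads the allegro/multislav-5lang checkpoint and re-saves it under ./models/ably; the WMT16 preprocessing and the Trainer run are commented out rather than deleted. If that training path is later revived, note two issues in the commented-out tokenize_function: the Romanian sentences are passed as the tokenizer's second positional argument, which Hugging Face tokenizers interpret as the second half of a text pair, and labels are then set by copying input_ids, so the model would be trained to reproduce the English source. Below is a minimal corrected sketch, not part of this commit, assuming the wmt16 "ro-en" field layout and a transformers version that supports text_target= (>= 4.22):

from datasets import load_dataset
from transformers import MarianTokenizer

# Assumption: the checkpoint ships a Marian tokenizer, as in the diff above.
tokenizer = MarianTokenizer.from_pretrained("allegro/multislav-5lang")
dataset = load_dataset("wmt16", "ro-en")

def tokenize_function(examples):
    # In batched mode, examples["translation"] is a list of {"en": ..., "ro": ...} dicts.
    sources = [ex["en"] for ex in examples["translation"]]
    targets = [ex["ro"] for ex in examples["translation"]]
    # text_target= tokenizes the targets separately and stores them under
    # "labels", instead of treating them as the second half of a text pair.
    return tokenizer(
        sources,
        text_target=targets,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Tokenizing with text_target fills in labels automatically, so the manual tokenized['labels'] = tokenized['input_ids'].copy() step becomes unnecessary.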