ably.do/allegro.py

from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the model and tokenizer
model_name = "allegro/multislav-5lang"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Load the data (example: Romanian-to-English translation)
dataset = load_dataset("wmt16", "ro-en")

# Preprocess the data into the format the model expects
def tokenize_function(examples):
    # With batched=True, examples['translation'] is a list of {'ro': ..., 'en': ...} pairs
    sources = [pair['ro'] for pair in examples['translation']]
    targets = [pair['en'] for pair in examples['translation']]
    return tokenizer(sources, text_target=targets, truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Configure the trainer
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=3,
weight_decay=0.01,
)

trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()
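
# --- A minimal follow-up sketch (not part of the original script) ---
# It saves the fine-tuned checkpoint and tries a single translation. The output
# directory and the sample sentence are illustrative assumptions; depending on the
# checkpoint, the tokenizer may expect a target-language prefix in the source text.
trainer.save_model("./finetuned-multislav")
tokenizer.save_pretrained("./finetuned-multislav")

sample = "Acesta este un exemplu de propoziție."  # illustrative Romanian sentence
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_length=128)
print(tokenizer.decode(generated[0], skip_special_tokens=True))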