mod allegro

This commit is contained in:
l.gabrysiak 2025-02-28 22:12:04 +01:00
parent 544d14bcc2
commit ad00842f91
1 changed file with 2 additions and 39 deletions

View File

"""Download the allegro/multislav-5lang Marian model, save it locally, and
(optionally) fine-tune it.

NOTE(review): this span was reconstructed from a side-by-side diff rendering
that duplicated unchanged lines; each statement now appears exactly once.
"""
from transformers import MarianForCausalLM, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the pretrained model and tokenizer from the Hugging Face Hub.
model_name = "allegro/multislav-5lang"
model = MarianForCausalLM.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Persist the model and tokenizer locally so later runs can load from disk.
model.save_pretrained("./models/ably")
tokenizer.save_pretrained("./models/ably")
# NOTE(review): this message says the model was trained, but the training
# code below is commented out — only download + save actually happen.
print("✅ Model został wytrenowany i zapisany!")

# Load the data (example: Romanian -> English translation).
#dataset = load_dataset("wmt16", "ro-en")

#def tokenize_function(examples):
#    # Tokenize source/target pairs.
#    tokenized = tokenizer([example['en'] for example in examples['translation']],
#                          [example['ro'] for example in examples['translation']],
#                          truncation=True, padding='max_length', max_length=128)
#    # Use the input ids as labels for the causal-LM objective.
#    tokenized['labels'] = tokenized['input_ids'].copy()
#    return tokenized

#tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Configure the trainer.
# NOTE(review): currently unused because the Trainer block below is disabled;
# `evaluation_strategy` was renamed `eval_strategy` in recent transformers —
# verify against the pinned version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

#trainer = Trainer(
#    model=model,
#    args=training_args,
#    train_dataset=tokenized_datasets["train"],
#    eval_dataset=tokenized_datasets["test"],
#)

# Train the model.
#trainer.train()