mod allegro

This commit is contained in:
l.gabrysiak 2025-02-28 22:04:41 +01:00
parent 33eff363bc
commit 967b10e153
1 changed files with 7 additions and 5 deletions

View File

@ -9,12 +9,14 @@ tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the data (example: translation from Romanian to English)
dataset = load_dataset("wmt16", "ro-en")
# Convert the raw examples into the format expected by the model
def tokenize_function(examples):
    """Tokenize a batch of translation pairs and attach target-side labels.

    Args:
        examples: A batch whose ``'translation'`` entry is a list of dicts
            holding ``'en'`` and ``'ro'`` strings, e.g.
            ``[{'en': 'text1', 'ro': 'text1_translated'}, ...]``.

    Returns:
        The tokenizer output for the ``'en'`` texts, with an added
        ``'labels'`` entry containing the token ids of the ``'ro'`` texts.
        Padding positions in ``'labels'`` are set to -100.
    """
    pairs = examples['translation']
    # NOTE(review): 'en' is used as the source and 'ro' as the target, matching
    # the argument order of the previous implementation - confirm this is the
    # direction the loaded model translates in.
    sources = [pair['en'] for pair in pairs]
    targets = [pair['ro'] for pair in pairs]

    # Encode the source sentences on their own. Passing the targets as the
    # second positional argument would encode each (source, target) pair as a
    # single input sequence, exposing the reference translation in the input.
    tokenized = tokenizer(sources,
                          truncation=True, padding='max_length', max_length=128)

    # Encode the targets via `text_target` so they are processed as
    # target-language text.
    target_encoding = tokenizer(text_target=targets,
                                truncation=True, padding='max_length',
                                max_length=128)

    # Labels are the target token ids (not a copy of the source ids). Padding
    # is replaced by -100 so that padded positions are ignored by the loss.
    pad_id = tokenizer.pad_token_id
    tokenized['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in ids]
        for ids in target_encoding['input_ids']
    ]
    return tokenized
# Run tokenize_function over the dataset in batched mode (batched=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)