mod allegro

2025-02-28 21:44:24 +01:00 · 2025-02-28 21:44:24 +01:00 · 33eff363bc
parent 447de65d83
commit 33eff363bc
1 changed file with 4 additions and 1 deletion
--- a/allegro.py
+++ b/allegro.py
@@ -11,7 +11,10 @@ dataset = load_dataset("wmt16", "ro-en")

 # Przetwórz dane do formatu odpowiedniego dla modelu
 def tokenize_function(examples):
-    return tokenizer(examples['translation']['ro'], examples['translation']['en'], truncation=True, padding='max_length', max_length=128)
+    # Jeśli 'translation' to lista słowników, np. [{'en': 'text1', 'ro': 'text1_translated'}, ...]
+    return tokenizer([example['en'] for example in examples['translation']], 
+                     [example['ro'] for example in examples['translation']], 
+                     truncation=True, padding='max_length', max_length=128)

 tokenized_datasets = dataset.map(tokenize_function, batched=True)