diff --git a/hft.py b/hft.py index 4b4bca0..984f23a 100644 --- a/hft.py +++ b/hft.py @@ -109,8 +109,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) model = CustomModel.from_pretrained(model_name) # Przygotowanie datasetu -catalog_path = "file_catalog.json" -data = prepare_dataset("files") +data = prepare_dataset("files", "file_catalog.json") dataset = load_dataset("dict", data=data) tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)