From 2dd8198c3a114861c5d80be3f6666c2b4d6c561e Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Wed, 26 Feb 2025 00:26:35 +0100
Subject: [PATCH] mod gpt

---
 gpt.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/gpt.py b/gpt.py
index 487cace..ed66541 100644
--- a/gpt.py
+++ b/gpt.py
@@ -1,6 +1,6 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 
 # Configuration
@@ -24,21 +24,28 @@ def main():
     data = prepare_simple_dataset()
     dataset = Dataset.from_dict({"text": [d["text"] for d in data]})
 
-    # Tokenization
+    # Tokenization with proper labels
     def tokenize_function(examples):
-        return tokenizer(
+        tokenized = tokenizer(
             examples["text"],
             truncation=True,
             padding="max_length",
             max_length=128,
             return_tensors="pt"
         )
+        tokenized["labels"] = tokenized["input_ids"].clone()
+        return tokenized
 
     tokenized_dataset = dataset.map(tokenize_function, batched=True)
 
-    # Model
+    # Model and data collator
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     model.resize_token_embeddings(len(tokenizer))
+
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False
+    )
 
     # Training configuration
     training_args = TrainingArguments(
@@ -47,7 +54,7 @@ def main():
         per_device_train_batch_size=2,
         remove_unused_columns=True,
         logging_steps=1,
-        report_to="none"  # Disable reporting
+        report_to="none"
     )
 
     # Trainer
@@ -55,6 +62,7 @@ def main():
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator
     )
 
     print("Starting training...")
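
Note: for reference, below is a minimal, self-contained sketch of how the pieces touched by this patch fit together once it is applied. MODEL_NAME, OUTPUT_DIR, the epoch count, the sample texts, and the pad-token handling are not visible in the diff, so the values used here ("gpt2", "./gpt-output", one epoch, two placeholder sentences, reusing eos as the pad token) are illustrative assumptions, not the actual contents of gpt.py.

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

MODEL_NAME = "gpt2"          # assumed; the real value is defined elsewhere in gpt.py
OUTPUT_DIR = "./gpt-output"  # assumed; the real value is defined elsewhere in gpt.py


def main():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # GPT-2 ships without a pad token; reusing eos here is an assumption about
    # how gpt.py handles padding before calling resize_token_embeddings.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Stand-in for prepare_simple_dataset(), which is not shown in the diff.
    dataset = Dataset.from_dict({"text": ["Hello world.", "A second training sentence."]})

    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )
        # Causal LM training: labels start as a copy of the input ids.
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))

    # mlm=False selects plain (causal) language modeling instead of masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,  # assumed; not visible in the diff
        per_device_train_batch_size=2,
        remove_unused_columns=True,
        logging_steps=1,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()


if __name__ == "__main__":
    main()

Since DataCollatorForLanguageModeling with mlm=False also derives labels from input_ids at collation time (ignoring padding positions), the explicit labels copy inside tokenize_function mainly serves as a fallback for when no collator is supplied.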