From ccc2af5185202be316e5b2184f55217a97fdf415 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Tue, 25 Feb 2025 18:16:51 +0100
Subject: [PATCH] Fix the usage function

---
 hft.py | 77 ++++++++++++++++++++++++----------------------------------
 1 file changed, 32 insertions(+), 45 deletions(-)

diff --git a/hft.py b/hft.py
index 6c66b01..a26323d 100644
--- a/hft.py
+++ b/hft.py
@@ -107,12 +107,11 @@ def tokenize_function(examples):
     return tokenized
 
 def custom_collate_fn(batch):
-    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
-    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
-    labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
-
-    source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
-    #print("source_idx shape:", source_idx.shape)  # Debugging
+    device = next(model.parameters()).device
+    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch]).to(device)
+    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch]).to(device)
+    labels = torch.stack([torch.tensor(b["labels"]) for b in batch]).to(device)
+    source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long).to(device)
     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}
 
 class CustomModel(nn.Module):
@@ -127,8 +126,6 @@ class CustomModel(nn.Module):
 
     def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
         if source_idx is not None:
-            #print("Max source_idx:", torch.max(source_idx))
-            #print("Num embeddings:", self.source_embedding.num_embeddings)
             source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
             source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
             hidden_states = self.base_model.get_input_embeddings()(input_ids) + source_embeds
@@ -146,6 +143,27 @@ class CustomTrainer(Trainer):
         loss = outputs.loss
         return (loss, outputs) if return_outputs else loss
 
+def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
+    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
+    inputs = {k: v.to(model.base_model.device) for k, v in inputs.items()}
+
+    outputs = model.base_model.generate(
+        **inputs,
+        max_length=max_length,
+        num_return_sequences=1,
+        return_dict_in_generate=True,
+        output_scores=True,
+    )
+
+    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+
+    # Look up the source (the last source-embedding index is used here)
+    last_token_id = outputs.sequences[0][-1].item()
+    source_idx = model.source_embedding.weight.shape[0] - 1
+    source = source_mapper.get_source(source_idx)
+
+    return f"{answer}\n\nŹródło: {source if source else 'Opracowanie własne'}"
+
 # Initialize components
 source_mapper = SourceMapper()
 model_name = "crumb/nano-mistral"
@@ -160,9 +178,9 @@ tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
 
 # Initialize the model
 config = AutoModelForCausalLM.from_pretrained(model_name).config
-#print("Vocabulary size:", config.vocab_size)
 model = CustomModel(model_name, config)
-#model.to("cpu")  # Switched to CPU for debugging
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
 
 # Training configuration
 training_args = TrainingArguments(
@@ -171,7 +189,7 @@ training_args = TrainingArguments(
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     learning_rate=2e-5,
-    fp16=False,  # Disabled for CPU
+    fp16=torch.cuda.is_available(),
     logging_steps=1,
     logging_dir="./logs",
     save_strategy="steps",
@@ -189,40 +207,9 @@ trainer = CustomTrainer(
 )
 
 trainer.train()
-# Answer generation function
-def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
-    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
-
-    outputs = model.base_model.generate(
-        **inputs,
-        max_length=max_length,
-        num_return_sequences=1,
-        return_dict_in_generate=True,
-        output_scores=True,
-    )
-
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-
-    # Get the source from the last token
-    last_token_id = outputs.sequences[0][-1].item()
-    source_idx = model.source_embeddi
-
-
-
-
-
-# After model training has finished
-
-# Prepare the required components
-model.eval()  # Switch the model to evaluation mode
-model = model.to("cuda" if torch.cuda.is_available() else "cpu")  # Move the model to GPU if available
-
-# Example question
+# Example usage
+model.eval()
 question = "Ile dni urlopu przysługuje pracownikowi?"
-
-# Generate the answer
 answer = generate_answer(question, model, tokenizer, source_mapper)
-
-# Display the result
 print("Pytanie:", question)
-print("Odpowiedź:", answer)
\ No newline at end of file
+print("Odpowiedź:", answer)