commit 02aa12d24e
parent 2cceeb31c8

modified: hft.py (167 lines changed)
@@ -110,16 +110,13 @@ def custom_collate_fn(batch):
     input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
     attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
     labels = torch.stack([torch.tensor(b["labels"]) for b in batch])

     source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
-    #print("source_idx shape:", source_idx.shape)  # debugging
     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}

 # Modified CustomModel class
-class CustomModel(AutoModelForCausalLM):  # 🔵 inheritance changed
+class CustomModel(nn.Module):
     def __init__(self, model_name, config):
-        super().__init__(config)  # 🔵 initialize the base class
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
+        super().__init__()
+        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
         self.source_embedding = nn.Embedding(
             num_embeddings=1000,
             embedding_dim=config.hidden_size,
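Review note: the inheritance change is the right call. `AutoModelForCausalLM` is a factory, not a concrete model class; transformers blocks instantiating `Auto*` classes directly, and `from_pretrained` returns an architecture-specific subclass, so the old `super().__init__(config)` path could never work. A quick check (the model name here is illustrative only):

    from transformers import AutoModelForCausalLM

    m = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
    print(type(m).__name__)  # a concrete class such as "MistralForCausalLM"

Wrapping the loaded model in a plain `nn.Module`, as this commit does, is the usual pattern for bolting on extra trainable parts like `source_embedding`.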
@@ -130,21 +127,24 @@ class CustomModel(AutoModelForCausalLM):  # 🔵 inheritance changed
         if source_idx is not None:
             source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
             source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
-            inputs_embeds = self.model.get_input_embeddings()(input_ids) + source_embeds
-            return self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
-        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
-
-    # 🔵 add a generate method
+            inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
+            outputs = self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
+        else:
+            outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
+        return outputs

     def generate(self, *args, **kwargs):
-        return self.model.generate(*args, **kwargs)
+        return self.base_model.generate(*args, **kwargs)

 class CustomTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
         labels = inputs.pop("labels")
         source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, labels=labels, source_idx=source_idx)
-        loss = outputs.loss
-        return (loss, outputs) if return_outputs else loss
+        outputs = model(input_ids=inputs["input_ids"],
+                        attention_mask=inputs["attention_mask"],
+                        labels=labels,
+                        source_idx=source_idx)
+        return (outputs.loss, outputs) if return_outputs else outputs.loss

 # Initialize components
 source_mapper = SourceMapper()
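Review note: `SourceMapper` is defined above this diff, so only its interface is visible here: a `source_to_idx` dict, an `idx_to_source` dict, and integer indices small enough for the 1000-slot `nn.Embedding`. A hypothetical sketch consistent with that usage:

    class SourceMapper:
        """Sketch inferred from usage; the real class lives outside this diff."""

        def __init__(self):
            self.source_to_idx = {}  # e.g. "Kodeks pracy, Art. 152" -> 3
            self.idx_to_source = {}  # inverse: 3 -> "Kodeks pracy, Art. 152"

        def get_idx(self, source, max_sources=1000):
            # Assign indices on first sight, capped by the embedding table size.
            if source not in self.source_to_idx and len(self.source_to_idx) < max_sources:
                idx = len(self.source_to_idx)
                self.source_to_idx[source] = idx
                self.idx_to_source[idx] = source
            return self.source_to_idx.get(source, -1)  # -1 matches the collate default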
@@ -160,9 +160,9 @@ tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)

 # Initialize the model
 config = AutoModelForCausalLM.from_pretrained(model_name).config
-#print("Vocabulary size:", config.vocab_size)
 model = CustomModel(model_name, config)
-#model.to("cpu")  # switched to CPU for debugging
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)

 # Training configuration
 training_args = TrainingArguments(
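Review note: `AutoModelForCausalLM.from_pretrained(model_name).config` loads a full copy of the weights just to read the config, then throws it away; `CustomModel` loads the weights again right after. Assuming stock transformers, `AutoConfig` fetches the config alone:

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(model_name)  # config only, no weights materialized
    model = CustomModel(model_name, config)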
@@ -171,13 +171,12 @@ training_args = TrainingArguments(
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     learning_rate=2e-5,
-    fp16=False,  # disabled for CPU
+    fp16=torch.cuda.is_available(),
     logging_steps=1,
     logging_dir="./logs",
     save_strategy="steps",
     save_steps=1000,
     logging_strategy="no",
-    report_to="none",
+    report_to="none"
 )

 # Training
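Review note: gating `fp16` on `torch.cuda.is_available()` keeps the script runnable on CPU, where fp16 training would fail. On Ampere or newer GPUs, bf16 is usually the more stable choice for LM fine-tuning; a hedged variant (only the `fp16`/`bf16` flags differ from the committed arguments, and `output_dir` is assumed since the real value sits above this hunk):

    import torch
    from transformers import TrainingArguments

    use_cuda = torch.cuda.is_available()
    prefer_bf16 = use_cuda and torch.cuda.is_bf16_supported()
    training_args = TrainingArguments(
        output_dir="./results",  # assumed; not shown in this hunk
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        bf16=prefer_bf16,
        fp16=use_cuda and not prefer_bf16,
        report_to="none"
    )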
@@ -189,91 +188,9 @@ trainer = CustomTrainer(
 )
 trainer.train()

-# Create a directory to save the model
-save_directory = "./trained_model/ably.do/hse"
-os.makedirs(save_directory, exist_ok=True)
-
-# 1. Save the model weights
-torch.save(model.state_dict(), os.path.join(save_directory, "hse-nano-mistral.bin"))
-
-# 2. Save the tokenizer
-tokenizer.save_pretrained(save_directory)
-
-# 3. Save the source mapping
-source_mapper_data = {
-    "source_to_idx": dict(source_mapper.source_to_idx),
-    "idx_to_source": source_mapper.idx_to_source
-}
-
-with open(os.path.join(save_directory, "source_mapper.json"), 'w') as f:
-    json.dump(source_mapper_data, f)
-
-# 4. Save the model config (optional, but recommended)
-model.base_model.config.save_pretrained(save_directory)
-
-# Answer-generation function
-# Test function
-def generate_answer_with_source(question, model, tokenizer, source_mapper, max_length=200):
-    device = next(model.parameters()).device
-    inputs = tokenizer(
-        question,
-        return_tensors="pt",
-        truncation=True,
-        max_length=512
-    ).to(device)
-
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_length=max_length,
-            num_return_sequences=1,
-            return_dict_in_generate=True,
-            temperature=0.7,
-            top_p=0.9,
-        )
-
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-
-    # Extract source information
-    article_matches = re.finditer(r'Art\.\s+\d+', answer)
-    sources = set()
-
-    for match in article_matches:
-        article_ref = match.group(0).strip()
-        for idx, source in source_mapper.idx_to_source.items():
-            if article_ref in source:
-                sources.add(source)
-                break
-
-    return {
-        "question": question,
-        "answer": answer,
-        "sources": list(sources) if sources else ["Opracowanie własne"],
-        "num_tokens": len(outputs.sequences[0])
-    }
-
-
-# Example tests
-test_cases = [
-    "Jaki jest wymiar urlopu wypoczynkowego?",
-    "Jakie są zasady bezpieczeństwa na budowie?",
-    "Wyjaśnij procedurę zwolnienia grupowego",
-    "Co reguluje ustawa o ochronie danych osobowych?",
-    "Jakie dokumenty są potrzebne do zawarcia umowy o pracę?"
-]
-
-print("\n\n🔴 🔴 🔴 STARTING MODEL TESTS 🔴 🔴 🔴")
-for case in test_cases:
-    result = generate_answer_with_source(case, model, tokenizer, source_mapper)
-    print(f"\n🔷 Question: {result['question']}")
-    print(f"🔷 Answer ({result['num_tokens']} tokens):")
-    print(result['answer'])
-    print(f"🔷 Sources: {', '.join(result['sources'])}")
-    print("-"*80)
-
 # Answer-generation function
 def generate_answer(question, max_length=200):
     model.eval()
     inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)

     with torch.no_grad():
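Review note: this deletion is not fully compensated later. The removed block was the only place that wrote `source_mapper.json` and the model config; the replacement save code at the end of the next hunk persists only the state dict and tokenizer, so the mapping needed to interpret `source_idx` at inference time is lost. A minimal sketch that could sit after the final save block, reusing the committed names:

    import json
    import os

    with open(os.path.join(save_directory, "source_mapper.json"), "w") as f:
        json.dump({
            "source_to_idx": dict(source_mapper.source_to_idx),
            "idx_to_source": source_mapper.idx_to_source,
        }, f, ensure_ascii=False)  # keep Polish source names intact
    model.base_model.config.save_pretrained(save_directory)  # optional, as the old comment noted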
@@ -281,12 +198,42 @@ def generate_answer(question, max_length=200):
             **inputs,
             max_length=max_length,
             num_return_sequences=1,
-            return_dict_in_generate=True
+            temperature=0.7,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id
         )

-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-    return answer
-
-# Create a directory to save the model
-save_directory = "./trained_model/ably.do/hse"
-os.makedirs(save_directory, exist_ok=True)
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Look up sources
+    sources = set()
+    for idx in source_mapper.idx_to_source:
+        if source_mapper.idx_to_source[idx] in answer:
+            sources.add(source_mapper.idx_to_source[idx])
+
+    return {
+        "question": question,
+        "answer": answer,
+        "sources": list(sources) if sources else ["Opracowanie własne"]  # "own elaboration" fallback
+    }
+
+# Testing
+test_questions = [
+    "Jaki jest wymiar urlopu wypoczynkowego?",     # "What is the annual-leave entitlement?"
+    "Jakie są zasady bezpieczeństwa na budowie?",  # "What are the safety rules on a construction site?"
+    "Wyjaśnij procedurę zwolnienia grupowego"      # "Explain the collective-redundancy procedure"
+]
+
+print("\n=== MODEL TEST ===")
+for question in test_questions:
+    result = generate_answer(question)  # fixed: generate_answer_with_source was deleted above
+    print(f"\nQuestion: {result['question']}")
+    print(f"Answer: {result['answer']}")
+    print(f"Sources: {', '.join(result['sources'])}")
+    print("="*80)
+
+# Save the model
+save_directory = "./trained_model"
+os.makedirs(save_directory, exist_ok=True)
+torch.save(model.state_dict(), os.path.join(save_directory, "model.bin"))
+tokenizer.save_pretrained(save_directory)
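Review note: because only a raw `state_dict` is written, loading the model back requires rebuilding `CustomModel` first; `from_pretrained` alone will not work on this directory. A minimal reload sketch, assuming the same class definition and the same `model_name`:

    import os
    import torch
    from transformers import AutoConfig, AutoTokenizer

    config = AutoConfig.from_pretrained(model_name)
    model = CustomModel(model_name, config)
    state = torch.load(os.path.join("./trained_model", "model.bin"), map_location="cpu")
    model.load_state_dict(state)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained("./trained_model")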