mod

2025-02-25 20:38:44 +01:00 · 2025-02-25 20:38:44 +01:00 · b14dc7f278
parent 58995c1181
commit b14dc7f278
1 changed file with 89 additions and 99 deletions
--- a/hft.py
+++ b/hft.py
@@ -12,9 +12,9 @@ import json
 from collections import defaultdict
 from huggingface_hub import login
 # Konfiguracja
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 login(token="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")
 class SourceMapper:
@@ -90,7 +90,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                for chunk in chunks:
                    data.append({
                        "text": chunk,
-                        "source_idx": -1  # Brak źródła
+                        "source_idx": -1
                    })
    return data
@@ -111,127 +111,117 @@ def custom_collate_fn(batch):
    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
    labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
    source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
-    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}
+    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "source_idx": source_idx
    }
 class CustomModel(nn.Module):
    def __init__(self, model_name, config):
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
-        self.source_embedding = nn.Embedding(
+        self.source_embedding = nn.Embedding(1000, config.hidden_size, padding_idx=-1)
            num_embeddings=1000,
            embedding_dim=config.hidden_size,
            padding_idx=-1
        )
    def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
        if source_idx is not None:
-            source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
+            source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)
            source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
            inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
-            outputs = self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
+            return self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
-        else:
+        return self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
            outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
        return outputs
    def generate(self, *args, **kwargs):
        return self.base_model.generate(*args, **kwargs)
-class CustomTrainer(Trainer):
+def main():
-    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+    # Inicjalizacja
-        labels = inputs.pop("labels")
+    source_mapper = SourceMapper()
-        source_idx = inputs.pop("source_idx", None)
+    model_name = "crumb/nano-mistral"
-        outputs = model(input_ids=inputs["input_ids"], 
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
-                       attention_mask=inputs["attention_mask"], 
+    tokenizer.pad_token = tokenizer.eos_token
                       labels=labels, 
                       source_idx=source_idx)
        return (outputs.loss, outputs) if return_outputs else outputs.loss
-# Inicjalizacja komponentów
+    # Przygotowanie danych
-source_mapper = SourceMapper()
+    catalog_path = "file_catalog.json"
-model_name = "crumb/nano-mistral"
+    data = prepare_dataset("files", catalog_path, source_mapper)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+    dataset = Dataset.from_list(data)
-tokenizer.pad_token = tokenizer.eos_token
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
-# Przygotowanie danych
+    # Model
-catalog_path = "file_catalog.json"
+    config = AutoModelForCausalLM.from_pretrained(model_name).config
-data = prepare_dataset("files", catalog_path, source_mapper)
+    model = CustomModel(model_name, config)
-dataset = Dataset.from_list(data)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
+    model.to(device)
-# Inicjalizacja modelu
+    # Trening
-config = AutoModelForCausalLM.from_pretrained(model_name).config
+    training_args = TrainingArguments(
-model = CustomModel(model_name, config)
+        output_dir="./results",
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        num_train_epochs=3,
-model = model.to(device)
+        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=1,
        save_strategy="steps",
        save_steps=1000,
        report_to="none"
    )
-# Konfiguracja treningu
+    trainer = Trainer(
-training_args = TrainingArguments(
+        model=model,
-    output_dir="./results",
+        args=training_args,
-    num_train_epochs=3,
+        train_dataset=tokenized_dataset,
-    per_device_train_batch_size=2,
+        data_collator=custom_collate_fn,
-    gradient_accumulation_steps=4,
+    )
-    learning_rate=2e-5,
+    print("Rozpoczęcie treningu...")
-    fp16=torch.cuda.is_available(),
+    trainer.train()
    logging_steps=1,
    save_strategy="steps",
    save_steps=1000,
    logging_strategy="no",
    report_to="none"
 )
-# Trening
+    # Testowanie
-trainer = CustomTrainer(
+    def generate_answer(question):
-    model=model,
+        inputs = tokenizer(question, return_tensors="pt").to(device)
-    args=training_args,
+        
    train_dataset=tokenized_dataset,
    data_collator=custom_collate_fn,
 )
 trainer.train()
 # Funkcja testująca
 def generate_answer_with_source(question, model, tokenizer, source_mapper, max_length=200):
    device = next(model.parameters()).device
    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
-            max_length=max_length,
+            max_new_tokens=200,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id
        )
-    
+        
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    
+        answer = answer.replace(question, "").strip()
-    # Wyszukiwanie źródeł
+        
-    sources = set()
+        sources = set()
-    for idx in source_mapper.idx_to_source:
+        for match in re.finditer(r'Art\.\s+\d+', answer):
-        if source_mapper.idx_to_source[idx] in answer:
+            article_ref = match.group(0).strip()
-            sources.add(source_mapper.idx_to_source[idx])
+            for idx, source in source_mapper.idx_to_source.items():
-    
+                if article_ref in source:
-    return {
+                    sources.add(source)
-        "question": question,
+        
-        "answer": answer,
+        return {
-        "sources": list(sources) if sources else ["Opracowanie własne"]
+            "question": question,
-    }
+            "answer": answer,
            "sources": list(sources) if sources else ["Opracowanie własne"]
        }
-# Testowanie
+    # Przykładowe testy
-test_questions = [
+    test_questions = [
-    "Jak brzmi art. 154 kodeksu pracy?"
+        "Jakie są zasady udzielania urlopu wypoczynkowego?",
-]
+        "Co mówi art. 154 kodeksu pracy?",
        "Jakie są obowiązki pracodawcy w zakresie BHP?"
    ]
    print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)
    for question in test_questions:
        result = generate_answer(question)
        print(f"\nPYTANIE: {result['question']}")
        print(f"ODPOWIEDŹ: {result['answer'][:500]}")
        print(f"ŹRÓDŁA: {', '.join(result['sources'])}")
        print("-"*80)
-print("\n=== TEST MODELU ===")
+if __name__ == "__main__":
-for question in test_questions:
+    main()
    result = generate_answer_with_source(question, model, tokenizer, source_mapper)
    print(f"\nPytanie: {result['question']}")
    print(f"Odpowiedź: {result['answer']}")
    print(f"Źródła: {', '.join(result['sources'])}")
    print("="*80)
 # Zapis modelu
 save_directory = "./trained_model"
 os.makedirs(save_directory, exist_ok=True)
 torch.save(model.state_dict(), os.path.join(save_directory, "model.bin"))
 tokenizer.save_pretrained(save_directory)