Ten kod działa!

2025-02-25 23:32:39 +01:00 · 2025-02-25 23:32:39 +01:00 · a0aab164cb
parent 537e191d5f
commit a0aab164cb
1 changed file with 208 additions and 243 deletions
--- a/hft.py
+++ b/hft.py
@@ -1,296 +1,261 @@
 import os
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
 import numpy as np
 import PyPDF2
 import docx2txt
 import pytesseract
 from PIL import Image
 from collections import defaultdict
 from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
 )
 from datasets import Dataset, Features, Value
 from huggingface_hub import login
 # Konfiguracja
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 login(token="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")
-class LegalAITrainer:
+class SourceMapper:
    def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
        self.idx_to_source = {}
-    class SourceMapper:
+    def add_source(self, source):
-        def __init__(self):
+        if source and source not in self.source_to_idx:
-            self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
+            idx = self.source_to_idx[source]
-            self.idx_to_source = {}
+            self.idx_to_source[idx] = source
-        def add_source(self, source):
+    def get_idx(self, source):
-            if source and source not in self.source_to_idx:
+        return self.source_to_idx[source] if source else -1
                idx = self.source_to_idx[source]
                self.idx_to_source[idx] = source
-        def get_idx(self, source):
+    def get_source(self, idx):
-            return self.source_to_idx[source] if source else -1
+        return self.idx_to_source.get(idx, "Unknown")
-        def get_source(self, idx):
+def load_file_catalog(catalog_path):
-            return self.idx_to_source.get(idx, "Unknown")
+    try:
        with open(catalog_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Błąd wczytywania katalogu plików: {str(e)}")
        return {}
-    class LegalModel(nn.Module):
+def identify_legal_document(filename, file_catalog):
-        def __init__(self, model_name, config):
+    base_name = os.path.splitext(filename)[0].lower()
-            super().__init__()
+    return file_catalog.get(base_name, "Opracowanie własne")
            self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
            self.source_embedding = nn.Embedding(100000, config.hidden_size, padding_idx=-1)
            self.confidence_layer = nn.Linear(config.hidden_size, 1)
-            for param in self.base_model.parameters():
+def extract_text_from_file(file_path):
-                param.requires_grad = False
+    try:
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
-            for layer in [self.source_embedding, self.confidence_layer]:
+        if ext in ['.txt', '.md']:
-                for param in layer.parameters():
+            with open(file_path, 'r', encoding='utf-8') as file:
-                    param.requires_grad = True
+                return file.read()
-
+        elif ext == '.pdf':
-        def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None):
+            text = ""
-            if source_idx is not None:
+            try:
-                source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)
+                with open(file_path, 'rb') as file:
-                source_embeds = self.source_embedding(source_idx).unsqueeze(1)
+                    reader = PyPDF2.PdfReader(file)
                inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
                outputs = self.base_model(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    labels=labels
                )
            else:
                outputs = self.base_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            confidence = torch.sigmoid(self.confidence_layer(outputs.hidden_states[-1].mean(dim=1)))
            return {
                "loss": outputs.loss,
                "logits": outputs.logits,
                "confidence": confidence,
                "hidden_states": outputs.hidden_states
            }
    def load_file_catalog(self, catalog_path):
        try:
            with open(catalog_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Błąd ładowania katalogu: {str(e)}")
            return {}
    def extract_text(self, file_path):
        ext = os.path.splitext(file_path)[1].lower()
        try:
            if ext in ['.txt', '.md']:
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            elif ext == '.pdf':
                text = ""
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text += page.extract_text() or ""
-                return text
+            except Exception as e:
-            elif ext in ['.doc', '.docx']:
+                print(f"Błąd PDF: {str(e)}")
-                return docx2txt.process(file_path)
+            return text
-            elif ext in ['.jpg', '.jpeg', '.png']:
+        elif ext in ['.doc', '.docx']:
-                return pytesseract.image_to_string(Image.open(file_path))
+            return docx2txt.process(file_path)
-            else:
+        elif ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
-                return ""
+            return pytesseract.image_to_string(Image.open(file_path))
-        except Exception as e:
+        else:
-            print(f"Błąd przetwarzania {file_path}: {str(e)}")
+            print(f"Nieobsługiwany format pliku: {ext}")
            return ""
    except Exception as e:
        print(f"Błąd ekstrakcji tekstu: {str(e)}")
        return ""
-    def prepare_data(self, data_dir, catalog_path):
+def prepare_dataset(directory, catalog_path, source_mapper):
-        catalog = self.load_file_catalog(catalog_path)
+    file_catalog = load_file_catalog(catalog_path)
-        data = []
+    data = []
        source_mapper = self.SourceMapper()
-        for root, _, files in os.walk(data_dir):
+    print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
            for file in files:
                file_path = os.path.join(root, file)
                text = self.extract_text(file_path)
-                if not text:
+    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"\nPrzetwarzanie pliku: {file_path}")
            try:
                text = extract_text_from_file(file_path)
                if not text.strip():
                    print("Pominięto - brak tekstu")
                    continue
-                doc_type = catalog.get(os.path.splitext(file)[0].lower(), "Opracowanie własne")
+                print(f"Długość tekstu: {len(text)} znaków")
                doc_type = identify_legal_document(file, file_catalog)
                print(f"Rozpoznany typ dokumentu: {doc_type}")
                if doc_type != "Opracowanie własne":
-                    articles = re.split(r'(?i)(Art\.\s*\d+[a-z]*)', text)
+                    articles = re.split(r'(?i)(Art[\.\s]+\d+[\.\s]?)', text)
-                    for i in range(1, len(articles), 2):
+                    articles = [a.strip() for a in articles if a.strip()]
                        art_num = articles[i].strip()
                        content = articles[i+1].strip()
-                        if len(content) < 100:
+                    print(f"Znaleziono {len(articles)} fragmentów")
                    for i in range(0, len(articles)-1, 2):
                        article_number = articles[i]
                        article_content = articles[i+1]
                        if len(article_content) < 50:
                            continue
-                        source = f"{doc_type}, {art_num}"
+                        source = f"{doc_type}, {article_number}"
                        source_mapper.add_source(source)
                        data.append({
-                            "text": f"[LEGAL] {art_num} {content}",
+                            "text": f"{article_number} {article_content}",
-                            "source_idx": source_mapper.get_idx(source),
+                            "source_idx": source_mapper.get_idx(source)
                            "is_legal": 1
                        })
                else:
-                    chunks = [f"[GENERAL] {text[i:i+512]}" for i in range(0, len(text), 512)]
+                    clean_text = re.sub(r'\s+', ' ', text).strip()
                    chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
                    chunks = [c for c in chunks if c.strip()]
                    for chunk in chunks:
                        data.append({
                            "text": chunk,
-                            "source_idx": -1,
+                            "source_idx": -1
                            "is_legal": 0
                        })
                    print(f"Dodano {len(chunks)} chunków")
-        features = Features({
+            except Exception as e:
-            "text": Value("string"),
+                print(f"Błąd podczas przetwarzania pliku: {str(e)}")
-            "source_idx": Value("int32"),
+                continue
            "is_legal": Value("int32")
        })
-        return Dataset.from_dict({
+    print(f"\nPodsumowanie przygotowania danych:")
-            "text": [d["text"] for d in data],
+    print(f"Łączna liczba przykładów: {len(data)}")
-            "source_idx": np.array([d["source_idx"] for d in data], dtype=np.int32),
+    if data:
-            "is_legal": np.array([d["is_legal"] for d in data], dtype=np.int32)
+        print("Przykładowy wpis:")
-        }, features=features), source_mapper
+        print(json.dumps(data[0], indent=2, ensure_ascii=False))
    else:
        print("BRAK DANYCH - sprawdź diagnostykę powyżej")
-    def train(self, model_name="crumb/nano-mistral", data_dir="data", catalog_path="catalog.json"):
+    return data
        dataset, source_mapper = self.prepare_data(data_dir, catalog_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
-        def tokenize_fn(examples):
+class CustomModel(nn.Module):
-            tokenized = tokenizer(
+    def __init__(self, model_name, config):
-                examples["text"],
+        super().__init__()
-                padding="max_length",
+        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
-                truncation=True,
+        self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)
-                max_length=512,
+        
-                return_tensors="pt"
+        for param in self.base_model.parameters():
            param.requires_grad = False
        for param in self.base_model.get_output_embeddings().parameters():
            param.requires_grad = True
    def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
        if source_idx is not None:
            valid_indices = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)
            source_embeds = self.source_embedding(valid_indices).unsqueeze(1)
            inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
            return self.base_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                labels=labels,
                **kwargs
            )
-            return {
+        return self.base_model(
-                "input_ids": tokenized["input_ids"].squeeze().tolist(),
+            input_ids=input_ids,
-                "attention_mask": tokenized["attention_mask"].squeeze().tolist(),
+            attention_mask=attention_mask,
-                "labels": tokenized["input_ids"].squeeze().clone().tolist(),
+            labels=labels,
-                "source_idx": examples["source_idx"]
+            **kwargs
            }
        tokenized_dataset = dataset.map(tokenize_fn, batched=True, batch_size=16)
        class CustomDataCollator(DataCollatorForLanguageModeling):
            def torch_call(self, examples):
                batch = super().torch_call(examples)
                if "source_idx" in examples[0]:
                    batch["source_idx"] = torch.tensor(
                        [ex["source_idx"] for ex in examples], 
                        dtype=torch.int32
                    )
                return batch
        config = AutoModelForCausalLM.from_pretrained(model_name).config
        model = self.LegalModel(model_name, config).to(self.device)
        training_args = TrainingArguments(
            output_dir="./legal_ai_model",
            num_train_epochs=3,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            fp16=torch.cuda.is_available(),
            logging_steps=50,
            save_strategy="steps",
            save_steps=500,
            report_to="none",
            remove_unused_columns=False
        )
-        class LegalTrainer(Trainer):
+    def generate(self, *args, **kwargs):
-            def compute_loss(self, model, inputs, return_outputs=False):
+        return self.base_model.generate(*args, **kwargs)
                outputs = model(**inputs)
                loss = outputs["loss"]
-                target_conf = (inputs["source_idx"] != -1).float()
+class CustomDataCollator(DataCollatorForLanguageModeling):
-                conf_loss = nn.BCELoss()(outputs["confidence"].squeeze(), target_conf)
+    def torch_call(self, examples):
        # Przetwórz podstawowe pola
        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
        attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
-                total_loss = loss + 0.7 * conf_loss
+        batch = {
-                return (total_loss, outputs) if return_outputs else total_loss
+            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }
-        trainer = LegalTrainer(
+        # Dodaj source_idx jeśli istnieje
-            model=model,
+        if "source_idx" in examples[0]:
-            args=training_args,
+            source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
-            train_dataset=tokenized_dataset,
+            batch["source_idx"] = source_idx
            data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)
        )
-        print("Rozpoczęcie treningu...")
+        return batch
        trainer.train()
-        model.save_pretrained("./trained_legal_ai")
+def main():
-        tokenizer.save_pretrained("./trained_legal_ai")
+    source_mapper = SourceMapper()
-        with open("./trained_legal_ai/source_mapper.json", "w") as f:
+    model_name = "crumb/nano-mistral"
-            json.dump(source_mapper.idx_to_source, f)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
-        print("Trening zakończony!")
+    # Przygotowanie danych
    catalog_path = "file_catalog.json"
    data = prepare_dataset("files", catalog_path, source_mapper)
-    def generate_response(self, prompt, confidence_threshold=0.65):
+    if not data:
-        model = self.LegalModel.from_pretrained(
+        print("\nBrak danych do treningu!")
-            "./trained_legal_ai",
+        return
            config=AutoModelForCausalLM.from_pretrained("crumb/nano-mistral").config
        ).to(self.device)
-        tokenizer = AutoTokenizer.from_pretrained("./trained_legal_ai")
+    #dataset = Dataset.from_list(data)
    dataset = Dataset.from_dict({k: [d[k] for d in data] for k in data[0]})
        with open("./trained_legal_ai/source_mapper.json", "r") as f:
            source_mapper = json.load(f)
-        inputs = tokenizer(
+    def tokenize_function(examples):
-            f"[PROMPT] {prompt} [RESPONSE]",
+        tokenized = tokenizer(
-            return_tensors="pt",
+            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512,
-            truncation=True
+            return_tensors="pt"
-        ).to(self.device)
+        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
            "labels": tokenized["input_ids"].squeeze().clone(),
            "source_idx": examples["source_idx"]  # Dodano bez konwersji do tensora
        }
-        with torch.no_grad():
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
            outputs = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=512,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                pad_token_id=tokenizer.eos_token_id,
                output_scores=True,
                return_dict_in_generate=True
            )
-        full_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+    model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
-        confidence = torch.sigmoid(outputs.scores[-1][:, tokenizer.eos_token_id]).item()
+    model.source_mapper = source_mapper
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
-        citations = list(set(re.findall(r"Art\.\s*\d+[a-z]*", full_text)))
+    training_args = TrainingArguments(
-        verified = [c for c in citations if any(c in s for s in source_mapper.values())]
+        output_dir="./results",
-
+        num_train_epochs=3,
-        if confidence < confidence_threshold or not verified:
+        per_device_train_batch_size=2,
-            return "Nie mogę udzielić jednoznacznej odpowiedzi na podstawie dostępnych danych."
+        gradient_accumulation_steps=4,
-        else:
+        learning_rate=2e-5,
-            return f"{full_text}\n\nPotwierdzone źródła: {', '.join(verified)}"
+        fp16=torch.cuda.is_available(),
-
+        logging_steps=10,
-if __name__ == "__main__":
+        save_strategy="steps",
-    legal_ai = LegalAITrainer()
+        save_steps=1000,
-    
+        report_to="none",
-    legal_ai.train(
+        remove_unused_columns=False
        model_name="crumb/nano-mistral",
        data_dir="./legal_docs",
        catalog_path="./catalog.json"
    )
-    test_prompt = "Jakie są kary za nieprzestrzeganie przepisów RODO?"
+    trainer = Trainer(
-    print(legal_ai.generate_response(test_prompt))
+        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)
    )
    print("\nRozpoczęcie treningu...")
    trainer.train()
 if __name__ == "__main__":
    main()