mod

2025-02-25 21:35:55 +01:00 · 2025-02-25 21:35:55 +01:00 · b3f2102b2a
parent f97eeea435
commit b3f2102b2a
2 changed files with 17 additions and 27 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/hft.py
+++ b/hft.py
@@ -15,7 +15,7 @@ from huggingface_hub import login
 # Konfiguracja
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-login(token="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")  # Zastąp swoim tokenem HF
+login(token="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")

 class SourceMapper:
    def __init__(self):
@@ -42,7 +42,7 @@ def load_file_catalog(catalog_path):
        return {}

 def identify_legal_document(filename, file_catalog):
-    base_name = os.path.splitext(filename)[0]
+    base_name = os.path.splitext(filename)[0].lower()
    return file_catalog.get(base_name, "Opracowanie własne")

 def extract_text_from_file(file_path):
@@ -55,10 +55,13 @@ def extract_text_from_file(file_path):
                return file.read()
        elif ext == '.pdf':
            text = ""
+            try:
                with open(file_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    for page in reader.pages:
-                    text += page.extract_text()
+                        text += page.extract_text() or ""
+            except Exception as e:
+                print(f"Błąd PDF: {str(e)}")
            return text
        elif ext in ['.doc', '.docx']:
            return docx2txt.process(file_path)
@@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
    
    print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
    
-    if not os.path.exists(directory):
-        print(f"Brak katalogu: {directory}")
-        return data
-        
    for root, _, files in os.walk(directory):
-        if not files:
-            print(f"Brak plików w katalogu: {root}")
-            continue
-            
        for file in files:
            file_path = os.path.join(root, file)
            print(f"\nPrzetwarzanie pliku: {file_path}")
@@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                print(f"Rozpoznany typ dokumentu: {doc_type}")
                
                if doc_type != "Opracowanie własne":
-                    articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text)
+                    # Nowe wyrażenie regularne dla formatu "Art. XX."
+                    articles = re.split(r'(Art\. \d+\.?)', text)
                    print(f"Znaleziono {len(articles)} fragmentów")
                    
-                    if len(articles) < 2:
-                        print("Brak artykułów w dokumencie prawnym!")
-                        continue
-                        
                    for i in range(1, len(articles), 2):
-                        article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE)
+                        article_number = articles[i].strip()
                        article_content = articles[i+1].strip() if i+1 < len(articles) else ""
                        
                        if not article_content:
-                            print(f"Pominięto pusty artykuł: {article_number}")
                            continue
                            
                        source = f"{doc_type}, {article_number}"
@@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                            "source_idx": source_mapper.get_idx(source)
                        })
                else:
-                    print("Traktowanie jako opracowanie własne")
                    clean_text = re.sub(r'\s+', ' ', text).strip()
                    chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
                    chunks = [c for c in chunks if c.strip()]
@@ -276,8 +266,8 @@ def main():
        answer = answer.replace(prompt, "").strip()
        
        sources = set()
-        for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
-            article_ref = match.group(0).strip()
+        for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer):
+            article_ref = match.group(0).strip().rstrip('.')
            for source in source_mapper.idx_to_source.values():
                if article_ref.lower() in source.lower():
                    sources.add(source)
@@ -290,9 +280,9 @@ def main():

    # Testy
    test_questions = [
-        "Jakie są zasady udzielania urlopu wypoczynkowego?",
-        "Co mówi art. 154 kodeksu pracy?",
-        "Jakie są obowiązki pracodawcy w zakresie BHP?"
+        "Jakie są prawa pracownika według art. 1?",
+        "Kto jest pracownikiem według art. 2?",
+        "Jakie są obowiązki pracodawcy według art. 3?"
    ]
    
    print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)