From b3f2102b2a88582eeb4cccf3abda33a36d00d546 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak" <l.gabrysiak@osadkowski.pl>
Date: Tue, 25 Feb 2025 21:35:55 +0100
Subject: [PATCH] hft.py: split articles on "Art. N." headings, guard PDF extraction, trim diagnostics

---
 .DS_Store | Bin 0 -> 6148 bytes
 hft.py    |  44 +++++++++++++++++---------------------------
 2 files changed, 17 insertions(+), 27 deletions(-)
 create mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..b2cf37d250f33e6f3721afac6800921711fd979d
GIT binary patch
literal 6148
zcmeHKOH0E*5Z>*>rW7Fug&qT53-*O5UP7#Yz=$4HYC@t0W41J@Ig~=q`iJ}_{vKy`
zH_&46C}L+|_nV!^ZsvpRgE7XPMKEB@WQ-Zm5IHIpg62wBO$Q@#IY;W}>CBIXUop+!
zG~u^5*#nDM%tE&Q{U81;j?%2(|KzoLqq);ET1LmXbDu=+W?r^P9dCYvqe~%Tztlbd
zDhl(dxpyX#%!`t6E)$|4gpk|oC<#RFh(!{lGS^iOqiwXOW_P(9j|M$!d^}wBtmVnD
z5Blh6wQ3vt2ZyH@)93gl5pRk}4vb6L)mXwiC~FnHy7MF!$s<_Hj53mt7$63S0b*b~
z8PMl}(b>-OscK??82Et!+#f7xh_=Q;q1-y4!|OBp8;B^N<68pJ(r9Ze6oLnYt5iUh
z%Jmb2t8}niI?mQuC{*c;%avgsy>j_@;c|7bTQZz+TOswt05P!1KwS+@Jpa$(m#KW@
zZ>G?Q7$64z83VjAaVHKGrO(!H<>6UtLA!^Bf^h{ZAfRtt0$_mqNLM<Q-$EVYY>kCN
UoCWPN9gr>pk`U^MfnQ+Y3t@Ol-v9sr

literal 0
HcmV?d00001

diff --git a/hft.py b/hft.py
index 11798dd..4e77248 100644
--- a/hft.py
+++ b/hft.py
@@ -15,7 +15,7 @@ from huggingface_hub import login
 # Konfiguracja
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-login(token="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")  # Zastąp swoim tokenem HF
+login(token=os.environ.get("HF_TOKEN"))  # read the token from the environment; never commit secrets
 
 class SourceMapper:
     def __init__(self):
@@ -42,7 +42,7 @@ def load_file_catalog(catalog_path):
         return {}
 
 def identify_legal_document(filename, file_catalog):
-    base_name = os.path.splitext(filename)[0]
+    base_name = os.path.splitext(filename)[0].lower()
     return file_catalog.get(base_name, "Opracowanie własne")
 
 def extract_text_from_file(file_path):
@@ -55,10 +55,13 @@ def extract_text_from_file(file_path):
                 return file.read()
         elif ext == '.pdf':
             text = ""
-            with open(file_path, 'rb') as file:
-                reader = PyPDF2.PdfReader(file)
-                for page in reader.pages:
-                    text += page.extract_text()
+            try:
+                with open(file_path, 'rb') as file:
+                    reader = PyPDF2.PdfReader(file)
+                    for page in reader.pages:
+                        text += page.extract_text() or ""
+            except Exception as e:
+                print(f"Błąd PDF: {str(e)}")
             return text
         elif ext in ['.doc', '.docx']:
             return docx2txt.process(file_path)
@@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
     
     print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
     
-    if not os.path.exists(directory):
-        print(f"Brak katalogu: {directory}")
-        return data
-        
     for root, _, files in os.walk(directory):
-        if not files:
-            print(f"Brak plików w katalogu: {root}")
-            continue
-            
         for file in files:
             file_path = os.path.join(root, file)
             print(f"\nPrzetwarzanie pliku: {file_path}")
@@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                 print(f"Rozpoznany typ dokumentu: {doc_type}")
                 
                 if doc_type != "Opracowanie własne":
-                    articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text)
+                    # Nowe wyrażenie regularne dla formatu "Art. XX."
+                    articles = re.split(r'(Art\. \d+\.?)', text)
                     print(f"Znaleziono {len(articles)} fragmentów")
                     
-                    if len(articles) < 2:
-                        print("Brak artykułów w dokumencie prawnym!")
-                        continue
-                        
                     for i in range(1, len(articles), 2):
-                        article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE)
+                        article_number = articles[i].strip()
                         article_content = articles[i+1].strip() if i+1 < len(articles) else ""
                         
                         if not article_content:
-                            print(f"Pominięto pusty artykuł: {article_number}")
                             continue
                             
                         source = f"{doc_type}, {article_number}"
@@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper):
                             "source_idx": source_mapper.get_idx(source)
                         })
                 else:
-                    print("Traktowanie jako opracowanie własne")
                     clean_text = re.sub(r'\s+', ' ', text).strip()
                     chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
                     chunks = [c for c in chunks if c.strip()]
@@ -276,8 +266,8 @@ def main():
         answer = answer.replace(prompt, "").strip()
         
         sources = set()
-        for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
-            article_ref = match.group(0).strip()
+        for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer):
+            article_ref = match.group(0).strip().rstrip('.')
             for source in source_mapper.idx_to_source.values():
                 if article_ref.lower() in source.lower():
                     sources.add(source)
@@ -290,9 +280,9 @@ def main():
 
     # Testy
     test_questions = [
-        "Jakie są zasady udzielania urlopu wypoczynkowego?",
-        "Co mówi art. 154 kodeksu pracy?",
-        "Jakie są obowiązki pracodawcy w zakresie BHP?"
+        "Jakie są prawa pracownika według art. 1?",
+        "Kto jest pracownikiem według art. 2?",
+        "Jakie są obowiązki pracodawcy według art. 3?"
     ]
     
     print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)