From b3f2102b2a88582eeb4cccf3abda33a36d00d546 Mon Sep 17 00:00:00 2001 From: "l.gabrysiak" Date: Tue, 25 Feb 2025 21:35:55 +0100 Subject: [PATCH] mod --- .DS_Store | Bin 0 -> 6148 bytes hft.py | 44 +++++++++++++++++--------------------------- 2 files changed, 17 insertions(+), 27 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..b2cf37d250f33e6f3721afac6800921711fd979d GIT binary patch literal 6148 zcmeHKOH0E*5Z>*>rW7Fug&qT53-*O5UP7#Yz=$4HYC@t0W41J@Ig~=q`iJ}_{vKy` zH_&46C}L+|_nV!^ZsvpRgE7XPMKEB@WQ-Zm5IHIpg62wBO$Q@#IY;W}>CBIXUop+! zG~u^5*#nDM%tE&Q{U81;j?%2(|KzoLqq);ET1LmXbDu=+W?r^P9dCYvqe~%Tztlbd zDhl(dxpyX#%!`t6E)$|4gpk|oC<#RFh(!{lGS^iOqiwXOW_P(9j|M$!d^}wBtmVnD z5Blh6wQ3vt2ZyH@)93gl5pRk}4vb6L)mXwiC~FnHy7MF!$s<_Hj53mt7$63S0b*b~ z8PMl}(b>-OscK??82Et!+#f7xh_=Q;q1-y4!|OBp8;B^N<68pJ(r9Ze6oLnYt5iUh z%Jmb2t8}niI?mQuC{*c;%avgsy>j_@;c|7bTQZz+TOswt05P!1KwS+@Jpa$(m#KW@ zZ>G?Q7$64z83VjAaVHKGrO(!H<>6UtLA!^Bf^h{ZAfRtt0$_mqNLMkCN UoCWPN9gr>pk`U^MfnQ+Y3t@Ol-v9sr literal 0 HcmV?d00001 diff --git a/hft.py b/hft.py index 11798dd..4e77248 100644 --- a/hft.py +++ b/hft.py @@ -15,7 +15,7 @@ from huggingface_hub import login # Konfiguracja os.environ['TORCH_USE_CUDA_DSA'] = '1' os.environ["TOKENIZERS_PARALLELISM"] = "false" -login(token="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") # Zastąp swoim tokenem HF +login(token="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") class SourceMapper: def __init__(self): @@ -42,7 +42,7 @@ def load_file_catalog(catalog_path): return {} def identify_legal_document(filename, file_catalog): - base_name = os.path.splitext(filename)[0] + base_name = os.path.splitext(filename)[0].lower() return file_catalog.get(base_name, "Opracowanie własne") def extract_text_from_file(file_path): @@ -55,10 +55,13 @@ def extract_text_from_file(file_path): return file.read() elif ext == '.pdf': text = "" - with open(file_path, 'rb') as file: - reader = PyPDF2.PdfReader(file) - for page in reader.pages: - text += page.extract_text() + try: + with open(file_path, 'rb') as file: + reader = PyPDF2.PdfReader(file) + for page in reader.pages: + text += page.extract_text() or "" + except Exception as e: + print(f"Błąd PDF: {str(e)}") return text elif ext in ['.doc', '.docx']: return docx2txt.process(file_path) @@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper): print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}") - if not os.path.exists(directory): - print(f"Brak katalogu: {directory}") - return data - for root, _, files in os.walk(directory): - if not files: - print(f"Brak plików w katalogu: {root}") - continue - for file in files: file_path = os.path.join(root, file) print(f"\nPrzetwarzanie pliku: {file_path}") @@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper): print(f"Rozpoznany typ dokumentu: {doc_type}") if doc_type != "Opracowanie własne": - articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text) + # Nowe wyrażenie regularne dla formatu "Art. XX." + articles = re.split(r'(Art\. \d+\.?)', text) print(f"Znaleziono {len(articles)} fragmentów") - if len(articles) < 2: - print("Brak artykułów w dokumencie prawnym!") - continue - for i in range(1, len(articles), 2): - article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE) + article_number = articles[i].strip() article_content = articles[i+1].strip() if i+1 < len(articles) else "" if not article_content: - print(f"Pominięto pusty artykuł: {article_number}") continue source = f"{doc_type}, {article_number}" @@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper): "source_idx": source_mapper.get_idx(source) }) else: - print("Traktowanie jako opracowanie własne") clean_text = re.sub(r'\s+', ' ', text).strip() chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)] chunks = [c for c in chunks if c.strip()] @@ -276,8 +266,8 @@ def main(): answer = answer.replace(prompt, "").strip() sources = set() - for match in re.finditer(r'(?i)art\.?\s*\d+', answer): - article_ref = match.group(0).strip() + for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer): + article_ref = match.group(0).strip().rstrip('.') for source in source_mapper.idx_to_source.values(): if article_ref.lower() in source.lower(): sources.add(source) @@ -290,9 +280,9 @@ def main(): # Testy test_questions = [ - "Jakie są zasady udzielania urlopu wypoczynkowego?", - "Co mówi art. 154 kodeksu pracy?", - "Jakie są obowiązki pracodawcy w zakresie BHP?" + "Jakie są prawa pracownika według art. 1?", + "Kto jest pracownikiem według art. 2?", + "Jakie są obowiązki pracodawcy według art. 3?" ] print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)