This commit is contained in:
l.gabrysiak 2025-02-25 21:35:55 +01:00
parent f97eeea435
commit b3f2102b2a
2 changed files with 17 additions and 27 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

44
hft.py
View File

@ -15,7 +15,7 @@ from huggingface_hub import login
# Konfiguracja # Konfiguracja
os.environ['TORCH_USE_CUDA_DSA'] = '1' os.environ['TORCH_USE_CUDA_DSA'] = '1'
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
login(token="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") # Zastąp swoim tokenem HF login(token="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")
class SourceMapper: class SourceMapper:
def __init__(self): def __init__(self):
@ -42,7 +42,7 @@ def load_file_catalog(catalog_path):
return {} return {}
def identify_legal_document(filename, file_catalog): def identify_legal_document(filename, file_catalog):
base_name = os.path.splitext(filename)[0] base_name = os.path.splitext(filename)[0].lower()
return file_catalog.get(base_name, "Opracowanie własne") return file_catalog.get(base_name, "Opracowanie własne")
def extract_text_from_file(file_path): def extract_text_from_file(file_path):
@ -55,10 +55,13 @@ def extract_text_from_file(file_path):
return file.read() return file.read()
elif ext == '.pdf': elif ext == '.pdf':
text = "" text = ""
with open(file_path, 'rb') as file: try:
reader = PyPDF2.PdfReader(file) with open(file_path, 'rb') as file:
for page in reader.pages: reader = PyPDF2.PdfReader(file)
text += page.extract_text() for page in reader.pages:
text += page.extract_text() or ""
except Exception as e:
print(f"Błąd PDF: {str(e)}")
return text return text
elif ext in ['.doc', '.docx']: elif ext in ['.doc', '.docx']:
return docx2txt.process(file_path) return docx2txt.process(file_path)
@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}") print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
if not os.path.exists(directory):
print(f"Brak katalogu: {directory}")
return data
for root, _, files in os.walk(directory): for root, _, files in os.walk(directory):
if not files:
print(f"Brak plików w katalogu: {root}")
continue
for file in files: for file in files:
file_path = os.path.join(root, file) file_path = os.path.join(root, file)
print(f"\nPrzetwarzanie pliku: {file_path}") print(f"\nPrzetwarzanie pliku: {file_path}")
@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper):
print(f"Rozpoznany typ dokumentu: {doc_type}") print(f"Rozpoznany typ dokumentu: {doc_type}")
if doc_type != "Opracowanie własne": if doc_type != "Opracowanie własne":
articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text) # Nowe wyrażenie regularne dla formatu "Art. XX."
articles = re.split(r'(Art\. \d+\.?)', text)
print(f"Znaleziono {len(articles)} fragmentów") print(f"Znaleziono {len(articles)} fragmentów")
if len(articles) < 2:
print("Brak artykułów w dokumencie prawnym!")
continue
for i in range(1, len(articles), 2): for i in range(1, len(articles), 2):
article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE) article_number = articles[i].strip()
article_content = articles[i+1].strip() if i+1 < len(articles) else "" article_content = articles[i+1].strip() if i+1 < len(articles) else ""
if not article_content: if not article_content:
print(f"Pominięto pusty artykuł: {article_number}")
continue continue
source = f"{doc_type}, {article_number}" source = f"{doc_type}, {article_number}"
@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper):
"source_idx": source_mapper.get_idx(source) "source_idx": source_mapper.get_idx(source)
}) })
else: else:
print("Traktowanie jako opracowanie własne")
clean_text = re.sub(r'\s+', ' ', text).strip() clean_text = re.sub(r'\s+', ' ', text).strip()
chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)] chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
chunks = [c for c in chunks if c.strip()] chunks = [c for c in chunks if c.strip()]
@ -276,8 +266,8 @@ def main():
answer = answer.replace(prompt, "").strip() answer = answer.replace(prompt, "").strip()
sources = set() sources = set()
for match in re.finditer(r'(?i)art\.?\s*\d+', answer): for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer):
article_ref = match.group(0).strip() article_ref = match.group(0).strip().rstrip('.')
for source in source_mapper.idx_to_source.values(): for source in source_mapper.idx_to_source.values():
if article_ref.lower() in source.lower(): if article_ref.lower() in source.lower():
sources.add(source) sources.add(source)
@ -290,9 +280,9 @@ def main():
# Testy # Testy
test_questions = [ test_questions = [
"Jakie są zasady udzielania urlopu wypoczynkowego?", "Jakie są prawa pracownika według art. 1?",
"Co mówi art. 154 kodeksu pracy?", "Kto jest pracownikiem według art. 2?",
"Jakie są obowiązki pracodawcy w zakresie BHP?" "Jakie są obowiązki pracodawcy według art. 3?"
] ]
print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50) print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)