mod
This commit is contained in:
parent
f97eeea435
commit
b3f2102b2a
38
hft.py
38
hft.py
|
|
@ -15,7 +15,7 @@ from huggingface_hub import login
|
||||||
# Konfiguracja
|
# Konfiguracja
|
||||||
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
login(token="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") # Zastąp swoim tokenem HF
|
login(token="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")
|
||||||
|
|
||||||
class SourceMapper:
|
class SourceMapper:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -42,7 +42,7 @@ def load_file_catalog(catalog_path):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def identify_legal_document(filename, file_catalog):
|
def identify_legal_document(filename, file_catalog):
|
||||||
base_name = os.path.splitext(filename)[0]
|
base_name = os.path.splitext(filename)[0].lower()
|
||||||
return file_catalog.get(base_name, "Opracowanie własne")
|
return file_catalog.get(base_name, "Opracowanie własne")
|
||||||
|
|
||||||
def extract_text_from_file(file_path):
|
def extract_text_from_file(file_path):
|
||||||
|
|
@ -55,10 +55,13 @@ def extract_text_from_file(file_path):
|
||||||
return file.read()
|
return file.read()
|
||||||
elif ext == '.pdf':
|
elif ext == '.pdf':
|
||||||
text = ""
|
text = ""
|
||||||
|
try:
|
||||||
with open(file_path, 'rb') as file:
|
with open(file_path, 'rb') as file:
|
||||||
reader = PyPDF2.PdfReader(file)
|
reader = PyPDF2.PdfReader(file)
|
||||||
for page in reader.pages:
|
for page in reader.pages:
|
||||||
text += page.extract_text()
|
text += page.extract_text() or ""
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Błąd PDF: {str(e)}")
|
||||||
return text
|
return text
|
||||||
elif ext in ['.doc', '.docx']:
|
elif ext in ['.doc', '.docx']:
|
||||||
return docx2txt.process(file_path)
|
return docx2txt.process(file_path)
|
||||||
|
|
@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
||||||
|
|
||||||
print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
|
print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
|
||||||
|
|
||||||
if not os.path.exists(directory):
|
|
||||||
print(f"Brak katalogu: {directory}")
|
|
||||||
return data
|
|
||||||
|
|
||||||
for root, _, files in os.walk(directory):
|
for root, _, files in os.walk(directory):
|
||||||
if not files:
|
|
||||||
print(f"Brak plików w katalogu: {root}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
file_path = os.path.join(root, file)
|
file_path = os.path.join(root, file)
|
||||||
print(f"\nPrzetwarzanie pliku: {file_path}")
|
print(f"\nPrzetwarzanie pliku: {file_path}")
|
||||||
|
|
@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
||||||
print(f"Rozpoznany typ dokumentu: {doc_type}")
|
print(f"Rozpoznany typ dokumentu: {doc_type}")
|
||||||
|
|
||||||
if doc_type != "Opracowanie własne":
|
if doc_type != "Opracowanie własne":
|
||||||
articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text)
|
# Nowe wyrażenie regularne dla formatu "Art. XX."
|
||||||
|
articles = re.split(r'(Art\. \d+\.?)', text)
|
||||||
print(f"Znaleziono {len(articles)} fragmentów")
|
print(f"Znaleziono {len(articles)} fragmentów")
|
||||||
|
|
||||||
if len(articles) < 2:
|
|
||||||
print("Brak artykułów w dokumencie prawnym!")
|
|
||||||
continue
|
|
||||||
|
|
||||||
for i in range(1, len(articles), 2):
|
for i in range(1, len(articles), 2):
|
||||||
article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE)
|
article_number = articles[i].strip()
|
||||||
article_content = articles[i+1].strip() if i+1 < len(articles) else ""
|
article_content = articles[i+1].strip() if i+1 < len(articles) else ""
|
||||||
|
|
||||||
if not article_content:
|
if not article_content:
|
||||||
print(f"Pominięto pusty artykuł: {article_number}")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
source = f"{doc_type}, {article_number}"
|
source = f"{doc_type}, {article_number}"
|
||||||
|
|
@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
||||||
"source_idx": source_mapper.get_idx(source)
|
"source_idx": source_mapper.get_idx(source)
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
print("Traktowanie jako opracowanie własne")
|
|
||||||
clean_text = re.sub(r'\s+', ' ', text).strip()
|
clean_text = re.sub(r'\s+', ' ', text).strip()
|
||||||
chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
|
chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
|
||||||
chunks = [c for c in chunks if c.strip()]
|
chunks = [c for c in chunks if c.strip()]
|
||||||
|
|
@ -276,8 +266,8 @@ def main():
|
||||||
answer = answer.replace(prompt, "").strip()
|
answer = answer.replace(prompt, "").strip()
|
||||||
|
|
||||||
sources = set()
|
sources = set()
|
||||||
for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
|
for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer):
|
||||||
article_ref = match.group(0).strip()
|
article_ref = match.group(0).strip().rstrip('.')
|
||||||
for source in source_mapper.idx_to_source.values():
|
for source in source_mapper.idx_to_source.values():
|
||||||
if article_ref.lower() in source.lower():
|
if article_ref.lower() in source.lower():
|
||||||
sources.add(source)
|
sources.add(source)
|
||||||
|
|
@ -290,9 +280,9 @@ def main():
|
||||||
|
|
||||||
# Testy
|
# Testy
|
||||||
test_questions = [
|
test_questions = [
|
||||||
"Jakie są zasady udzielania urlopu wypoczynkowego?",
|
"Jakie są prawa pracownika według art. 1?",
|
||||||
"Co mówi art. 154 kodeksu pracy?",
|
"Kto jest pracownikiem według art. 2?",
|
||||||
"Jakie są obowiązki pracodawcy w zakresie BHP?"
|
"Jakie są obowiązki pracodawcy według art. 3?"
|
||||||
]
|
]
|
||||||
|
|
||||||
print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)
|
print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue