mod
This commit is contained in:
parent
f97eeea435
commit
b3f2102b2a
38
hft.py
38
hft.py
|
|
@ -15,7 +15,7 @@ from huggingface_hub import login
|
|||
# Configuration
# NOTE(review): TORCH_USE_CUDA_DSA is presumably meant to enable CUDA
# device-side assertions in PyTorch - confirm it has an effect when set at
# runtime rather than at build time.
os.environ['TORCH_USE_CUDA_DSA'] = '1'
# Turn off parallelism in the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Security: the access token must not live in source control. It is read from
# the HF_TOKEN environment variable instead; when the variable is unset,
# token=None is passed and huggingface_hub falls back to asking for a token.
# NOTE(review): the token that was previously committed here is exposed in the
# repository history and should be revoked.
login(token=os.environ.get("HF_TOKEN"))
|
||||
|
||||
class SourceMapper:
|
||||
def __init__(self):
|
||||
|
|
@ -42,7 +42,7 @@ def load_file_catalog(catalog_path):
|
|||
return {}
|
||||
|
||||
def identify_legal_document(filename, file_catalog):
    """Look up the document type for a file in the catalog.

    The extension is dropped and the remaining name is lower-cased before the
    lookup, so the match does not depend on the case of the file name.

    Args:
        filename: Name of the file to classify.
        file_catalog: Mapping from base file name to document type.
            Keys are presumably stored lower-cased; verify against the
            code that builds the catalog.

    Returns:
        The catalog entry for the file, or "Opracowanie własne" when the
        catalog has no entry for it.
    """
    base_name = os.path.splitext(filename)[0].lower()
    return file_catalog.get(base_name, "Opracowanie własne")
|
||||
|
||||
def extract_text_from_file(file_path):
|
||||
|
|
@ -55,10 +55,13 @@ def extract_text_from_file(file_path):
|
|||
return file.read()
|
||||
elif ext == '.pdf':
|
||||
text = ""
|
||||
try:
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
for page in reader.pages:
|
||||
text += page.extract_text()
|
||||
text += page.extract_text() or ""
|
||||
except Exception as e:
|
||||
print(f"Błąd PDF: {str(e)}")
|
||||
return text
|
||||
elif ext in ['.doc', '.docx']:
|
||||
return docx2txt.process(file_path)
|
||||
|
|
@ -77,15 +80,7 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
|||
|
||||
print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
|
||||
|
||||
if not os.path.exists(directory):
|
||||
print(f"Brak katalogu: {directory}")
|
||||
return data
|
||||
|
||||
for root, _, files in os.walk(directory):
|
||||
if not files:
|
||||
print(f"Brak plików w katalogu: {root}")
|
||||
continue
|
||||
|
||||
for file in files:
|
||||
file_path = os.path.join(root, file)
|
||||
print(f"\nPrzetwarzanie pliku: {file_path}")
|
||||
|
|
@ -102,19 +97,15 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
|||
print(f"Rozpoznany typ dokumentu: {doc_type}")
|
||||
|
||||
if doc_type != "Opracowanie własne":
|
||||
articles = re.split(r'(?i)(#+\s*art\.?\s*\d+[\.\s]?)', text)
|
||||
# Nowe wyrażenie regularne dla formatu "Art. XX."
|
||||
articles = re.split(r'(Art\. \d+\.?)', text)
|
||||
print(f"Znaleziono {len(articles)} fragmentów")
|
||||
|
||||
if len(articles) < 2:
|
||||
print("Brak artykułów w dokumencie prawnym!")
|
||||
continue
|
||||
|
||||
for i in range(1, len(articles), 2):
|
||||
article_number = re.sub(r'#+\s*', '', articles[i].strip(), flags=re.IGNORECASE)
|
||||
article_number = articles[i].strip()
|
||||
article_content = articles[i+1].strip() if i+1 < len(articles) else ""
|
||||
|
||||
if not article_content:
|
||||
print(f"Pominięto pusty artykuł: {article_number}")
|
||||
continue
|
||||
|
||||
source = f"{doc_type}, {article_number}"
|
||||
|
|
@ -126,7 +117,6 @@ def prepare_dataset(directory, catalog_path, source_mapper):
|
|||
"source_idx": source_mapper.get_idx(source)
|
||||
})
|
||||
else:
|
||||
print("Traktowanie jako opracowanie własne")
|
||||
clean_text = re.sub(r'\s+', ' ', text).strip()
|
||||
chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
|
||||
chunks = [c for c in chunks if c.strip()]
|
||||
|
|
@ -276,8 +266,8 @@ def main():
|
|||
answer = answer.replace(prompt, "").strip()
|
||||
|
||||
sources = set()
|
||||
for match in re.finditer(r'(?i)art\.?\s*\d+', answer):
|
||||
article_ref = match.group(0).strip()
|
||||
for match in re.finditer(r'(?i)art\.?\s*\d+\.?', answer):
|
||||
article_ref = match.group(0).strip().rstrip('.')
|
||||
for source in source_mapper.idx_to_source.values():
|
||||
if article_ref.lower() in source.lower():
|
||||
sources.add(source)
|
||||
|
|
@ -290,9 +280,9 @@ def main():
|
|||
|
||||
# Testy
|
||||
test_questions = [
|
||||
"Jakie są zasady udzielania urlopu wypoczynkowego?",
|
||||
"Co mówi art. 154 kodeksu pracy?",
|
||||
"Jakie są obowiązki pracodawcy w zakresie BHP?"
|
||||
"Jakie są prawa pracownika według art. 1?",
|
||||
"Kto jest pracownikiem według art. 2?",
|
||||
"Jakie są obowiązki pracodawcy według art. 3?"
|
||||
]
|
||||
|
||||
print("\n" + "="*50 + "\nWYNIKI TESTOW\n" + "="*50)
|
||||
|
|
|
|||
Loading…
Reference in New Issue