From 61c45041aaf46614e431b37facc668b50960f3d6 Mon Sep 17 00:00:00 2001 From: "l.gabrysiak" Date: Wed, 26 Feb 2025 09:41:42 +0100 Subject: [PATCH] mod gpt --- gpt.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/gpt.py b/gpt.py index 46ef5db..7ca8d7d 100644 --- a/gpt.py +++ b/gpt.py @@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path): text = f.read() # Wydziel artykuły za pomocą wyrażenia regularnego - articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL) + articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL) formatted_articles = [] for article in articles: # Usuń zbędne białe znaki article = ' '.join(article.strip().split()) - # Wydziel numer artykułu - art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL) + # Wydziel numer artykułu i treść + art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL) if art_match: art_number = art_match.group(1) art_text = art_match.group(2) - formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}" + + # Podziel na paragrafy, jeśli istnieją + paragraphs = re.split(r'(§\s*\d+\.)', art_text) + if len(paragraphs) > 1: + formatted_paragraphs = [] + for i in range(1, len(paragraphs), 2): + para_num = paragraphs[i].strip() + para_text = paragraphs[i+1].strip() + formatted_paragraphs.append(f"{para_num} {para_text}") + formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs) + else: + formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}" + formatted_articles.append({"text": formatted}) return formatted_articles + def main(): # Inicjalizacja tokenizera tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)