From 61c45041aaf46614e431b37facc668b50960f3d6 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak" <l.gabrysiak@osadkowski.pl>
Date: Wed, 26 Feb 2025 09:41:42 +0100
Subject: [PATCH] gpt: fix article regex and split articles into paragraphs

---
 gpt.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/gpt.py b/gpt.py
index 46ef5db..7ca8d7d 100644
--- a/gpt.py
+++ b/gpt.py
@@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
         text = f.read()
 
     # Wydziel artykuły za pomocą wyrażenia regularnego
-    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL)
+    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
     
     formatted_articles = []
     for article in articles:
         # Usuń zbędne białe znaki
         article = ' '.join(article.strip().split())
         
-        # Wydziel numer artykułu
-        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL)
+        # Wydziel numer artykułu i treść
+        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
         if art_match:
             art_number = art_match.group(1)
             art_text = art_match.group(2)
-            formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
+            
+            # Podziel na paragrafy, jeśli istnieją
+            paragraphs = re.split(r'(§\s*\d+\.)', art_text)
+            if len(paragraphs) > 1:
+                formatted_paragraphs = []
+                for i in range(1, len(paragraphs), 2):
+                    para_num = paragraphs[i].strip()
+                    para_text = paragraphs[i+1].strip()
+                    formatted_paragraphs.append(f"{para_num} {para_text}")
+                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
+            else:
+                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
+            
             formatted_articles.append({"text": formatted})
     
     return formatted_articles
 
+
 def main():
     # Inicjalizacja tokenizera
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)