mod gpt

2025-02-26 09:41:42 +01:00 · 2025-02-26 09:41:42 +01:00 · 61c45041aa
parent 2cdf954b50
commit 61c45041aa
1 changed files with 17 additions and 4 deletions
--- a/gpt.py
+++ b/gpt.py
@@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
        text = f.read()

    # Wydziel artykuły za pomocą wyrażenia regularnego
-    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL)
+    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
    
    formatted_articles = []
    for article in articles:
        # Usuń zbędne białe znaki
        article = ' '.join(article.strip().split())
        
-        # Wydziel numer artykułu
-        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL)
+        # Wydziel numer artykułu i treść
+        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
        if art_match:
            art_number = art_match.group(1)
            art_text = art_match.group(2)
+            
+            # Podziel na paragrafy, jeśli istnieją
+            paragraphs = re.split(r'(§\s*\d+\.)', art_text)
+            if len(paragraphs) > 1:
+                formatted_paragraphs = []
+                for i in range(1, len(paragraphs), 2):
+                    para_num = paragraphs[i].strip()
+                    para_text = paragraphs[i+1].strip()
+                    formatted_paragraphs.append(f"{para_num} {para_text}")
+                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
+            else:
                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
+            
            formatted_articles.append({"text": formatted})
    
    return formatted_articles

+
 def main():
    # Inicjalizacja tokenizera
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)