This commit is contained in:
l.gabrysiak 2025-02-26 09:41:42 +01:00
parent 2cdf954b50
commit 61c45041aa
1 changed file with 17 additions and 4 deletions

19
gpt.py
View File

@@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
text = f.read() text = f.read()
# Wydziel artykuły za pomocą wyrażenia regularnego # Wydziel artykuły za pomocą wyrażenia regularnego
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL) articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
formatted_articles = [] formatted_articles = []
for article in articles: for article in articles:
# Usuń zbędne białe znaki # Usuń zbędne białe znaki
article = ' '.join(article.strip().split()) article = ' '.join(article.strip().split())
# Wydziel numer artykułu # Wydziel numer artykułu i treść
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL) art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
if art_match: if art_match:
art_number = art_match.group(1) art_number = art_match.group(1)
art_text = art_match.group(2) art_text = art_match.group(2)
# Podziel na paragrafy, jeśli istnieją
paragraphs = re.split(r'\s*\d+\.)', art_text)
if len(paragraphs) > 1:
formatted_paragraphs = []
for i in range(1, len(paragraphs), 2):
para_num = paragraphs[i].strip()
para_text = paragraphs[i+1].strip()
formatted_paragraphs.append(f"{para_num} {para_text}")
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
else:
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}" formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
formatted_articles.append({"text": formatted}) formatted_articles.append({"text": formatted})
return formatted_articles return formatted_articles
def main(): def main():
# Inicjalizacja tokenizera # Inicjalizacja tokenizera
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)