This commit is contained in:
l.gabrysiak 2025-02-26 09:41:42 +01:00
parent 2cdf954b50
commit 61c45041aa
1 changed files with 17 additions and 4 deletions

21
gpt.py
View File

@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
text = f.read()
# Wydziel artykuły za pomocą wyrażenia regularnego
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL)
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
formatted_articles = []
for article in articles:
# Usuń zbędne białe znaki
article = ' '.join(article.strip().split())
# Wydziel numer artykułu
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL)
# Wydziel numer artykułu i treść
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
if art_match:
art_number = art_match.group(1)
art_text = art_match.group(2)
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
# Podziel na paragrafy, jeśli istnieją
paragraphs = re.split(r'\s*\d+\.)', art_text)
if len(paragraphs) > 1:
formatted_paragraphs = []
for i in range(1, len(paragraphs), 2):
para_num = paragraphs[i].strip()
para_text = paragraphs[i+1].strip()
formatted_paragraphs.append(f"{para_num} {para_text}")
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
else:
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
formatted_articles.append({"text": formatted})
return formatted_articles
def main():
# Inicjalizacja tokenizera
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)