mod gpt
This commit is contained in:
parent
2cdf954b50
commit
61c45041aa
21
gpt.py
21
gpt.py
|
|
@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
# Wydziel artykuły za pomocą wyrażenia regularnego
|
# Wydziel artykuły za pomocą wyrażenia regularnego
|
||||||
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL)
|
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
|
||||||
|
|
||||||
formatted_articles = []
|
formatted_articles = []
|
||||||
for article in articles:
|
for article in articles:
|
||||||
# Usuń zbędne białe znaki
|
# Usuń zbędne białe znaki
|
||||||
article = ' '.join(article.strip().split())
|
article = ' '.join(article.strip().split())
|
||||||
|
|
||||||
# Wydziel numer artykułu
|
# Wydziel numer artykułu i treść
|
||||||
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL)
|
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
|
||||||
if art_match:
|
if art_match:
|
||||||
art_number = art_match.group(1)
|
art_number = art_match.group(1)
|
||||||
art_text = art_match.group(2)
|
art_text = art_match.group(2)
|
||||||
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
|
|
||||||
|
# Podziel na paragrafy, jeśli istnieją
|
||||||
|
paragraphs = re.split(r'(§\s*\d+\.)', art_text)
|
||||||
|
if len(paragraphs) > 1:
|
||||||
|
formatted_paragraphs = []
|
||||||
|
for i in range(1, len(paragraphs), 2):
|
||||||
|
para_num = paragraphs[i].strip()
|
||||||
|
para_text = paragraphs[i+1].strip()
|
||||||
|
formatted_paragraphs.append(f"{para_num} {para_text}")
|
||||||
|
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
|
||||||
|
else:
|
||||||
|
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
|
||||||
|
|
||||||
formatted_articles.append({"text": formatted})
|
formatted_articles.append({"text": formatted})
|
||||||
|
|
||||||
return formatted_articles
|
return formatted_articles
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Inicjalizacja tokenizera
|
# Inicjalizacja tokenizera
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue