mod gpt
This commit is contained in:
parent
2cdf954b50
commit
61c45041aa
19
gpt.py
19
gpt.py
|
|
@ -15,23 +15,36 @@ def prepare_dataset_from_file(file_path):
|
|||
text = f.read()
|
||||
|
||||
# Wydziel artykuły za pomocą wyrażenia regularnego
|
||||
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z', text, flags=re.DOTALL)
|
||||
articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
|
||||
|
||||
formatted_articles = []
|
||||
for article in articles:
|
||||
# Usuń zbędne białe znaki
|
||||
article = ' '.join(article.strip().split())
|
||||
|
||||
# Wydziel numer artykułu
|
||||
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.\s*(.*)', article, re.DOTALL)
|
||||
# Wydziel numer artykułu i treść
|
||||
art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
|
||||
if art_match:
|
||||
art_number = art_match.group(1)
|
||||
art_text = art_match.group(2)
|
||||
|
||||
# Podziel na paragrafy, jeśli istnieją
|
||||
paragraphs = re.split(r'(§\s*\d+\.)', art_text)
|
||||
if len(paragraphs) > 1:
|
||||
formatted_paragraphs = []
|
||||
for i in range(1, len(paragraphs), 2):
|
||||
para_num = paragraphs[i].strip()
|
||||
para_text = paragraphs[i+1].strip()
|
||||
formatted_paragraphs.append(f"{para_num} {para_text}")
|
||||
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
|
||||
else:
|
||||
formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
|
||||
|
||||
formatted_articles.append({"text": formatted})
|
||||
|
||||
return formatted_articles
|
||||
|
||||
|
||||
def main():
|
||||
# Inicjalizacja tokenizera
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
|
|
|||
Loading…
Reference in New Issue