From b4957ee652ad5ace74150bd2b151b5950f628eb0 Mon Sep 17 00:00:00 2001
From: "l.gabrysiak"
Date: Wed, 26 Feb 2025 00:19:51 +0100
Subject: [PATCH] mod

---
 gpt.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hft.py | 75 ++++++++++++++++------------------------------
 2 files changed, 120 insertions(+), 50 deletions(-)
 create mode 100644 gpt.py

diff --git a/gpt.py b/gpt.py
new file mode 100644
index 0000000..9b43685
--- /dev/null
+++ b/gpt.py
@@ -0,0 +1,95 @@
+import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from datasets import Dataset
+from collections import defaultdict
+
+# Configuration
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+MODEL_NAME = "gpt2"  # Temporarily using a smaller model for testing
+SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]
+
+class SourceMapper:
+    def __init__(self):
+        self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
+        self.idx_to_source = {}
+
+    def add_source(self, source):
+        if source not in self.source_to_idx:
+            idx = self.source_to_idx[source]
+            self.idx_to_source[idx] = source
+
+def prepare_simple_dataset():
+    # Sample data - replace with real data
+    return [
+        {
+            "text": "[CITATION_START] Kodeks Pracy, Art. 1 [CITATION_END] Tekst artykułu...",
+            "source_idx": 0
+        },
+        {
+            "text": "[CITATION_START] Kodeks Pracy, Art. 2 [CITATION_END] Inny tekst...",
+            "source_idx": 1
+        }
+    ]
+
+def main():
+    # Initialization
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Data preparation
+    source_mapper = SourceMapper()
+    data = prepare_simple_dataset()
+
+    # Build the dataset
+    dataset = Dataset.from_dict({
+        "text": [d["text"] for d in data],
+        "source_idx": [d["source_idx"] for d in data]
+    })
+
+    # Tokenization
+    def tokenize_function(examples):
+        tokenized = tokenizer(
+            examples["text"],
+            truncation=True,
+            padding="max_length",
+            max_length=128,
+            return_tensors="pt"
+        )
+        return {
+            "input_ids": tokenized["input_ids"].squeeze(),
+            "attention_mask": tokenized["attention_mask"].squeeze(),
+            "labels": tokenized["input_ids"].squeeze().clone(),
+        }
+
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+    # Model
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Training configuration
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=1,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=1,
+        learning_rate=2e-5,
+        logging_steps=1,
+        remove_unused_columns=False
+    )
+
+    # Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset,
+    )
+
+    # Start training
+    print("Starting training...")
+    trainer.train()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/hft.py b/hft.py
index cc0df26..1f2a4e7 100644
--- a/hft.py
+++ b/hft.py
@@ -17,22 +17,19 @@
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 login(token="hf_[REDACTED]")
 
-CITATION_START = "▌▌CITATION_START"
-CITATION_END = "▌▌CITATION_END"
-
 class SourceMapper:
     def __init__(self):
         self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
         self.idx_to_source = {}
-
+
     def add_source(self, source):
         if source and source not in self.source_to_idx:
             idx = self.source_to_idx[source]
             self.idx_to_source[idx] = source
-
+
     def get_idx(self, source):
         return self.source_to_idx[source] if source else -1
-
+
     def get_source(self, idx):
         return self.idx_to_source.get(idx, "Unknown")
@@ -99,26 +96,8 @@ def prepare_dataset(directory, catalog_path, source_mapper):
         doc_type = identify_legal_document(file, file_catalog)
         print(f"Recognized document type: {doc_type}")
 
-        current_section = ""
-        current_chapter = ""
-
-        structure_matches = re.finditer(
-            r'(DZIAŁ [A-ZĄĆĘŁŃÓŚŹŻ]+)\n+(.*?)\n(?=Art\.|Rozdział|DZIAŁ|$)'
-            r'|(Rozdział [A-ZĄĆĘŁŃÓŚŹŻ]+)\n+(.*?)\n(?=Art\.|DZIAŁ|$)',
-            text
-        )
-        for match in structure_matches:
-            if match.group(1):
-                current_section = f"{match.group(1)} - {match.group(2).strip()}"
-                current_chapter = ""
-            else:
-                current_chapter = f"{match.group(3)} - {match.group(4).strip()}"
-
         if doc_type != "Opracowanie własne":
-            articles = re.split(
-                r'(?i)(Art[\.\s]+\d+[a-z]*(?:[\s§\.-]\d+)*)\.?\s*',
-                text
-            )
+            articles = re.split(r'(?i)(Art[\.\s]+\d+[\.\s]?)', text)
             articles = [a.strip() for a in articles if a.strip()]
             print(f"Found {len(articles)} fragments")
 
@@ -130,20 +109,10 @@ def prepare_dataset(directory, catalog_path, source_mapper):
             if len(article_content) < 50:
                 continue
 
-            citation_block = (
-                f"{CITATION_START}\n"
-                f"Dokument: {doc_type}\n"
-                f"Artykuł: {article_number}\n"
-                f"Sekcja: {current_section}\n"
-                f"Rozdział: {current_chapter}\n"
-                f"{CITATION_END}\n"
-                f"{article_content}"
-            )
-
             source = f"{doc_type}, {article_number}"
             source_mapper.add_source(source)
             data.append({
-                "text": citation_block,
+                "text": f"{article_number} {article_content}",
                 "source_idx": source_mapper.get_idx(source)
             })
         else:
@@ -173,15 +142,11 @@ def prepare_dataset(directory, catalog_path, source_mapper):
     return data
 
 class CustomModel(nn.Module):
-    def __init__(self, model_name, tokenizer):
+    def __init__(self, model_name, config):
         super().__init__()
-        config = AutoModelForCausalLM.from_pretrained(model_name).config
         self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
         self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)
 
-        tokenizer.add_special_tokens({'additional_special_tokens': [CITATION_START, CITATION_END]})
-        self.base_model.resize_token_embeddings(len(tokenizer))
-
         for param in self.base_model.parameters():
             param.requires_grad = False
         for param in self.base_model.get_output_embeddings().parameters():
@@ -210,10 +175,20 @@ class CustomModel(nn.Module):
 
 class CustomDataCollator(DataCollatorForLanguageModeling):
     def torch_call(self, examples):
-        batch = super().torch_call(examples)
+        # Process the basic fields
+        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
+        attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
+        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
+        batch = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels
+        }
+
+        # Add source_idx if present
         if "source_idx" in examples[0]:
-            source_idx = torch.stack([ex["source_idx"] for ex in examples])
+            source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
             batch["source_idx"] = source_idx
 
         return batch
@@ -221,10 +196,10 @@ class CustomDataCollator(DataCollatorForLanguageModeling):
 def main():
     source_mapper = SourceMapper()
     model_name = "crumb/nano-mistral"
-
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
-
+
+    # Data preparation
     catalog_path = "catalog.json"
     data = prepare_dataset("docs", catalog_path, source_mapper)
@@ -232,8 +207,10 @@ def main():
         print("\nNo data for training!")
         return
 
+    #dataset = Dataset.from_list(data)
     dataset = Dataset.from_dict({k: [d[k] for d in data] for k in data[0]})
 
+
     def tokenize_function(examples):
         tokenized = tokenizer(
             examples["text"],
@@ -242,19 +219,17 @@ def main():
             max_length=512,
             return_tensors="pt"
         )
-
-        source_idx = torch.tensor(examples["source_idx"], dtype=torch.long)
-
         return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
            "labels": tokenized["input_ids"].squeeze().clone(),
-            "source_idx": source_idx
+            "source_idx": examples["source_idx"]  # Added without converting to a tensor
         }
 
     tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
 
-    model = CustomModel(model_name, tokenizer)
+    model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)
+    model.source_mapper = source_mapper
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
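
A note on the `SourceMapper` that both files now share: because `source_to_idx` is a `defaultdict` whose factory hands out the next free index, merely looking a source up via `get_idx` will silently allocate a new index without recording it in `idx_to_source`. The standalone check below uses the class exactly as it appears in the hft.py hunk above; only the sample source strings are illustrative.

```python
from collections import defaultdict

class SourceMapper:
    def __init__(self):
        # The defaultdict factory allocates the next free index on first access.
        self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))
        self.idx_to_source = {}

    def add_source(self, source):
        if source and source not in self.source_to_idx:
            idx = self.source_to_idx[source]   # first access allocates the index
            self.idx_to_source[idx] = source

    def get_idx(self, source):
        return self.source_to_idx[source] if source else -1

    def get_source(self, idx):
        return self.idx_to_source.get(idx, "Unknown")

mapper = SourceMapper()
mapper.add_source("Kodeks Pracy, Art. 1")
print(mapper.get_idx("Kodeks Pracy, Art. 1"))    # 0
print(mapper.get_source(0))                      # Kodeks Pracy, Art. 1
print(mapper.get_idx(None))                      # -1, the "no source" sentinel

# Caveat: get_idx on a source never passed to add_source still allocates
# an index via the defaultdict, but idx_to_source is not updated:
print(mapper.get_idx("Kodeks Cywilny, Art. 5"))  # 1 (silently allocated)
print(mapper.get_source(1))                      # Unknown
```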
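The hunks above change `CustomModel.__init__` and the collator but leave `CustomModel.forward` outside the patch, so how `source_idx` actually conditions the frozen base model is not visible here. The sketch below is one plausible shape for that forward pass, not the repository's implementation: it assumes the collator delivers `source_idx` as a `(batch,)` tensor and adds a per-example source embedding to the token embeddings. It also works around the `-1` sentinel from `get_idx`, since a raw `-1` index fails at `nn.Embedding` lookup time even though `padding_idx=-1` is accepted by the constructor (it is normalized to the last row).

```python
import torch

# Hypothetical forward for CustomModel; a sketch only, assuming the
# self.base_model and self.source_embedding defined in __init__ above.
def forward(self, input_ids, attention_mask=None, labels=None, source_idx=None):
    # Token embeddings from the frozen base model.
    inputs_embeds = self.base_model.get_input_embeddings()(input_ids)

    if source_idx is not None:
        # get_idx() uses -1 for "no source"; clamp before the lookup and
        # zero those rows so the sentinel contributes nothing.
        known = (source_idx >= 0).unsqueeze(-1).to(inputs_embeds.dtype)   # (batch, 1)
        src = self.source_embedding(source_idx.clamp(min=0)) * known     # (batch, hidden)
        inputs_embeds = inputs_embeds + src.unsqueeze(1)                  # broadcast over seq

    # Delegate loss computation to the base causal LM head.
    return self.base_model(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        labels=labels,
    )
```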