# ably.do/hft.py

import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import re
import json
import PyPDF2
import docx2txt
import pytesseract
from PIL import Image
from collections import defaultdict
from huggingface_hub import login

# Configuration
os.environ['TORCH_USE_CUDA_DSA'] = '1'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Credentials must not live in source control: read the Hugging Face access
# token from the HF_TOKEN environment variable instead of a literal.
# NOTE(review): any token that has ever been committed should be treated as
# leaked and revoked in the Hugging Face account settings.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

class SourceMapper:
    """Two-way mapping between source labels and dense integer indices.

    Indices are assigned in first-seen order starting at 0. Both directions
    are stored in plain dicts that are always updated together, so every
    index returned by ``get_idx`` can be resolved back by ``get_source``.
    """

    def __init__(self):
        # Plain dicts (no default factory): a lookup can never create a
        # forward entry without its matching reverse entry.
        self.source_to_idx = {}
        self.idx_to_source = {}

    def add_source(self, source):
        """Register *source* under the next free index.

        Empty/None sources and already-registered sources are ignored.
        """
        if source and source not in self.source_to_idx:
            idx = len(self.source_to_idx)
            self.source_to_idx[source] = idx
            self.idx_to_source[idx] = source

    def get_idx(self, source) -> int:
        """Return the index of *source*, or -1 for an empty/None source.

        A source seen for the first time is registered (in both directions)
        and receives the next free index.
        """
        if not source:
            return -1
        self.add_source(source)
        return self.source_to_idx[source]

    def get_source(self, idx):
        """Return the label stored for *idx*, or "Unknown" if there is none."""
        return self.idx_to_source.get(idx, "Unknown")

def load_file_catalog(catalog_path):
    """Load the file catalog from a UTF-8 encoded JSON file.

    Args:
        catalog_path: Path of the JSON file to read.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned (and a
        message is printed) when the file cannot be read or parsed, or when
        its top-level value is not a JSON object.
    """
    try:
        with open(catalog_path, 'r', encoding='utf-8') as file:
            catalog = json.load(file)
    except Exception as e:
        # Best effort: a missing or broken catalog must not abort the run.
        print(f"Błąd wczytywania katalogu plików: {str(e)}")
        return {}

    # Callers use dict lookups on the result; a list or scalar at the top
    # level would fail on every lookup, so reject it here.
    if not isinstance(catalog, dict):
        print(
            "Błąd wczytywania katalogu plików: "
            f"oczekiwano obiektu JSON, otrzymano {type(catalog).__name__}"
        )
        return {}
    return catalog

def identify_legal_document(filename, file_catalog):
    """Look up the document type for *filename* in the catalog.

    The file extension is stripped and the remaining name is matched against
    the catalog keys without regard to letter case.

    Args:
        filename: File name (with extension) to classify.
        file_catalog: Mapping of base file name to document type.

    Returns:
        The catalog value for the file, or "Opracowanie własne" when the
        catalog has no entry for it.
    """
    base_name = os.path.splitext(filename)[0].lower()

    # Fast path: the catalog key is already lower-case.
    if base_name in file_catalog:
        return file_catalog[base_name]

    # Fallback: the filename was lower-cased above, so a key written with
    # uppercase letters would otherwise never match.
    for key, doc_type in file_catalog.items():
        if isinstance(key, str) and key.lower() == base_name:
            return doc_type

    return "Opracowanie własne"

def extract_text_from_file(file_path):
    """Extract plain text from a file, dispatching on its extension.

    Handled extensions (case-insensitive): .txt/.md are read as UTF-8,
    .pdf goes through PyPDF2, .doc/.docx through docx2txt, and common image
    formats through pytesseract.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions and whenever extraction fails; a PDF that fails part-way
        yields the text of the pages read so far.
    """
    try:
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if ext in ('.txt', '.md'):
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()

        if ext == '.pdf':
            # Collect per-page text in a list so that pages extracted before
            # a failure are still returned.
            pages = []
            try:
                with open(file_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    for page in reader.pages:
                        pages.append(page.extract_text() or "")
            except Exception as e:
                print(f"Błąd PDF: {str(e)}")
            return "".join(pages)

        if ext in ('.doc', '.docx'):
            # NOTE(review): docx2txt presumably only understands the .docx
            # container; a legacy .doc file would then end up in the outer
            # handler below and yield "" - confirm.
            return docx2txt.process(file_path) or ""

        if ext in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff'):
            # Close the image file handle as soon as OCR is done.
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)

        print(f"Nieobsługiwany format pliku: {ext}")
        return ""
    except Exception as e:
        # Best effort: one unreadable file must not stop the whole scan.
        print(f"Błąd ekstrakcji tekstu: {str(e)}")
        return ""

def _split_articles(text):
    """Split legal text into (heading, body) pairs on "Art. N" headings.

    Returns:
        A tuple ``(fragment_count, pairs)`` where ``fragment_count`` is the
        number of non-empty pieces produced by the split (used only for the
        diagnostic printout) and ``pairs`` is a list of stripped
        ``(heading, body)`` tuples.
    """
    # With a capturing group, re.split returns
    # [preamble, heading1, body1, heading2, body2, ...]. Pair by position
    # in that unfiltered list: dropping empty pieces first would shift every
    # heading/body pair whenever the preamble or a body is empty or not.
    parts = re.split(r'(?i)(Art[\.\s]+\d+[\.\s]?)', text)
    fragment_count = sum(1 for part in parts if part.strip())
    pairs = [
        (heading.strip(), body.strip())
        for heading, body in zip(parts[1::2], parts[2::2])
    ]
    return fragment_count, pairs


def _chunk_text(text, size=512):
    """Collapse whitespace in *text* and cut it into *size*-character chunks."""
    clean_text = re.sub(r'\s+', ' ', text).strip()
    chunks = [clean_text[i:i + size] for i in range(0, len(clean_text), size)]
    return [chunk for chunk in chunks if chunk.strip()]


def prepare_dataset(directory, catalog_path, source_mapper):
    """Walk *directory* and build training examples from every readable file.

    Files whose catalog entry names a document type are split into articles;
    each article with at least 50 characters of body becomes one example
    tagged with the index of its "<document type>, <article heading>" source.
    All other files are cut into 512-character chunks tagged with -1.
    Progress and diagnostics are printed along the way.

    Args:
        directory: Root directory to scan recursively.
        catalog_path: Path of the JSON file catalog.
        source_mapper: Object providing ``add_source`` and ``get_idx``; it is
            updated with every source encountered.

    Returns:
        A list of dicts with the keys "text" and "source_idx".
    """
    file_catalog = load_file_catalog(catalog_path)
    data = []

    print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"\nPrzetwarzanie pliku: {file_path}")

            try:
                text = extract_text_from_file(file_path)
                if not text.strip():
                    print("Pominięto - brak tekstu")
                    continue

                print(f"Długość tekstu: {len(text)} znaków")

                doc_type = identify_legal_document(file, file_catalog)
                print(f"Rozpoznany typ dokumentu: {doc_type}")

                if doc_type != "Opracowanie własne":
                    fragment_count, articles = _split_articles(text)
                    print(f"Znaleziono {fragment_count} fragmentów")

                    for article_number, article_content in articles:
                        # Very short bodies are skipped.
                        if len(article_content) < 50:
                            continue

                        source = f"{doc_type}, {article_number}"
                        source_mapper.add_source(source)
                        data.append({
                            "text": f"{article_number} {article_content}",
                            "source_idx": source_mapper.get_idx(source)
                        })
                else:
                    chunks = _chunk_text(text)
                    data.extend(
                        {"text": chunk, "source_idx": -1} for chunk in chunks
                    )
                    print(f"Dodano {len(chunks)} chunków")

            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Błąd podczas przetwarzania pliku: {str(e)}")
                continue

    print("\nPodsumowanie przygotowania danych:")
    print(f"Łączna liczba przykładów: {len(data)}")
    if data:
        print("Przykładowy wpis:")
        print(json.dumps(data[0], indent=2, ensure_ascii=False))
    else:
        print("BRAK DANYCH - sprawdź diagnostykę powyżej")

    return data

class CustomModel(nn.Module):
    """Causal language model with an additive per-example source embedding.

    The wrapped base model is frozen except for its output embeddings; the
    source embedding table is newly created and trainable. The last row of
    the table is the padding row (all zeros, excluded from gradient updates)
    and is used for examples that carry no source.
    """

    def __init__(self, model_name, config, max_sources=10000):
        """Load the base model and create the source embedding table.

        Args:
            model_name: Identifier passed to
                ``AutoModelForCausalLM.from_pretrained``.
            config: Model configuration; ``config.hidden_size`` sets the
                embedding width.
            max_sources: Number of rows in the source embedding table. The
                last row is reserved as the padding ("no source") row.
        """
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
        # padding_idx=-1 designates the last row as the padding row.
        self.source_embedding = nn.Embedding(max_sources, config.hidden_size, padding_idx=-1)

        # Freeze everything, then re-enable only the output embeddings.
        for param in self.base_model.parameters():
            param.requires_grad = False
        for param in self.base_model.get_output_embeddings().parameters():
            param.requires_grad = True

    def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
        """Run the base model, adding the source embedding when one is given.

        Args:
            input_ids: Token ids for the base model.
            attention_mask: Passed through to the base model.
            labels: Passed through to the base model.
            source_idx: Optional integer tensor of source indices, one per
                example (assumed shape ``(batch,)`` - the embedding is
                broadcast over dimension 1). Negative values mean "no
                source" and contribute a zero vector.
            **kwargs: Passed through to the base model.

        Returns:
            Whatever the base model returns.
        """
        if source_idx is None:
            return self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                **kwargs
            )

        table = self.source_embedding
        # Too-large indices are clamped to the last row as before. Negative
        # indices are the "no source" sentinel and must NOT be clamped to 0
        # (that would hand them the embedding of source 0); route them to
        # the zero padding row instead.
        safe_idx = torch.clamp(source_idx, max=table.num_embeddings - 1)
        no_source = torch.full_like(safe_idx, table.padding_idx)
        safe_idx = torch.where(source_idx < 0, no_source, safe_idx)

        source_embeds = table(safe_idx).unsqueeze(1)
        inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
        return self.base_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

    def generate(self, *args, **kwargs):
        """Delegate generation to the base model (no source embedding is applied)."""
        return self.base_model.generate(*args, **kwargs)

class CustomDataCollator(DataCollatorForLanguageModeling):
    """Collator that batches pre-tokenized examples and keeps ``source_idx``."""

    def torch_call(self, examples):
        """Stack per-example fields into batch tensors.

        Args:
            examples: Sequence of dicts with "input_ids", "attention_mask"
                and "labels" (equal-length sequences across the batch) and
                optionally "source_idx".

        Returns:
            A dict of tensors with the keys "input_ids", "attention_mask",
            "labels" and, when present on the first example, "source_idx".
        """
        # Stack the basic fields.
        input_ids = torch.stack([torch.as_tensor(ex["input_ids"]) for ex in examples])
        attention_mask = torch.stack([torch.as_tensor(ex["attention_mask"]) for ex in examples])
        labels = torch.stack([torch.as_tensor(ex["labels"]) for ex in examples])

        # Exclude padded positions from the loss: -100 is the label value
        # the cross-entropy loss ignores. The attention mask is used rather
        # than the pad token id so that a pad token that doubles as a real
        # token elsewhere in the sequence is still learned.
        labels[attention_mask == 0] = -100

        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

        # Add source_idx if it is present.
        if "source_idx" in examples[0]:
            batch["source_idx"] = torch.stack(
                [torch.as_tensor(ex["source_idx"]) for ex in examples]
            )

        return batch

def main():
    """Build the dataset from ./docs, then fine-tune the model with Trainer.

    Reads the file catalog from "catalog.json", tokenizes every example to a
    fixed length of 512 tokens and trains for three epochs, writing
    checkpoints to "./results". Returns early when no data was found.
    """
    # Local import: only the configuration is needed here, and loading it
    # through AutoConfig avoids instantiating the full model a second time.
    from transformers import AutoConfig

    source_mapper = SourceMapper()
    model_name = "crumb/nano-mistral"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Data preparation
    catalog_path = "catalog.json"
    data = prepare_dataset("docs", catalog_path, source_mapper)

    if not data:
        print("\nBrak danych do treningu!")
        return

    dataset = Dataset.from_dict({k: [d[k] for d in data] for k in data[0]})

    def tokenize_function(examples):
        """Tokenize a batch of texts; labels are a copy of the input ids."""
        # Plain Python lists are returned on purpose: squeezing a tensor
        # batch would drop the batch dimension for a batch of one example
        # and leave the columns with mismatched lengths.
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": [list(ids) for ids in tokenized["input_ids"]],
            "source_idx": examples["source_idx"]
        }

    tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

    model = CustomModel(model_name, AutoConfig.from_pretrained(model_name))
    model.source_mapper = source_mapper
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=10,
        save_strategy="steps",
        save_steps=1000,
        report_to="none",
        # Keep "source_idx" in the batches handed to the collator.
        remove_unused_columns=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)
    )

    print("\nRozpoczęcie treningu...")
    trainer.train()

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
init 2025-02-25 04:03:59 -05:00			`import os`
			`import torch`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`import torch.nn as nn`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling`
			`from datasets import Dataset`
init 2025-02-25 04:03:59 -05:00			`import re`
dodanie import json 2025-02-25 06:21:39 -05:00			`import json`
mod 2025-02-25 15:23:33 -05:00			`import PyPDF2`
			`import docx2txt`
			`import pytesseract`
			`from PIL import Image`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`from collections import defaultdict`
login 2025-02-25 04:45:37 -05:00			`from huggingface_hub import login`

Ten kod działa! 2025-02-25 17:32:39 -05:00			`# Konfiguracja`
			`os.environ['TORCH_USE_CUDA_DSA'] = '1'`
trener mod 2025-02-25 07:17:17 -05:00			`os.environ["TOKENIZERS_PARALLELISM"] = "false"`
mod 2025-02-25 17:17:37 -05:00			`login(token="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")`
mod 2025-02-25 11:24:26 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`class SourceMapper:`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`def __init__(self):`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))`
			`self.idx_to_source = {}`
mod 2025-02-25 18:19:51 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def add_source(self, source):`
			`if source and source not in self.source_to_idx:`
			`idx = self.source_to_idx[source]`
			`self.idx_to_source[idx] = source`
mod 2025-02-25 18:19:51 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def get_idx(self, source):`
			`return self.source_to_idx[source] if source else -1`
mod 2025-02-25 18:19:51 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def get_source(self, idx):`
			`return self.idx_to_source.get(idx, "Unknown")`
ds modification and optimalization 2025-02-25 07:34:04 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def load_file_catalog(catalog_path):`
			`try:`
			`with open(catalog_path, 'r', encoding='utf-8') as file:`
			`return json.load(file)`
			`except Exception as e:`
			`print(f"Błąd wczytywania katalogu plików: {str(e)}")`
			`return {}`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def identify_legal_document(filename, file_catalog):`
			`base_name = os.path.splitext(filename)[0].lower()`
			`return file_catalog.get(base_name, "Opracowanie własne")`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def extract_text_from_file(file_path):`
			`try:`
			`_, ext = os.path.splitext(file_path)`
			`ext = ext.lower()`

			`if ext in ['.txt', '.md']:`
			`with open(file_path, 'r', encoding='utf-8') as file:`
			`return file.read()`
			`elif ext == '.pdf':`
			`text = ""`
			`try:`
			`with open(file_path, 'rb') as file:`
			`reader = PyPDF2.PdfReader(file)`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`for page in reader.pages:`
			`text += page.extract_text() or ""`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`except Exception as e:`
			`print(f"Błąd PDF: {str(e)}")`
			`return text`
			`elif ext in ['.doc', '.docx']:`
			`return docx2txt.process(file_path)`
			`elif ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:`
			`return pytesseract.image_to_string(Image.open(file_path))`
			`else:`
			`print(f"Nieobsługiwany format pliku: {ext}")`
mod 2025-02-25 15:30:01 -05:00			`return ""`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`except Exception as e:`
			`print(f"Błąd ekstrakcji tekstu: {str(e)}")`
			`return ""`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def prepare_dataset(directory, catalog_path, source_mapper):`
			`file_catalog = load_file_catalog(catalog_path)`
			`data = []`

			`print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")`

			`for root, _, files in os.walk(directory):`
			`for file in files:`
			`file_path = os.path.join(root, file)`
			`print(f"\nPrzetwarzanie pliku: {file_path}")`

			`try:`
			`text = extract_text_from_file(file_path)`
			`if not text.strip():`
			`print("Pominięto - brak tekstu")`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`continue`
Ten kod działa! 2025-02-25 17:32:39 -05:00
			`print(f"Długość tekstu: {len(text)} znaków")`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`doc_type = identify_legal_document(file, file_catalog)`
			`print(f"Rozpoznany typ dokumentu: {doc_type}")`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
			`if doc_type != "Opracowanie własne":`
mod 2025-02-25 18:19:51 -05:00			`articles = re.split(r'(?i)(Art[\.\s]+\d+[\.\s]?)', text)`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`articles = [a.strip() for a in articles if a.strip()]`

			`print(f"Znaleziono {len(articles)} fragmentów")`

			`for i in range(0, len(articles)-1, 2):`
			`article_number = articles[i]`
			`article_content = articles[i+1]`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`if len(article_content) < 50:`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`continue`

Ten kod działa! 2025-02-25 17:32:39 -05:00			`source = f"{doc_type}, {article_number}"`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`source_mapper.add_source(source)`
			`data.append({`
mod 2025-02-25 18:19:51 -05:00			`"text": f"{article_number} {article_content}",`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`"source_idx": source_mapper.get_idx(source)`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`})`
mod 2025-02-25 16:21:41 -05:00			`else:`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`clean_text = re.sub(r'\s+', ' ', text).strip()`
			`chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]`
			`chunks = [c for c in chunks if c.strip()]`

TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`for chunk in chunks:`
			`data.append({`
			`"text": chunk,`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`"source_idx": -1`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00			`})`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`print(f"Dodano {len(chunks)} chunków")`

			`except Exception as e:`
			`print(f"Błąd podczas przetwarzania pliku: {str(e)}")`
			`continue`

			`print(f"\nPodsumowanie przygotowania danych:")`
			`print(f"Łączna liczba przykładów: {len(data)}")`
			`if data:`
			`print("Przykładowy wpis:")`
			`print(json.dumps(data[0], indent=2, ensure_ascii=False))`
			`else:`
			`print("BRAK DANYCH - sprawdź diagnostykę powyżej")`
mod 2025-02-25 16:17:13 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`return data`
init 2025-02-25 04:03:59 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`class CustomModel(nn.Module):`
mod 2025-02-25 18:19:51 -05:00			`def __init__(self, model_name, config):`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`super().__init__()`
			`self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)`
			`self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)`

			`for param in self.base_model.parameters():`
			`param.requires_grad = False`
			`for param in self.base_model.get_output_embeddings().parameters():`
			`param.requires_grad = True`

			`def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):`
			`if source_idx is not None:`
			`valid_indices = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings-1)`
			`source_embeds = self.source_embedding(valid_indices).unsqueeze(1)`
			`inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds`
			`return self.base_model(`
			`inputs_embeds=inputs_embeds,`
			`attention_mask=attention_mask,`
			`labels=labels,`
			`**kwargs`
mod 2025-02-25 17:17:07 -05:00			`)`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`return self.base_model(`
			`input_ids=input_ids,`
			`attention_mask=attention_mask,`
			`labels=labels,`
			`**kwargs`
			`)`

			`def generate(self, *args, **kwargs):`
			`return self.base_model.generate(*args, **kwargs)`
mod 2025-02-25 16:50:35 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`class CustomDataCollator(DataCollatorForLanguageModeling):`
			`def torch_call(self, examples):`
mod 2025-02-25 18:19:51 -05:00			`# Przetwórz podstawowe pola`
			`input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])`
			`attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])`
			`labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])`

			`batch = {`
			`"input_ids": input_ids,`
			`"attention_mask": attention_mask,`
			`"labels": labels`
			`}`
Ten kod działa! 2025-02-25 17:32:39 -05:00
mod 2025-02-25 18:19:51 -05:00			`# Dodaj source_idx jeśli istnieje`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`if "source_idx" in examples[0]:`
mod 2025-02-25 18:19:51 -05:00			`source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`batch["source_idx"] = source_idx`

			`return batch`
mod 2025-02-25 16:21:41 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def main():`
			`source_mapper = SourceMapper()`
			`model_name = "crumb/nano-mistral"`
mod 2025-02-25 18:08:31 -05:00			`tokenizer = AutoTokenizer.from_pretrained(model_name)`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`tokenizer.pad_token = tokenizer.eos_token`
mod 2025-02-25 18:19:51 -05:00
			`# Przygotowanie danych`
Ten kod działa!!! 2025-02-25 17:33:52 -05:00			`catalog_path = "catalog.json"`
			`data = prepare_dataset("docs", catalog_path, source_mapper)`
Ten kod działa! 2025-02-25 17:32:39 -05:00
			`if not data:`
			`print("\nBrak danych do treningu!")`
			`return`
mod 2025-02-25 16:50:35 -05:00
mod 2025-02-25 18:19:51 -05:00			`#dataset = Dataset.from_list(data)`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`dataset = Dataset.from_dict({k: [d[k] for d in data] for k in data[0]})`
mod dataset 2025-02-25 17:06:17 -05:00
mod 2025-02-25 18:19:51 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`def tokenize_function(examples):`
			`tokenized = tokenizer(`
			`examples["text"],`
			`truncation=True,`
			`padding="max_length",`
			`max_length=512,`
			`return_tensors="pt"`
mod 2025-02-25 16:50:35 -05:00			`)`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`return {`
			`"input_ids": tokenized["input_ids"].squeeze(),`
			`"attention_mask": tokenized["attention_mask"].squeeze(),`
			`"labels": tokenized["input_ids"].squeeze().clone(),`
mod 2025-02-25 18:19:51 -05:00			`"source_idx": examples["source_idx"] # Dodano bez konwersji do tensora`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`}`
mod 2025-02-25 16:50:35 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
mod 2025-02-25 18:19:51 -05:00			`model = CustomModel(model_name, AutoModelForCausalLM.from_pretrained(model_name).config)`
			`model.source_mapper = source_mapper`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`
			`model.to(device)`
TEN KOD DZIAŁA 2025-02-25 16:54:44 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`training_args = TrainingArguments(`
			`output_dir="./results",`
			`num_train_epochs=3,`
			`per_device_train_batch_size=2,`
			`gradient_accumulation_steps=4,`
			`learning_rate=2e-5,`
			`fp16=torch.cuda.is_available(),`
			`logging_steps=10,`
			`save_strategy="steps",`
			`save_steps=1000,`
			`report_to="none",`
			`remove_unused_columns=False`
			`)`
mod 2025-02-25 17:17:07 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`trainer = Trainer(`
			`model=model,`
			`args=training_args,`
			`train_dataset=tokenized_dataset,`
			`data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)`
			`)`
mod 2025-02-25 17:17:07 -05:00
Ten kod działa! 2025-02-25 17:32:39 -05:00			`print("\nRozpoczęcie treningu...")`
			`trainer.train()`
mod 2025-02-25 14:38:44 -05:00
			`if __name__ == "__main__":`
Ten kod działa! 2025-02-25 17:32:39 -05:00			`main()`