# ably.do/hft.py

import os
import torch
import torch.nn as nn
from transformers import GPTNeoForCausalLM, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from PIL import Image
import re
import pytesseract
import docx2txt
import PyPDF2
import json
from torch.amp import autocast
from collections import defaultdict
from huggingface_hub import login

# Set process-wide configuration first so it is already in place before the
# first CUDA call below.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is presumably read when the CUDA
# allocator is first initialised - confirm it does not have to be set before
# `import torch` for the installed PyTorch version.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

torch.cuda.empty_cache()

# Log in to the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded on this line. A token that
# has been committed to source control must be treated as leaked: revoke it
# and supply a fresh one through the HF_TOKEN environment variable instead.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

def free_memory():
    """Release cached CUDA memory held by PyTorch.

    Does nothing when CUDA is not available, so the function is safe to call
    on CPU-only machines.
    """
    # The original called torch.empty_cache('cuda') / torch.ipc_collect('cuda'),
    # which are not attributes of the top-level torch module; the functions
    # live in torch.cuda and take no arguments.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

class SourceMapper:
    """Two-way mapping between source labels and consecutive integer indices.

    Indices start at 0 and are handed out in registration order. Falsy labels
    (``None``, ``""``) are never registered and map to ``-1``.
    """

    def __init__(self):
        # Plain dicts are used on purpose: with the previous defaultdict, a
        # mere lookup of an unknown label silently created a forward entry
        # without a matching reverse entry.
        self.source_to_idx = {}
        self.idx_to_source = {}

    def add_source(self, source):
        """Register *source* if it is truthy and not already known."""
        if source and source not in self.source_to_idx:
            idx = len(self.source_to_idx)
            self.source_to_idx[source] = idx
            self.idx_to_source[idx] = source

    def get_idx(self, source):
        """Return the index of *source*, or ``-1`` for a falsy label.

        An unknown label is registered on the fly (it receives the next free
        index, as before), but now in both directions so that
        ``get_source(get_idx(label))`` returns ``label``.
        """
        if not source:
            return -1
        self.add_source(source)
        return self.source_to_idx[source]

    def get_source(self, idx):
        """Return the label stored for *idx*, or ``"Unknown"`` if there is none."""
        return self.idx_to_source.get(idx, "Unknown")

def load_file_catalog(catalog_path):
    """Read the UTF-8 encoded JSON file at *catalog_path* and return its parsed content."""
    with open(catalog_path, encoding='utf-8') as catalog_file:
        raw_json = catalog_file.read()
    return json.loads(raw_json)

def identify_legal_document(filename, file_catalog):
    """Look up *filename* in *file_catalog*.

    Returns the catalogued value, or the label "Opracowanie własne" when the
    file name has no entry.
    """
    fallback_label = "Opracowanie własne"
    return file_catalog.get(filename, fallback_label)

def extract_text_from_file(file_path):
    """Extract plain text from the file at *file_path*.

    The handler is chosen by the lower-cased file extension:

    * ``.txt`` / ``.md`` - read as UTF-8 text,
    * ``.pdf`` - text of every page, concatenated (PyPDF2),
    * ``.doc`` / ``.docx`` - passed to ``docx2txt.process``,
    * ``.jpg`` / ``.jpeg`` / ``.png`` / ``.bmp`` / ``.tiff`` - OCR through
      pytesseract.

    Any other extension yields an empty string. Errors raised by the
    underlying readers are not caught here.
    """
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    if ext in ('.txt', '.md'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    if ext == '.pdf':
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() may give back nothing for a page; treat that as
            # an empty string. join() avoids repeated string re-allocation.
            return "".join(page.extract_text() or "" for page in reader.pages)

    if ext in ('.doc', '.docx'):
        # NOTE(review): legacy binary .doc files are routed to docx2txt as
        # well - confirm that it can actually read them.
        return docx2txt.process(file_path)

    if ext in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff'):
        # Open the image in a context manager so the underlying file handle
        # is closed once OCR has finished.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    return ""

def prepare_dataset(directory, catalog_path, source_mapper):
    """Walk *directory* and turn every readable file into training records.

    Each record is a dict with the keys ``"text"`` and ``"source_idx"``.

    * Files listed in the catalog loaded from *catalog_path* are split on
      headings of the form ``Art. <number>.``; every article becomes one
      record whose source ("<catalog value>, <heading>") is registered in
      *source_mapper*. Text in front of the first heading is not emitted.
    * All other files are cut into consecutive 512-character pieces with
      ``source_idx`` set to ``-1``.

    Files for which no text could be extracted are skipped.
    Returns the list of records.
    """
    catalog = load_file_catalog(catalog_path)
    records = []

    for folder, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            content = extract_text_from_file(os.path.join(folder, filename))
            if not content:
                continue

            doc_type = identify_legal_document(filename, catalog)

            if doc_type == "Opracowanie własne":
                # Uncatalogued material: fixed-size character windows.
                for start in range(0, len(content), 512):
                    records.append({
                        "text": content[start:start + 512],
                        "source_idx": -1
                    })
                continue

            # With one capturing group re.split() alternates
            # [preamble, heading, body, heading, body, ...], so headings sit
            # at odd positions and each is followed by its body.
            parts = re.split(r'(Art\.\s+\d+\.)', content)
            for heading, body in zip(parts[1::2], parts[2::2]):
                article_number = heading.strip()
                article_content = body.strip()
                source = f"{doc_type}, {article_number}"
                source_mapper.add_source(source)
                records.append({
                    "text": f"{article_number} {article_content}",
                    "source_idx": source_mapper.get_idx(source)
                })

    return records

def tokenize_function(examples):
    """Tokenize a batch of records for causal-LM training.

    Uses the module-level ``tokenizer``. Every text is truncated / padded to
    512 tokens. ``labels`` is a copy of ``input_ids`` in which all padding
    positions are replaced by -100, and ``source_idx`` is carried over from
    the input batch unchanged.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    labels = tokenized["input_ids"].clone()
    # Exclude padding from the loss: -100 is the label value the Transformers
    # language-modelling loss ignores. The attention mask is used rather than
    # a comparison with the pad id because this script sets the pad token to
    # the EOS token, so the ids alone cannot tell padding from a real EOS.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    tokenized["source_idx"] = examples["source_idx"]
    return tokenized

def custom_collate_fn(batch):
    """Collate a list of per-example dicts into one dict of batched tensors.

    ``input_ids``, ``attention_mask`` and ``labels`` are converted to tensors
    and stacked along a new first dimension. ``source_idx`` becomes a 1-D
    int64 tensor; examples without that key contribute ``-1``.
    """
    def _stack(field):
        return torch.stack([torch.tensor(sample[field]) for sample in batch])

    collated = {
        field: _stack(field)
        for field in ("input_ids", "attention_mask", "labels")
    }
    collated["source_idx"] = torch.tensor(
        [sample.get("source_idx", -1) for sample in batch],
        dtype=torch.long,
    )
    return collated

class CustomModel(GPTNeoForCausalLM):
    """GPT-Neo causal language model conditioned on a per-example source index.

    A learned "source" vector of size ``config.hidden_size`` is added to the
    embedding of every token of an example before the transformer runs. The
    last row of the source table is the padding row and stands for "no
    source"; negative indices (this script uses -1) are mapped onto it.

    Problems in the previous implementation that this version addresses:

    * the source vector (hidden size) was added in place to the logits
      (vocabulary size), which is a shape mismatch, and the addition happened
      after the loss had already been computed inside ``super().forward``;
    * ``source_idx == -1`` was passed straight to ``nn.Embedding``, which
      does not accept negative lookup indices;
    * ``torch.amp.autocast()`` was called without its required
      ``device_type`` argument. Mixed precision is left to the caller here
      (this script trains with ``fp16=True``).
    """

    def __init__(self, config):
        super().__init__(config)
        # The table size may be overridden with an optional ``num_sources``
        # attribute on the config; 1000 remains the default. Valid source
        # indices are 0 .. num_sources - 2, the last row is the padding row.
        num_sources = getattr(config, "num_sources", 1000)
        self.source_embedding = nn.Embedding(
            num_embeddings=num_sources,
            embedding_dim=config.hidden_size,
            padding_idx=num_sources - 1
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
        """Run the language model, optionally conditioned on ``source_idx``.

        Args:
            input_ids, attention_mask, labels: forwarded to
                ``GPTNeoForCausalLM.forward``.
            source_idx: optional integer tensor with one source index per
                example (assumes shape ``(batch,)`` - TODO confirm against
                callers). Negative values mean "no source". When omitted, the
                call is identical to the base model's forward.
            **kwargs: forwarded to ``GPTNeoForCausalLM.forward``.

        Returns:
            Whatever ``GPTNeoForCausalLM.forward`` returns.
        """
        if source_idx is None:
            return super().forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                **kwargs
            )

        inputs_embeds = kwargs.pop("inputs_embeds", None)
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        # Redirect "no source" markers to the padding row.
        pad_row = self.source_embedding.padding_idx
        safe_idx = source_idx.masked_fill(source_idx < 0, pad_row)

        # One vector per example; unsqueeze(1) inserts a dimension after the
        # batch dimension so the vector is broadcast across that example's
        # token embeddings.
        source_embeds = self.source_embedding(safe_idx).unsqueeze(1)
        inputs_embeds = inputs_embeds + source_embeds.to(inputs_embeds.dtype)

        # The base forward takes either input_ids or inputs_embeds, so only
        # the embeddings are passed on.
        return super().forward(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

source_mapper = SourceMapper()
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token so fixed-length padding can be
# applied during tokenization.
tokenizer.pad_token = tokenizer.eos_token

# Build the corpus from the "files" directory and tokenize it in batches.
data = prepare_dataset("files", "file_catalog.json", source_mapper)
dataset = Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

model = CustomModel.from_pretrained(model_name)
# Take the configuration from the model that was just loaded. Previously the
# whole checkpoint was loaded a second time through AutoModelForCausalLM only
# to read its .config, and the result was never used.
config = model.config
model.config.gradient_checkpointing = True
model.config.use_cache = False
model.resize_token_embeddings(len(tokenizer))
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_collate_fn
)

trainer.train()

free_memory()

# Answer-generation helper
def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
    """Generate an answer to *question* and append a source line.

    Args:
        question: prompt text; it is truncated to 512 tokens.
        model: the language model; must provide ``generate``, ``device`` and
            a ``source_embedding`` table.
        tokenizer: tokenizer matching *model*.
        source_mapper: object whose ``get_source(idx)`` returns a label.
        max_length: ``max_length`` value handed to ``model.generate``.

    Returns:
        The decoded sequence followed by a blank line and "Źródło: <label>".

    NOTE: the source is still a placeholder. It is always looked up for the
    last row of the source-embedding table; it is not derived from the
    generated text.
    """
    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
    # The prompt tensors are created on the CPU; move them to the model's
    # device so generation also works when the model lives on a GPU.
    inputs = inputs.to(model.device)

    # Make sure inference-time behaviour (e.g. no dropout) is active; the
    # model may still be in training mode when this is called after training.
    model.eval()

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        return_dict_in_generate=True,
        output_scores=True,
        pad_token_id=tokenizer.pad_token_id,
    )

    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    # Temporary solution: always use the last index of the source table.
    source_idx = model.source_embedding.weight.shape[0] - 1
    source = source_mapper.get_source(source_idx)

    return f"{answer}\n\nŹródło: {source if source else 'Opracowanie własne'}"

# Example usage: ask one question and print the generated answer with its source line.
question = "Ile dni urlopu przysługuje pracownikowi?"
answer = generate_answer(
    question=question,
    model=model,
    tokenizer=tokenizer,
    source_mapper=source_mapper,
)
print(answer)
init 2025-02-25 04:03:59 -05:00			`import os`
			`import torch`
			`import torch.nn as nn`
mod 2025-02-25 08:56:17 -05:00			`from transformers import GPTNeoForCausalLM, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM`
dataset update 2025-02-25 06:25:02 -05:00			`from datasets import Dataset`
init 2025-02-25 04:03:59 -05:00			`from PIL import Image`
			`import re`
			`import pytesseract`
			`import docx2txt`
			`import PyPDF2`
dodanie import json 2025-02-25 06:21:39 -05:00			`import json`
mod 2025-02-25 09:11:33 -05:00			`from torch.amp import autocast`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`from collections import defaultdict`
login 2025-02-25 04:45:37 -05:00			`from huggingface_hub import login`

mod 2025-02-25 07:46:35 -05:00			`torch.cuda.empty_cache()`

mod 2025-02-25 08:56:17 -05:00			`# Logowanie do Hugging Face Hub`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`login(token="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")`
trener mod 2025-02-25 07:17:17 -05:00			`os.environ["TOKENIZERS_PARALLELISM"] = "false"`
mod 2025-02-25 09:02:36 -05:00			`os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"`

			`def free_memory():`
mod 2025-02-25 09:11:33 -05:00			`torch.empty_cache('cuda')`
			`torch.ipc_collect('cuda')`
login 2025-02-25 04:45:37 -05:00
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`class SourceMapper:`
			`def __init__(self):`
			`self.source_to_idx = defaultdict(lambda: len(self.source_to_idx))`
			`self.idx_to_source = {}`

			`def add_source(self, source):`
			`if source and source not in self.source_to_idx:`
			`idx = self.source_to_idx[source]`
			`self.idx_to_source[idx] = source`

			`def get_idx(self, source):`
			`return self.source_to_idx[source] if source else -1`

			`def get_source(self, idx):`
			`return self.idx_to_source.get(idx, "Unknown")`

init 2025-02-25 04:03:59 -05:00			`def load_file_catalog(catalog_path):`
			`with open(catalog_path, 'r', encoding='utf-8') as file:`
			`return json.load(file)`

			`def identify_legal_document(filename, file_catalog):`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`return file_catalog.get(filename, "Opracowanie własne")`
init 2025-02-25 04:03:59 -05:00
			`def extract_text_from_file(file_path):`
			`_, ext = os.path.splitext(file_path)`
			`ext = ext.lower()`

			`if ext in ['.txt', '.md']:`
			`with open(file_path, 'r', encoding='utf-8') as file:`
			`return file.read()`
			`elif ext == '.pdf':`
			`text = ""`
			`with open(file_path, 'rb') as file:`
			`reader = PyPDF2.PdfReader(file)`
			`for page in reader.pages:`
mod 2025-02-25 09:02:36 -05:00			`text += page.extract_text() or ""`
init 2025-02-25 04:03:59 -05:00			`return text`
			`elif ext in ['.doc', '.docx']:`
			`return docx2txt.process(file_path)`
			`elif ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:`
			`return pytesseract.image_to_string(Image.open(file_path))`
			`else:`
			`return ""`

ds modification and optimalization 2025-02-25 07:34:04 -05:00			`def prepare_dataset(directory, catalog_path, source_mapper):`
init 2025-02-25 04:03:59 -05:00			`file_catalog = load_file_catalog(catalog_path)`
			`data = []`
ds modification and optimalization 2025-02-25 07:34:04 -05:00
init 2025-02-25 04:03:59 -05:00			`for root, _, files in os.walk(directory):`
			`for file in files:`
			`file_path = os.path.join(root, file)`
			`text = extract_text_from_file(file_path)`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`if not text:`
			`continue`

			`doc_type = identify_legal_document(file, file_catalog)`
			`if doc_type != "Opracowanie własne":`
mod 2025-02-25 09:02:36 -05:00			`articles = re.split(r'(Art\.\s+\d+\.)', text)`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`for i in range(1, len(articles), 2):`
			`article_number = articles[i].strip()`
			`article_content = articles[i+1].strip() if i+1 < len(articles) else ""`
			`source = f"{doc_type}, {article_number}"`
			`source_mapper.add_source(source)`

			`data.append({`
			`"text": f"{article_number} {article_content}",`
			`"source_idx": source_mapper.get_idx(source)`
			`})`
			`else:`
			`chunks = [text[i:i+512] for i in range(0, len(text), 512)]`
			`for chunk in chunks:`
			`data.append({`
			`"text": chunk,`
mod 2025-02-25 08:56:17 -05:00			`"source_idx": -1`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`})`
init 2025-02-25 04:03:59 -05:00			`return data`

			`def tokenize_function(examples):`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`tokenized = tokenizer(`
			`examples["text"],`
			`truncation=True,`
			`padding="max_length",`
			`max_length=512,`
			`return_tensors="pt"`
			`)`
			`tokenized["labels"] = tokenized["input_ids"].clone()`
mod 2025-02-25 07:42:51 -05:00			`tokenized["source_idx"] = examples["source_idx"]`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`return tokenized`
init 2025-02-25 04:03:59 -05:00
mod 2025-02-25 07:40:23 -05:00			`def custom_collate_fn(batch):`
			`input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])`
			`attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])`
			`labels = torch.stack([torch.tensor(b["labels"]) for b in batch])`
mod 2025-02-25 07:42:51 -05:00			`source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)`
mod 2025-02-25 07:40:23 -05:00			`return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}`

mod 2025-02-25 08:56:17 -05:00			`class CustomModel(GPTNeoForCausalLM):`
init 2025-02-25 04:03:59 -05:00			`def __init__(self, config):`
			`super().__init__(config)`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`self.source_embedding = nn.Embedding(`
mod 2025-02-25 08:29:02 -05:00			`num_embeddings=1000,`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`embedding_dim=config.hidden_size,`
			`padding_idx=-1`
			`)`

			`def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):`
mod 2025-02-25 08:56:17 -05:00			`with autocast():`
			`outputs = super().forward(`
			`input_ids=input_ids,`
			`attention_mask=attention_mask,`
			`labels=labels,`
			`**kwargs`
			`)`
			`if source_idx is not None:`
			`source_embeds = self.source_embedding(source_idx).unsqueeze(1)`
			`outputs.logits += source_embeds`
init 2025-02-25 04:03:59 -05:00			`return outputs`

ds modification and optimalization 2025-02-25 07:34:04 -05:00			`source_mapper = SourceMapper()`
zmiana modelu 2025-02-25 09:04:47 -05:00			`model_name = "EleutherAI/gpt-neo-1.3B"`
init 2025-02-25 04:03:59 -05:00			`tokenizer = AutoTokenizer.from_pretrained(model_name)`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`tokenizer.pad_token = tokenizer.eos_token`
init 2025-02-25 04:03:59 -05:00
mod 2025-02-25 08:56:17 -05:00			`data = prepare_dataset("files", "file_catalog.json", source_mapper)`
dataset update 2025-02-25 06:25:02 -05:00			`dataset = Dataset.from_list(data)`
mod 2025-02-25 09:02:36 -05:00			`tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)`
ds modification and optimalization 2025-02-25 07:34:04 -05:00
mod 2025-02-25 08:46:02 -05:00			`config = AutoModelForCausalLM.from_pretrained(model_name).config`
			`model = CustomModel.from_pretrained(model_name)`
mod 2025-02-25 08:50:09 -05:00			`model.config.gradient_checkpointing = True`
			`model.config.use_cache = False`
mod 2025-02-25 08:29:02 -05:00			`model.resize_token_embeddings(len(tokenizer))`
mod 2025-02-25 07:51:10 -05:00			`model.gradient_checkpointing_enable()`
init 2025-02-25 04:03:59 -05:00
			`training_args = TrainingArguments(`
			`output_dir="./results",`
			`num_train_epochs=3,`
mod 2025-02-25 09:02:36 -05:00			`gradient_accumulation_steps=8,`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`learning_rate=2e-5,`
			`fp16=True,`
mod 2025-02-25 09:02:36 -05:00			`logging_steps=50,`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`save_strategy="steps",`
mod 2025-02-25 09:02:36 -05:00			`save_steps=500,`
			`per_device_train_batch_size=2,`
			`per_device_eval_batch_size=2,`
mod 2025-02-25 08:56:17 -05:00			`logging_dir='./logs'`
init 2025-02-25 04:03:59 -05:00			`)`

mod 2025-02-25 09:02:36 -05:00			`trainer = Trainer(`
init 2025-02-25 04:03:59 -05:00			`model=model,`
			`args=training_args,`
modyfikacja trenera 2025-02-25 06:32:03 -05:00			`train_dataset=tokenized_dataset,`
mod 2025-02-25 08:56:17 -05:00			`data_collator=custom_collate_fn`
init 2025-02-25 04:03:59 -05:00			`)`
mod 2025-02-25 09:02:36 -05:00
init 2025-02-25 04:03:59 -05:00			`trainer.train()`

mod 2025-02-25 09:02:36 -05:00			`free_memory()`

ds modification and optimalization 2025-02-25 07:34:04 -05:00			`# Funkcja generująca odpowiedź`
			`def generate_answer(question, model, tokenizer, source_mapper, max_length=200):`
			`inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)`

			`outputs = model.generate(`
			`**inputs,`
			`max_length=max_length,`
			`num_return_sequences=1,`
			`return_dict_in_generate=True,`
			`output_scores=True,`
			`)`
init 2025-02-25 04:03:59 -05:00
			`answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)`

ds modification and optimalization 2025-02-25 07:34:04 -05:00			`# Pobierz źródło z ostatniego tokena`
			`last_token_id = outputs.sequences[0][-1].item()`
mod 2025-02-25 08:46:02 -05:00			`source_idx = model.source_embedding.weight.shape[0] - 1 # Tymczasowe rozwiązanie`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`source = source_mapper.get_source(source_idx)`
init 2025-02-25 04:03:59 -05:00
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`return f"{answer}\n\nŹródło: {source if source else 'Opracowanie własne'}"`
init 2025-02-25 04:03:59 -05:00
			`# Przykład użycia`
			`question = "Ile dni urlopu przysługuje pracownikowi?"`
ds modification and optimalization 2025-02-25 07:34:04 -05:00			`answer = generate_answer(question, model, tokenizer, source_mapper)`
init 2025-02-25 04:03:59 -05:00			`print(answer)`