with save

l.gabrysiak 2025-02-25 19:32:32 +01:00
parent ed04739b58
commit e9fe2712a0
1 changed file with 37 additions and 30 deletions

hft.py

@@ -1,7 +1,7 @@
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, GenerationMixin
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from PIL import Image
import re
import json  # needed for json.dump in the save step added below
@@ -107,13 +107,15 @@ def tokenize_function(examples):
return tokenized
def custom_collate_fn(batch):
input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch]).cpu()
attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch]).cpu()
labels = torch.stack([torch.tensor(b["labels"]) for b in batch]).cpu()
source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long).cpu()
input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
#print("source_idx shape:", source_idx.shape) # Debugowanie
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}
class CustomModel(nn.Module, GenerationMixin):
class CustomModel(nn.Module):
def __init__(self, model_name, config):
super().__init__()
self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
@@ -122,11 +124,11 @@ class CustomModel(nn.Module, GenerationMixin):
embedding_dim=config.hidden_size,
padding_idx=-1
)
self.config = config
self.device = next(self.base_model.parameters()).device
def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
if source_idx is not None:
#print("Max source_idx:", torch.max(source_idx))
#print("Num embeddings:", self.source_embedding.num_embeddings)
source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
hidden_states = self.base_model.get_input_embeddings()(input_ids) + source_embeds
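The unsqueeze/expand above broadcasts one source embedding per example to every sequence position before it is added to the token embeddings. A shape walk-through with hypothetical sizes (batch 2, sequence length 16, hidden size 768), for illustration only:

emb = nn.Embedding(num_embeddings=10, embedding_dim=768)
source_idx = torch.tensor([0, 3])                 # (2,)
source_embeds = emb(source_idx)                   # (2, 768)
source_embeds = source_embeds.unsqueeze(1)        # (2, 1, 768)
source_embeds = source_embeds.expand(-1, 16, -1)  # (2, 16, 768): same vector repeated at every position
# the token embeddings for input_ids would also be (2, 16, 768), so the two tensors add element-wise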
@@ -136,16 +138,8 @@ class CustomModel(nn.Module, GenerationMixin):
return outputs
def prepare_inputs_for_generation(self, input_ids, **kwargs):
return self.base_model.prepare_inputs_for_generation(input_ids, **kwargs)
def _reorder_cache(self, past, beam_idx):
return self.base_model._reorder_cache(past, beam_idx)
class CustomTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
device = next(model.parameters()).device
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = inputs.pop("labels")
source_idx = inputs.pop("source_idx", None)
outputs = model(**inputs, labels=labels, source_idx=source_idx)
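compute_loss only receives a source_idx column if the custom collate function is actually plugged into the trainer; that instantiation lies outside the hunks shown here, but presumably looks roughly like this sketch (model, training_args and tokenized_dataset are the objects defined elsewhere in hft.py):

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_collate_fn,  # keeps source_idx in each batch
)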
@@ -166,9 +160,9 @@ tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
# Model initialization
config = AutoModelForCausalLM.from_pretrained(model_name).config
#print("Vocabulary size:", config.vocab_size)
model = CustomModel(model_name, config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.to("cpu") # Zmienione na CPU dla debugowania
# Konfiguracja treningu
training_args = TrainingArguments(
@@ -177,13 +171,13 @@ training_args = TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
learning_rate=2e-5,
fp16=torch.cuda.is_available(),
fp16=False, # Disabled for CPU
logging_steps=1,
logging_dir="./logs",
save_strategy="steps",
save_steps=1000,
logging_strategy="no",
report_to="none"
report_to="none",
)
# Training
@@ -197,10 +191,9 @@ trainer.train()
# Function that generates the answer
def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
device = next(model.parameters()).device
inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)
inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
outputs = model.generate(
outputs = model.base_model.generate(
**inputs,
max_length=max_length,
num_return_sequences=1,
@@ -212,12 +205,26 @@ def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
# Get the source from the last token
last_token_id = outputs.sequences[0][-1].item()
source_idx = model.source_embedding.weight.shape[0] - 1 # Temporary workaround
source = source_mapper.get_source(source_idx)
source_idx = model.source_embeddi
return f"{answer}\n\nŹródło: {source if source else 'Opracowanie własne'}"
# Create a directory to save the model
save_directory = "./trained_model/ably.do/hse"
os.makedirs(save_directory, exist_ok=True)
# Example usage
question = "Ile dni urlopu przysługuje pracownikowi?"
answer = generate_answer(question, model, tokenizer, source_mapper)
print(answer)
# 1. Save the model weights
torch.save(model.state_dict(), os.path.join(save_directory, "hse-nano-mistral.bin"))
# 2. Save the tokenizer
tokenizer.save_pretrained(save_directory)
# 3. Save the source mapping
source_mapper_data = {
"source_to_idx": dict(source_mapper.source_to_idx),
"idx_to_source": source_mapper.idx_to_source
}
with open(os.path.join(save_directory, "source_mapper.json"), 'w') as f:
json.dump(source_mapper_data, f)
# 4. Save the model config (optional but recommended)
model.base_model.config.save_pretrained(save_directory)
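For completeness, a minimal reload sketch for the artifacts saved above (illustrative only; it assumes the CustomModel class and the original model_name are available, and that import json is present at the top of hft.py for the json.dump call):

import os, json, torch
from transformers import AutoTokenizer, AutoConfig

save_directory = "./trained_model/ably.do/hse"
config = AutoConfig.from_pretrained(save_directory)
model = CustomModel(model_name, config)  # same base model_name as during training
model.load_state_dict(torch.load(os.path.join(save_directory, "hse-nano-mistral.bin"), map_location="cpu"))
model.eval()
tokenizer = AutoTokenizer.from_pretrained(save_directory)
with open(os.path.join(save_directory, "source_mapper.json")) as f:
    source_mapper_data = json.load(f)  # note: JSON turns any integer keys in idx_to_source into strings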