with save

l.gabrysiak 2025-02-25 19:32:32 +01:00
parent ed04739b58
commit e9fe2712a0
1 changed file with 37 additions and 30 deletions

hft.py

@@ -1,7 +1,7 @@
 import os
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, GenerationMixin
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 from datasets import Dataset
 from PIL import Image
 import re
@@ -107,13 +107,15 @@ def tokenize_function(examples):
     return tokenized

 def custom_collate_fn(batch):
-    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch]).cpu()
-    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch]).cpu()
-    labels = torch.stack([torch.tensor(b["labels"]) for b in batch]).cpu()
-    source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long).cpu()
+    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
+    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
+    labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
+    source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
+    #print("source_idx shape:", source_idx.shape)  # debugging
     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}

-class CustomModel(nn.Module, GenerationMixin):
+class CustomModel(nn.Module):
     def __init__(self, model_name, config):
         super().__init__()
         self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
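With the .cpu() calls gone, the collator simply stacks whatever the tokenizer produced and leaves device placement to Trainer. A minimal shape check of the collator output, assuming custom_collate_fn as defined above is in scope and using made-up token IDs of equal length:

import torch

batch = [
    {"input_ids": [1, 2, 3, 4], "attention_mask": [1, 1, 1, 1], "labels": [1, 2, 3, 4], "source_idx": 0},
    {"input_ids": [5, 6, 7, 8], "attention_mask": [1, 1, 1, 1], "labels": [5, 6, 7, 8], "source_idx": 2},
]
out = custom_collate_fn(batch)
print(out["input_ids"].shape)  # torch.Size([2, 4])
print(out["source_idx"])       # tensor([0, 2])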
@@ -122,11 +124,11 @@ class CustomModel(nn.Module, GenerationMixin):
             embedding_dim=config.hidden_size,
             padding_idx=-1
         )
-        self.config = config
-        self.device = next(self.base_model.parameters()).device

     def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
         if source_idx is not None:
+            #print("Max source_idx:", torch.max(source_idx))
+            #print("Num embeddings:", self.source_embedding.num_embeddings)
             source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
             source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
             hidden_states = self.base_model.get_input_embeddings()(input_ids) + source_embeds
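The forward pass looks up one source embedding per example and adds it to every token position via unsqueeze(1).expand(...). A standalone sketch of that broadcast with hypothetical sizes (hidden_size=16, 4 sources, batch of 2, sequence length 5), not the actual model dimensions:

import torch
import torch.nn as nn

hidden_size, num_sources, batch, seq_len = 16, 4, 2, 5
source_embedding = nn.Embedding(num_sources, hidden_size)
token_embeds = torch.randn(batch, seq_len, hidden_size)  # stand-in for get_input_embeddings()(input_ids)
source_idx = torch.tensor([0, 3])                        # one source id per example
source_embeds = source_embedding(source_idx).unsqueeze(1).expand(-1, seq_len, -1)  # (2, 5, 16)
hidden_states = token_embeds + source_embeds             # same source vector added at every position
print(hidden_states.shape)  # torch.Size([2, 5, 16])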
@@ -136,16 +138,8 @@ class CustomModel(nn.Module, GenerationMixin):
         return outputs

-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        return self.base_model.prepare_inputs_for_generation(input_ids, **kwargs)
-
-    def _reorder_cache(self, past, beam_idx):
-        return self.base_model._reorder_cache(past, beam_idx)

 class CustomTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         labels = inputs.pop("labels")
         source_idx = inputs.pop("source_idx", None)
         outputs = model(**inputs, labels=labels, source_idx=source_idx)
@@ -166,9 +160,9 @@ tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
 # Initialize the model
 config = AutoModelForCausalLM.from_pretrained(model_name).config
+#print("Vocabulary size:", config.vocab_size)
 model = CustomModel(model_name, config)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = model.to(device)
+model.to("cpu")  # changed to CPU for debugging

 # Training configuration
 training_args = TrainingArguments(
@@ -177,13 +171,13 @@ training_args = TrainingArguments(
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     learning_rate=2e-5,
-    fp16=torch.cuda.is_available(),
+    fp16=False,  # disabled for CPU
     logging_steps=1,
     logging_dir="./logs",
     save_strategy="steps",
     save_steps=1000,
     logging_strategy="no",
-    report_to="none"
+    report_to="none",
 )

 # Training
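The Trainer construction itself sits between these hunks and is unchanged; a plausible wiring of the pieces above, sketched under the assumption that hft.py passes the tokenized dataset and the custom collator to CustomTrainer:

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_collate_fn,
)
trainer.train()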
@@ -197,10 +191,9 @@ trainer.train()
 # Answer-generation function
 def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
-    device = next(model.parameters()).device
-    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)
-    outputs = model.generate(
+    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
+    outputs = model.base_model.generate(
         **inputs,
         max_length=max_length,
         num_return_sequences=1,
@@ -212,12 +205,26 @@ def generate_answer(question, model, tokenizer, source_mapper, max_length=200):
     # Get the source from the last token
     last_token_id = outputs.sequences[0][-1].item()
-    source_idx = model.source_embedding.weight.shape[0] - 1  # temporary workaround
-    source = source_mapper.get_source(source_idx)
-    return f"{answer}\n\nŹródło: {source if source else 'Opracowanie własne'}"
-
-# Usage example
-question = "Ile dni urlopu przysługuje pracownikowi?"
-answer = generate_answer(question, model, tokenizer, source_mapper)
-print(answer)
+    source_idx = model.source_embeddi
+
+# Create the directory for saving the model
+save_directory = "./trained_model/ably.do/hse"
+os.makedirs(save_directory, exist_ok=True)
+
+# 1. Save the model weights
+torch.save(model.state_dict(), os.path.join(save_directory, "hse-nano-mistral.bin"))
+
+# 2. Save the tokenizer
+tokenizer.save_pretrained(save_directory)
+
+# 3. Save the source mapping
+source_mapper_data = {
+    "source_to_idx": dict(source_mapper.source_to_idx),
+    "idx_to_source": source_mapper.idx_to_source
+}
+with open(os.path.join(save_directory, "source_mapper.json"), 'w') as f:
+    json.dump(source_mapper_data, f)
+
+# 4. Save the model config (optional, but recommended)
+model.base_model.config.save_pretrained(save_directory)
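The new save block writes four artifacts: a raw state_dict, the tokenizer files, source_mapper.json, and the base-model config (json.dump is used here, so hft.py needs an import json that is not visible in these hunks). A minimal reload sketch, assuming CustomModel and model_name from hft.py are in scope; everything outside this diff is a guess:

import json
import os
import torch
from transformers import AutoConfig, AutoTokenizer

save_directory = "./trained_model/ably.do/hse"

# config and tokenizer were saved with save_pretrained, so they reload the same way
config = AutoConfig.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# rebuild the wrapper, then load the raw state_dict written by torch.save
model = CustomModel(model_name, config)
state_dict = torch.load(os.path.join(save_directory, "hse-nano-mistral.bin"), map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

# restore the source mapping (JSON turns integer keys into strings)
with open(os.path.join(save_directory, "source_mapper.json")) as f:
    source_mapper_data = json.load(f)
source_to_idx = source_mapper_data["source_to_idx"]
idx_to_source = source_mapper_data["idx_to_source"]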