mod allegro

This commit is contained in:
l.gabrysiak 2025-02-28 21:26:21 +01:00
parent 04747ff17b
commit 12cef050a2
1 changed file with 48 additions and 30 deletions


@@ -1,12 +1,17 @@
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import faiss
import numpy as np
import weaviate
from weaviate.connect import ConnectionParams
import tempfile
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# 1⃣ Connect to Weaviate
client = weaviate.WeaviateClient(

@@ -38,67 +43,80 @@ def fetch_documents():

documents = fetch_documents()

# Embed the fetched documents and build a FAISS L2 index over them
embeddings = embed_model.encode(documents)
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings, dtype=np.float32))
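
# --- Editor's sketch (not part of this commit): the hunks shown here build the
# FAISS index but never query it. A minimal retrieval helper, assuming the
# `embed_model`, `index` and `documents` objects defined above and a
# hypothetical query string, could look like this:
def retrieve(query: str, k: int = 3):
    query_vec = embed_model.encode([query])                                 # shape (1, dim)
    distances, indices = index.search(np.array(query_vec, dtype=np.float32), k)
    return [documents[i] for i in indices[0]]                               # k nearest documents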

# Prepare the training dataset (each example keeps the text and its embedding)
def create_training_data():
    data = {
        "text": documents,
        "embedding": embeddings.tolist()
    }
    return Dataset.from_dict(data)

dataset = create_training_data()
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Load the base model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allegro/multislav-5lang"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach LoRA adapters so only a small set of parameters is fine-tuned
lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

max_length = 384

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Training parameters
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",          # evaluate every eval_steps steps
    eval_steps=500,                 # evaluate every 500 steps
    save_strategy="steps",          # save a checkpoint every save_steps steps
    save_steps=500,                 # save a checkpoint every 500 steps
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,    # reload the best checkpoint when training ends
    metric_for_best_model="loss",   # criterion for choosing the best model
    greater_is_better=False,        # lower loss = better model
)

# Data collator for causal LM (mlm=False: no masking, labels are the input ids)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,    # evaluation set added
    data_collator=data_collator,
)

trainer.train()

# Save the fine-tuned adapter and tokenizer
model.save_pretrained("./trained_model/gemma")
tokenizer.save_pretrained("./trained_model/gemma")

print("✅ Model has been trained and saved!")
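
A minimal inference sketch for the artifacts this script writes (an editorial addition, not part of the commit): it assumes the LoRA adapter saved to ./trained_model/gemma above, and that the base checkpoint loads with AutoModelForCausalLM as in the script; the prompt string is purely hypothetical.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("allegro/multislav-5lang", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "./trained_model/gemma")   # attach the saved LoRA adapter
tokenizer = AutoTokenizer.from_pretrained("./trained_model/gemma")

prompt = "Example query"  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))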