ably.do/allegro.py

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import weaviate
from weaviate.client import WeaviateClient
from weaviate.connect import ConnectionParams
# 1. Initialize the embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# 2. Connect to Weaviate and fetch the documents (v4 client API)
client = WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
)
client.connect()
collection_name = "Document"  # assuming this is the name of your collection
collection = client.collections.get(collection_name)
# Iterate over every object in the collection and collect its "content" property
documents = [obj.properties["content"] for obj in collection.iterator()]
client.close()  # the client is no longer needed once the documents are fetched
# 3. Generate sentence embeddings for the fetched documents
embeddings = embed_model.encode(documents)
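# embeddings is a NumPy array of shape (num_documents, 384);
# all-MiniLM-L6-v2 produces 384-dimensional sentence vectors.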
# 4. Prepare the training data
def create_training_data():
    # The embeddings are stored alongside the text; only the tokenized text is
    # consumed by the seq2seq Trainer below.
    data = {
        "text": documents,
        "embedding": embeddings.tolist(),
    }
    return Dataset.from_dict(data)
dataset = create_training_data()
# Split the data into training and evaluation sets
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
# 5. Load the allegro/multislav-5lang model
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allegro/multislav-5lang"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 6. LoRA configuration
lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)
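# Optional sanity check: report how many parameters LoRA leaves trainable
# (typically a small fraction of the base model).
model.print_trainable_parameters()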
# 7. Tokenize the data
max_length = 384
def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )
    # The dataset has no separate target texts, so the input is reused as the
    # label (a copy-style objective); this assumption lets the Trainer compute
    # a seq2seq loss. Padding is left to DataCollatorForSeq2Seq, which also
    # pads labels with -100 so padding tokens are ignored by the loss.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)
# 9. Data collator (pads inputs dynamically and pads labels with -100)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)
# 10. Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)
trainer.train()
# 11. Save the model (LoRA adapter weights) and the tokenizer
model.save_pretrained("./models/allegro")
tokenizer.save_pretrained("./models/allegro")
print("✅ Model has been trained and saved!")