ably.do/allegro.py

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from weaviate.connect import ConnectionParams
import weaviate
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, MarianForCausalLM, MarianTokenizer
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
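# all-MiniLM-L6-v2 maps each document string to a 384-dimensional sentence embedding.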
# 1. Connect to Weaviate
client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
)
client.connect()
collection = client.collections.get("Document")
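# Assumes a "Document" collection already exists in Weaviate with "fileName" and "content" text properties.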
# 2. Fetch documents from Weaviate
def fetch_documents():
    response = collection.query.fetch_objects()
    documents = []
    for o in response.objects:
        file_name = o.properties.get("fileName", "unknown_file")
        content = o.properties.get("content", "")
        if content:
            documents.append(f"fileName: {file_name}, content: {content}")
            print(f"fileName: {file_name}")
    return documents
documents = fetch_documents()
embeddings = embed_model.encode(documents)
dim = embeddings.shape[1]
#index = faiss.IndexFlatL2(dim)
#index.add(np.array(embeddings, dtype=np.float32))
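# Note: the embeddings (and `dim`) are leftovers from the commented-out FAISS retrieval index;
# they are stored alongside the raw text below but are not consumed by the LoRA fine-tuning that follows.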
def create_training_data():
    data = {
        "text": documents,
        "embedding": embeddings.tolist()
    }
    return Dataset.from_dict(data)
dataset = create_training_data()
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allegro/multislav-5lang"
model = MarianForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = MarianTokenizer.from_pretrained(model_name)
lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
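# get_peft_model freezes the Marian base weights; only the rank-8 LoRA adapter matrices are trained.
# model.print_trainable_parameters()  # optional: report the trainable-parameter count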
max_length = 384
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",            # evaluate every eval_steps
    eval_steps=500,                   # evaluate every 500 steps
    save_strategy="steps",            # save a checkpoint every save_steps
    save_steps=500,                   # save every 500 steps
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="loss",     # pick the best checkpoint by eval loss
    greater_is_better=False,          # lower loss = better model
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
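# With mlm=False the collator builds causal-LM labels: a copy of input_ids with padding
# positions set to -100 so they are ignored by the loss.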
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,      # evaluation set
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("./trained_model/gemma")
tokenizer.save_pretrained("./trained_model/gemma")
print("✅ Model został wytrenowany i zapisany!")