import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
import weaviate

# 1️⃣ Initialize the embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2️⃣ Connect to Weaviate and fetch the documents (weaviate-client v3 API)
client = weaviate.Client(
    url="http://weaviate:8080"  # Adjust the URL to your environment
)

collection_name = "Document"  # Assuming this is the name of your collection

response = (
    client.query
    .get(collection_name, ["content"])
    .with_additional(["id"])
    .do()
)

documents = [item["content"] for item in response["data"]["Get"][collection_name]]

# 3️⃣ Generate the embeddings
embeddings = embed_model.encode(documents)

# 4️⃣ Prepare the training data
def create_training_data():
    data = {
        "text": documents,
        "embedding": embeddings.tolist(),
    }
    return Dataset.from_dict(data)

dataset = create_training_data()

# Split the data into training and evaluation sets
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 5️⃣ Load the allegro/multislav-5lang model
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allegro/multislav-5lang"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 6️⃣ LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
model = get_peft_model(model, lora_config)

# 7️⃣ Tokenize the data
max_length = 384

def tokenize_function(examples):
    # Tokenize the source text; padding is left to the data collator below,
    # which also pads labels with -100 so no loss is computed on padding.
    model_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )
    # A seq2seq Trainer needs labels to compute a loss; with no parallel data
    # available, the same text is used as the target (a copy-style objective).
    labels = tokenizer(
        text_target=examples["text"],
        truncation=True,
        max_length=max_length,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# 8️⃣ Training parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# 9️⃣ Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# 🔟 Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

trainer.train()

# 1️⃣1️⃣ Save the model
model.save_pretrained("./models/allegro")
tokenizer.save_pretrained("./models/allegro")

print("✅ Model trained and saved!")
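
# ── Follow-up sketch (assumption, not part of the original script): reloading the
# saved LoRA adapter for inference. save_pretrained() on a PEFT model stores only
# the adapter weights, so the base checkpoint is loaded first and the adapter is
# attached on top of it. Depending on the multislav checkpoint, the input may need
# a target-language prefix token; check the model card before generating.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForSeq2SeqLM.from_pretrained("allegro/multislav-5lang")
tuned_model = PeftModel.from_pretrained(base_model, "./models/allegro")
tuned_tokenizer = AutoTokenizer.from_pretrained("./models/allegro")

# Hypothetical smoke test on a single sentence
sample = tuned_tokenizer("Przykładowe zdanie testowe.", return_tensors="pt")
generated = tuned_model.generate(**sample, max_new_tokens=128)
print(tuned_tokenizer.decode(generated[0], skip_special_tokens=True))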