import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
import weaviate
from weaviate.connect import ConnectionParams
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    MarianForCausalLM,
    MarianTokenizer,
)

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# 1️⃣ Connect to Weaviate
client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
)
client.connect()
collection = client.collections.get("Document")

# 2️⃣ Fetch documents from Weaviate
def fetch_documents():
    response = collection.query.fetch_objects()
    documents = []
    for o in response.objects:
        file_name = o.properties.get("fileName", "unknown_file")
        content = o.properties.get("content", "")
        if content:
            documents.append(f"fileName: {file_name}, content: {content}")
            print(f"fileName: {file_name}")
    return documents

documents = fetch_documents()
embeddings = embed_model.encode(documents)
dim = embeddings.shape[1]
# index = faiss.IndexFlatL2(dim)
# index.add(np.array(embeddings, dtype=np.float32))

def create_training_data():
    data = {
        "text": documents,
        "embedding": embeddings.tolist(),
    }
    return Dataset.from_dict(data)

dataset = create_training_data()
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allegro/multislav-5lang"
model = MarianForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = MarianTokenizer.from_pretrained(model_name)  # fixed: the tokenizer must come from MarianTokenizer, not the model class

# LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

max_length = 384

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",         # evaluate every eval_steps steps
    eval_steps=500,                # evaluate every 500 steps
    save_strategy="steps",         # checkpoint every save_steps steps
    save_steps=500,                # save a checkpoint every 500 steps
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,   # reload the best checkpoint when training ends
    metric_for_best_model="loss",  # criterion for selecting the best model
    greater_is_better=False,       # lower loss = better model
)

# Causal LM collator (mlm=False), so labels are the shifted input tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,  # evaluation split added
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained("./trained_model/gemma")
tokenizer.save_pretrained("./trained_model/gemma")
print("✅ Model has been trained and saved!")
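
# --- Optional follow-up: loading the saved LoRA adapter for inference ---
# A minimal sketch, assuming the same base checkpoint (model_name) and the
# adapter directory saved above; the prompt string below is purely
# illustrative and only mirrors the "fileName: ..., content: ..." training format.
from peft import PeftModel

base_model = MarianForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
inference_model = PeftModel.from_pretrained(base_model, "./trained_model/gemma").to(device)
inference_model.eval()

prompt = "fileName: example.txt, content:"  # hypothetical prompt for demonstration
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    generated = inference_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(generated[0], skip_special_tokens=True))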