import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from weaviate import WeaviateClient
from weaviate.connect import ConnectionParams

# 1️⃣ Connect to Weaviate (v4 client)
client = WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
)
client.connect()

# 2️⃣ Fetch documents from Weaviate
# Uses the v4 collections API; the original mixed the v3 `client.query.get(...).do()`
# syntax with the v4 client, which fails at runtime.
def fetch_documents():
    collection = client.collections.get("Document")
    documents = []
    for obj in collection.iterator():
        file_name = obj.properties.get("fileName", "unknown_file")
        content = obj.properties.get("content", "")
        if content:
            documents.append(f"fileName: {file_name}, content: {content}")
    return documents

documents = fetch_documents()

# 3️⃣ Initialize the model
model_name = "allegro/multislav-5lang"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4️⃣ Prepare the training data
def create_training_data():
    return Dataset.from_dict({"text": documents})

dataset = create_training_data()
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 5️⃣ Tokenization
# A seq2seq model needs `labels` to compute a loss; here the model is trained to
# reconstruct the document text, so labels are a copy of the input ids.
# Padding is left to the data collator, which pads labels with -100 so padded
# positions are ignored in the loss.
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], truncation=True, max_length=512)
    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
    return tokenized

tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 6️⃣ Training parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # named `eval_strategy` in newer transformers releases
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# 7️⃣ Data collator (dynamic padding for both inputs and labels)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 8️⃣ Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)
trainer.train()

# 9️⃣ Save the fine-tuned model and tokenizer
model.save_pretrained("./models/allegro")
tokenizer.save_pretrained("./models/allegro")

# Close the Weaviate connection
client.close()

print("✅ Model trained and saved!")