import torch
import weaviate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from weaviate.connect import ConnectionParams

# 1️⃣ Connect to Weaviate
client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
)
client.connect()
collection = client.collections.get("Document")

# 2️⃣ Fetch documents from Weaviate
def fetch_documents():
    response = collection.query.fetch_objects()
    documents = []
    for o in response.objects:
        file_name = o.properties.get("fileName", "unknown_file")
        content = o.properties.get("content", "")
        if content:
            documents.append(f"fileName: {file_name}, content: {content}")
            print(f"fileName: {file_name}")
    return documents

documents = fetch_documents()
client.close()

# 3️⃣ Initialize the model
model_name = "allegro/multislav-5lang"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4️⃣ Prepare the training data
def create_training_data():
    return Dataset.from_dict({"text": documents})

dataset = create_training_data()
split_dataset = dataset.train_test_split(test_size=0.25)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 5️⃣ Tokenization
def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )
    # Seq2seq training needs labels to compute a loss. There is no parallel
    # corpus here, so the target is the input itself (an assumption — for real
    # translation fine-tuning, labels should come from target-language text).
    # Padding is left to DataCollatorForSeq2Seq, which pads labels with -100
    # so that padding is ignored by the loss.
    tokens["labels"] = [ids.copy() for ids in tokens["input_ids"]]
    return tokens

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# 6️⃣ Training parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # renamed to eval_strategy in newer transformers releases
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# 7️⃣ Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 8️⃣ Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)
trainer.train()

# 9️⃣ Save the model
model.save_pretrained("./models/allegro")
tokenizer.save_pretrained("./models/allegro")
print("✅ Model trained and saved!")
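
# --- Optional sanity check: a minimal inference sketch, not part of the
# original pipeline. It reloads the fine-tuned weights from ./models/allegro
# and generates output for one fetched document. Assumptions: MultiSlav
# models may expect a target-language prefix token (check the model card),
# and the fallback sample string below is a hypothetical placeholder.
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./models/allegro").to(device)
finetuned_tokenizer = AutoTokenizer.from_pretrained("./models/allegro")
sample = documents[0] if documents else "fileName: example.txt, content: example"
inputs = finetuned_tokenizer(
    sample, return_tensors="pt", truncation=True, max_length=512
).to(device)
output_ids = finetuned_model.generate(**inputs, max_new_tokens=128)
print(finetuned_tokenizer.decode(output_ids[0], skip_special_tokens=True))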