diff --git a/allegro.py b/allegro.py
index dabe8f4..92f20d3 100644
--- a/allegro.py
+++ b/allegro.py
@@ -1,122 +1,44 @@
-import os
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from transformers import MarianForCausalLM, MarianTokenizer, Trainer, TrainingArguments
+from datasets import load_dataset
 
-from weaviate.connect import ConnectionParams
-import weaviate
-
-import torch
-import numpy as np
-from sentence_transformers import SentenceTransformer
-from datasets import Dataset
-from peft import LoraConfig, get_peft_model
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, MarianForCausalLM, MarianTokenizer
-
-embed_model = SentenceTransformer("all-MiniLM-L6-v2")
-
-# 1️⃣ Connect to Weaviate
-client = weaviate.WeaviateClient(
-    connection_params=ConnectionParams.from_params(
-        http_host="weaviate",
-        http_port=8080,
-        http_secure=False,
-        grpc_host="weaviate",
-        grpc_port=50051,
-        grpc_secure=False,
-    )
-)
-client.connect()
-
-collection = client.collections.get("Document")
-
-# 2️⃣ Fetch documents from Weaviate
-def fetch_documents():
-    response = collection.query.fetch_objects()
-    documents = []
-    for o in response.objects:
-        file_name = o.properties.get("fileName", "unknown_file")
-        content = o.properties.get("content", "")
-        if content:
-            documents.append(f"fileName: {file_name}, content: {content}")
-            print(f"fileName: {file_name}")
-    return documents
-    #return documents
-
-documents = fetch_documents()
-
-embeddings = embed_model.encode(documents)
-
-dim = embeddings.shape[1]
-#index = faiss.IndexFlatL2(dim)
-#index.add(np.array(embeddings, dtype=np.float32))
-
-def create_training_data():
-    data = {
-        "text": documents,
-        "embedding": embeddings.tolist()
-    }
-    return Dataset.from_dict(data)
-
-dataset = create_training_data()
-
-split_dataset = dataset.train_test_split(test_size=0.25)
-train_dataset = split_dataset["train"]
-eval_dataset = split_dataset["test"]
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the model and tokenizer
 model_name = "allegro/multislav-5lang"
-model = MarianForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
-tokenizer = MarianForCausalLM.from_pretrained(model_name)
+model = MarianForCausalLM.from_pretrained(model_name)
+tokenizer = MarianTokenizer.from_pretrained(model_name)
 
-lora_config = LoraConfig(
-    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
-)
-model = get_peft_model(model, lora_config)
-
-max_length = 384
+# Load the data (example: Romanian-to-English translation)
+dataset = load_dataset("wmt16", "ro-en")
 
+# Preprocess the data into the format the model expects
 def tokenize_function(examples):
-    return tokenizer(
-        examples["text"],
-        padding="max_length",
-        truncation=True,
-        max_length=max_length
-    )
+    # WMT16's "translation" column holds dicts like {"ro": ..., "en": ...};
+    # extract plain strings and build labels so the Trainer can compute a loss.
+    sources = [pair["ro"] for pair in examples["translation"]]
+    targets = [pair["en"] for pair in examples["translation"]]
+    model_inputs = tokenizer(sources, truncation=True, padding="max_length", max_length=128)
+    labels = tokenizer(text_target=targets, truncation=True, padding="max_length", max_length=128)
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
 
-tokenized_train = train_dataset.map(tokenize_function, batched=True)
-tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
+# Configure the trainer
 training_args = TrainingArguments(
     output_dir="./results",
-    eval_strategy="steps",  # Evaluate every given number of steps
-    eval_steps=500,  # Evaluate every 500 steps
-    save_strategy="steps",  # Save the model every given number of steps
-    save_steps=500,  # Save the model every 500 steps
-    learning_rate=1e-5,
-    per_device_train_batch_size=2,
-    per_device_eval_batch_size=2,
-    num_train_epochs=16,
+    evaluation_strategy="epoch",
+    learning_rate=5e-5,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    num_train_epochs=3,
     weight_decay=0.01,
-    load_best_model_at_end=True,  # Load the best model at the end
-    metric_for_best_model="loss",  # Criterion for picking the best model
-    greater_is_better=False,  # Lower loss = better model
-)
-
-data_collator = DataCollatorForLanguageModeling(
-    tokenizer=tokenizer,
-    mlm=False
 )
 
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_train,
-    eval_dataset=tokenized_eval,  # Added evaluation set
-    data_collator=data_collator,
+    train_dataset=tokenized_datasets["train"],
+    eval_dataset=tokenized_datasets["test"],
 )
 
-trainer.train()
-
-model.save_pretrained("./trained_model/gemma")
-tokenizer.save_pretrained("./trained_model/gemma")
-
-print("✅ Model has been trained and saved!")
\ No newline at end of file
+# Train the model
+trainer.train()
\ No newline at end of file
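
The previous version of the script ended by persisting the fine-tuned weights with save_pretrained; the new version stops at trainer.train(). Below is a minimal follow-up sketch for saving and smoke-testing the result. The output directory and the sample sentence are assumed placeholders, not values defined by this patch, and for translation fine-tuning the seq2seq MarianMTModel class (with Seq2SeqTrainer) would be the more conventional choice than the decoder-only MarianForCausalLM used here.

# Assumed follow-up, not part of the patch above.
output_dir = "./trained_model/multislav"  # placeholder path, analogous to the removed ./trained_model/gemma
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Quick generation check on a single Romanian sentence (placeholder text);
# output quality depends on how the causal-LM head was fine-tuned.
sample = "Aceasta este o propoziție de test."
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(generated[0], skip_special_tokens=True))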