diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9e525e4
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+# Use the official Python image as the base
+FROM --platform=linux/amd64 python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install git, basic utilities, and Tesseract OCR in a single layer
+RUN apt-get update && apt-get install -y git nano wget curl iputils-ping tesseract-ocr
+
+# Copy the requirements file and install the dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the source code into the container
+COPY . .
+COPY entrypoint.sh /entrypoint.sh
+
+RUN chmod +x /entrypoint.sh
+
+# Run the application
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/allegro.py b/allegro.py
index e0cac0f..978af6f 100644
--- a/allegro.py
+++ b/allegro.py
@@ -1,119 +1,126 @@
 import os
-import re
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
-from datasets import Dataset
-
-# Configuration
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-MODEL_NAME = "allegro/herbert-base-cased"
-SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]
-TEXT_FILE_PATH = "./docs/kodekspracy.txt"  # Change to the correct path
-def prepare_dataset_from_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as f:
-        text = f.read()
+import torch
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from datasets import Dataset
+from peft import LoraConfig, get_peft_model
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
+import weaviate
+from weaviate.client import WeaviateClient
+from weaviate.connect import ConnectionParams
 
-    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
-
-    formatted_articles = []
-    for article in articles:
-        article = ' '.join(article.strip().split())
-
-        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
-        if art_match:
-            art_number = art_match.group(1)
-            art_text = art_match.group(2)
-
-            paragraphs = re.split(r'(§\s*\d+\.)', art_text)
-            if len(paragraphs) > 1:
-                formatted_paragraphs = []
-                for i in range(1, len(paragraphs), 2):
-                    para_num = paragraphs[i].strip()
-                    para_text = paragraphs[i+1].strip()
-                    formatted_paragraphs.append(f"{para_num} {para_text}")
-                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]\n" + "\n".join(formatted_paragraphs)
-            else:
-                formatted = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END] {art_text}"
-
-            formatted_articles.append({"text": formatted})
-
-            questions = [
-                f"Zacytuj artykuł {art_number} Kodeksu pracy.",
-                f"Co mówi artykuł {art_number} Kodeksu pracy?",
-                f"Podaj treść artykułu {art_number} Kodeksu pracy."
-            ]
-            for question in questions:
-                formatted_articles.append({"text": f"{question}\n{formatted}"})
-
-    return formatted_articles
+# 1️⃣ Initialize the embedding model
+embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 
-def main():
-    # Initialize the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
+# 2️⃣ Connect to Weaviate and fetch the documents
+client = WeaviateClient(
+    connection_params=ConnectionParams.from_params(
+        http_host="weaviate",
+        http_port=8080,
+        http_secure=False,
+        grpc_host="weaviate",
+        grpc_port=50051,
+        grpc_secure=False,
+    )
+)
+client.connect()
 
-    print(f"Pad token: {tokenizer.pad_token}")
-    print(f"Pad token ID: {tokenizer.pad_token_id}")
+collection_name = "Document"  # Assuming this is the name of your collection
+collection = client.collections.get(collection_name)
+
+# Fetch all documents via the v4 collections API
+documents = [
+    obj.properties["content"]
+    for obj in collection.iterator(return_properties=["content"])
+]
+client.close()
 
-    # Prepare the data
-    data = prepare_dataset_from_file(TEXT_FILE_PATH)
-    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})
+# 3️⃣ Generate embeddings
+embeddings = embed_model.encode(documents)
 
-    # Tokenization
-    def tokenize_function(examples):
-        tokenized = tokenizer(
-            examples["text"],
-            truncation=True,
-            padding="max_length",
-            max_length=512,
-            return_tensors="pt"
-        )
-        tokenized["labels"] = tokenized["input_ids"].clone()
-        return tokenized
+# 4️⃣ Prepare the training data (the embedding column is kept for
+# reference only; the Trainer drops unused columns automatically)
+def create_training_data():
+    data = {
+        "text": documents,
+        "embedding": embeddings.tolist()
+    }
+    return Dataset.from_dict(data)
 
-    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
+dataset = create_training_data()
+
+# Split the data into training and evaluation sets
+split_dataset = dataset.train_test_split(test_size=0.25)
+train_dataset = split_dataset["train"]
+eval_dataset = split_dataset["test"]
+
+# 5️⃣ Load the allegro/multislav-5lang model
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = "allegro/multislav-5lang"
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-    # Model and data collator
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-    model.resize_token_embeddings(len(tokenizer))
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False
+# 6️⃣ LoRA configuration (target_modules may need to be listed explicitly
+# if PEFT cannot infer them for this architecture)
+lora_config = LoraConfig(
+    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="SEQ_2_SEQ_LM"
+)
+model = get_peft_model(model, lora_config)
+
+# 7️⃣ Tokenize the data
+max_length = 384
+
+def tokenize_function(examples):
+    tokenized = tokenizer(
+        examples["text"],
+        padding="max_length",
+        truncation=True,
+        max_length=max_length
     )
+    # Self-supervised objective: the target sequence is the input text itself
+    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
+    return tokenized
 
-    # Training configuration
-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=32,
-        per_device_train_batch_size=2,
-        learning_rate=1e-5,
-        logging_steps=10,
-        weight_decay=0.01,
-        report_to="none",
-        save_strategy="steps",
-        save_steps=500,
-        evaluation_strategy="steps",
-        eval_steps=500,
-        load_best_model_at_end=True,
-    )
+tokenized_train = train_dataset.map(tokenize_function, batched=True)
+tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
 
-    # Trainer
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset,
-        eval_dataset=tokenized_dataset,
-        data_collator=data_collator
-    )
+# 8️⃣ Training parameters
+training_args = TrainingArguments(
+    output_dir="./results",
+    eval_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,
+    learning_rate=1e-5,
+    per_device_train_batch_size=2,
+    per_device_eval_batch_size=2,
+    num_train_epochs=16,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model="loss",
+    greater_is_better=False,
+)
 
-    print("Rozpoczęcie treningu...")
-    trainer.train()
-    trainer.save_model("./trained_model/allegro")
-    tokenizer.save_pretrained("./trained_model/allegro")
+# 9️⃣ Data collator
+data_collator = DataCollatorForSeq2Seq(
+    tokenizer=tokenizer,
+    model=model
+)
 
-if __name__ == "__main__":
-    main()
+# 🔟 Train the model
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_eval,
+    data_collator=data_collator,
+)
+
+trainer.train()
+
+# 1️⃣1️⃣ Save the model
+model.save_pretrained("./models/allegro")
+tokenizer.save_pretrained("./models/allegro")
+
+print("✅ Model trained and saved!")
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..bc267ab
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+git config --global credential.helper store
+git config --global user.name "${GIT_USERNAME}"
+git config --global user.email "${GIT_EMAIL}"
+echo "https://${GIT_USERNAME}:${GIT_TOKEN}@${GIT_HOST}" > ~/.git-credentials
+cd /home
+git clone --single-branch --branch main/finetuning https://repo.pokash.pl/POKASH.PL/ably.do.git
+python "/app/${MODELNAME}.py"
+
+# After the main process finishes, switch to standby mode
+echo "Main process finished. Entering standby mode..."
+tail -f /dev/null
diff --git a/requirements.txt b/requirements.txt
index cfc1745..ae350df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,8 @@ Pillow>=9.4.0
 pytesseract>=0.3.10
 python-docx>=0.8.11
 PyPDF2>=3.0.1
-huggingface-hub>=0.16.4
\ No newline at end of file
+huggingface-hub>=0.16.4
+numpy
+peft
+sentence-transformers
+weaviate-client
\ No newline at end of file
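
As a quick smoke test of the artifacts this diff produces, the saved adapter can be reloaded for inference. This is a minimal sketch, not part of the patch, assuming the ./models/allegro output directory written by allegro.py above; PeftModel.from_pretrained attaches the saved LoRA weights to the base allegro/multislav-5lang checkpoint, and the sample input string is only a placeholder:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base checkpoint, then attach the saved LoRA adapter to it
base_model = AutoModelForSeq2SeqLM.from_pretrained("allegro/multislav-5lang")
model = PeftModel.from_pretrained(base_model, "./models/allegro").to(device)
tokenizer = AutoTokenizer.from_pretrained("./models/allegro")

# Run a single generation to confirm the adapter loads and the model responds
inputs = tokenizer("Przykładowy tekst wejściowy.", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))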
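
Likewise, a sketch of how the image could be built and started, assuming a hypothetical tag finetune-allegro; the environment variables are the ones entrypoint.sh reads (GIT_USERNAME, GIT_EMAIL, GIT_TOKEN, GIT_HOST, MODELNAME), with MODELNAME=allegro selecting /app/allegro.py:

docker build -t finetune-allegro .
docker run --rm \
  -e GIT_USERNAME=<user> \
  -e GIT_EMAIL=<email> \
  -e GIT_TOKEN=<token> \
  -e GIT_HOST=repo.pokash.pl \
  -e MODELNAME=allegro \
  finetune-allegro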