mod herbert

2025-03-01 11:35:22 +01:00 · 2025-03-01 11:32:47 +01:00 · 2025-03-01 11:29:07 +01:00 · 2025-03-01 09:47:44 +01:00 · 2025-03-01 00:26:32 +01:00 · 2025-03-01 00:23:47 +01:00
25 changed files with 4659 additions and 534 deletions
--- a/30
+++ b/30
@ -1,30 +0,0 @@
 # Use the official Python image as the base
 FROM --platform=linux/amd64 python:3.9-slim
 # Set the working directory inside the container
 WORKDIR /app
 # Install system tools and Tesseract OCR in a single layer, so the package
 # index fetched by `apt-get update` is always fresh for every install, and
 # drop the apt lists afterwards to keep the image small
 RUN apt-get update \
     && apt-get install -y git nano wget curl iputils-ping tesseract-ocr \
     && rm -rf /var/lib/apt/lists/*
 # Copy the requirements file first so the dependency layer can be cached
 COPY requirements.txt .
 # Install Python dependencies (previously this COPY/RUN pair was duplicated)
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy the application source into the container
 COPY . .
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
 # Start the application
 ENTRYPOINT ["/entrypoint.sh"]
--- a/allegro.py
+++ b/allegro.py
@ -0,0 +1,9 @@
 # Download the allegro/multislav-5lang seq2seq model and tokenizer and store
 # local copies under ./models/ably.
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 model = AutoModelForSeq2SeqLM.from_pretrained("allegro/multislav-5lang")
 tokenizer = AutoTokenizer.from_pretrained("allegro/multislav-5lang")
 # Both artifacts are written to the same directory.
 model.save_pretrained("./models/ably")
 tokenizer.save_pretrained("./models/ably")
 # NOTE(review): the message below says the model was trained and saved, but
 # this script performs no training; it only loads and re-saves the model.
 print("✅ Model został wytrenowany i zapisany!")
--- a/catalog.json
+++ b/catalog.json
@ -0,0 +1,5 @@
 {
    "kodekspracy": "Kodeks Pracy",
    "urlopproporcjonalny": "Rozporządzenie BHP",
    "ustawaopanstwowejinspekcjipracy": "Ustawa o Państwowej inspekcji pracy"
 }
--- a/docs/Rozporządzenie
+++ b/docs/Rozporządzenie
--- a/docs/Rozporządzenie
+++ b/docs/Rozporządzenie
--- a/docs/Rozporządzenie
+++ b/docs/Rozporządzenie
--- a/docs/Rozporządzenie
+++ b/docs/Rozporządzenie
--- a/Pożarnej.pdf
+++ b/Pożarnej.pdf
--- a/przeciwpożarowej.pdf
+++ b/przeciwpożarowej.pdf
--- a/zawodowych.pdf
+++ b/zawodowych.pdf
--- a/docs/kodekspracy.txt
+++ b/docs/kodekspracy.txt
--- a/docs/urlopproporcjonalny.txt
+++ b/docs/urlopproporcjonalny.txt
@ -0,0 +1,11 @@
 Podstawowe zasady naliczania urlopu proporcjonalnego
 Kalendarzowy miesiąc pracy odpowiada 1/12 wymiaru urlopu wypoczynkowego, który przysługuje pracownikowi na podstawie art. 154 § 1 i 2 k.p. To oznacza, że 1 kalendarzowy miesiąc pracy to 1/12 z 20 dni (1,66) lub 26 dni (2,16) urlopu wypoczynkowego dla pracownika na pełnym etacie. Niektórzy zaokrąglają wyniki do 1,67 dnia urlopu i 2,17 dnia urlopu.
 Niepełny kalendarzowy miesiąc pracy zaokrągla się w górę do pełnego miesiąca. Jeżeli pracownik przepracuje tylko 1 dzień w miesiącu, zyska prawo do urlopu za cały miesiąc.
 Niepełny dzień urlopu zaokrągla się w górę do pełnego dnia. Uwaga – nie musisz tak postąpić w przypadku urlopu liczonego proporcjonalnie dla osoby, która podjęła pierwszą pracę w życiu.
 Zaokrąglając niepełne dni urlopu, pamiętaj, że wymiar urlopu wypoczynkowego należny pracownikowi pełnoetatowemu w danym roku kalendarzowym nie może przekroczyć 20 lub 26 dni (w zależności od stażu pracy).
 Jeśli pracownik rozwiązuje umowę o pracę z dotychczasowym pracodawcą i zawiera nową umowę o pracę z kolejnym pracodawcą w tym samym miesiącu kalendarzowym, to tylko wcześniejszy pracodawca zaokrągla ten niepełny miesiąc pracy w górę.
--- a/docs/ustawaopanstwowejinspekcjipracy.pdf
+++ b/docs/ustawaopanstwowejinspekcjipracy.pdf
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -1,8 +0,0 @@
 #!/bin/bash
 # Container entrypoint: configure git identity and credentials, clone the
 # repository branch given by GIT_BRANCH, then start the monitoring script.
 git config --global credential.helper store
 # Quote the expansions so values containing spaces (e.g. a full name) are
 # passed to git as a single argument.
 git config --global user.name "${GIT_USERNAME}"
 git config --global user.email "${GIT_EMAIL}"
 echo "https://${GIT_USERNAME}:${GIT_TOKEN}@${GIT_HOST}" > ~/.git-credentials
 # The credential file holds a plain-text token; restrict it to the owner.
 chmod 600 ~/.git-credentials
 cd /home
 git clone --single-branch --branch "${GIT_BRANCH}" https://repo.pokash.pl/POKASH.PL/ably.do.git
 python /app/monitoring.py
--- a/finding.py
+++ b/finding.py
@ -1,100 +0,0 @@
 import weaviate
 from weaviate.connect import ConnectionParams
 import re
 # Konfiguracja klienta Weaviate
 client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
 )
 client.connect()
 # Pobierz kolekcję
 collection = client.collections.get("Document")
 def extract_full_article(content, article_number):
    """Return the full text of a single article from a legal document.

    The article starts at ``Art. <article_number>.`` and runs up to (but not
    including) the next ``Art. <digits>.`` heading, or to the end of
    ``content`` when no further heading follows.

    Args:
        content: Document text to search.
        article_number: Article number (string or int); matched literally.

    Returns:
        The stripped article text, or ``None`` when the heading is not found.
    """
    # Escape the number so regex metacharacters in caller-supplied input
    # cannot change or break the pattern.
    number = re.escape(str(article_number))
    pattern = rf"Art\.\s*{number}\..*?(?=Art\.\s*\d+\.|\Z)"
    match = re.search(pattern, content, re.DOTALL)
    return match.group(0).strip() if match else None
 def extract_relevant_fragment(content, query, context_size=100):
    """Select the part of ``content`` that best matches ``query``.

    If the query begins with an article reference (``Art. <n>``), the whole
    article is returned when it can be extracted. Otherwise the first
    case-insensitive occurrence of the query is returned with up to
    ``context_size`` characters on each side, wrapped in ``...``. When the
    query does not occur at all, the first 200 characters are returned.
    """
    article_ref = re.match(r"Art\.\s*(\d+)", query)
    if article_ref is not None:
        article_text = extract_full_article(content, article_ref.group(1))
        if article_text:
            return article_text
    position = content.lower().find(query.lower())
    if position == -1:
        # No literal hit: fall back to the beginning of the document.
        return content[:200] + "..."
    window_start = max(0, position - context_size)
    window_end = min(len(content), position + len(query) + context_size)
    return f"...{content[window_start:window_end]}..."
 def vector_search(query, limit=5):
    """Run a near-text query on the collection and print each hit.

    For every returned object the UUID, the fragment chosen by
    ``extract_relevant_fragment`` and the file name are printed.
    """
    print(f"\nWyszukiwanie wektorowe dla zapytania: '{query}'")
    hits = collection.query.near_text(query=query, limit=limit).objects
    for hit in hits:
        print(f"UUID: {hit.uuid}")
        fragment = extract_relevant_fragment(hit.properties['content'], query)
        print(f"Relewantny fragment:\n{fragment}")
        print(f"Nazwa pliku: {hit.properties['fileName']}")
        print("---")
 def hybrid_search(query, limit=5, alpha=0.5):
    """Run a hybrid query on the collection and print each hit.

    ``alpha`` and ``limit`` are forwarded to ``collection.query.hybrid``.
    For every returned object the UUID, the fragment chosen by
    ``extract_relevant_fragment`` and the file name are printed.
    """
    print(f"\nWyszukiwanie hybrydowe dla zapytania: '{query}'")
    hits = collection.query.hybrid(query=query, alpha=alpha, limit=limit).objects
    for hit in hits:
        print(f"UUID: {hit.uuid}")
        fragment = extract_relevant_fragment(hit.properties['content'], query)
        print(f"Relewantny fragment:\n{fragment}")
        print(f"Nazwa pliku: {hit.properties['fileName']}")
        print("---")
 #exists = client.collections.exists("Document")
 #print(f"Czy kolekcja 'Document' istnieje: {exists}")
 #schema = collection.config.get()
 #print(f"Nazwa kolekcji: {schema.name}")
 #print("Właściwości:")
 #for prop in schema.properties:
 #    print(f"- {prop.name}: {prop.data_type}")
 #collection = client.collections.get("Document")
 #count = collection.aggregate.over_all(total_count=True).total_count
 #print(f"Liczba obiektów w kolekcji: {count}")
 #results = collection.query.fetch_objects(limit=5)
 #for obj in results.objects:
 #    print(f"UUID: {obj.uuid}")
 #    print(f"Nazwa pliku: {obj.properties['fileName']}")
 #    print(f"Zawartość: {obj.properties['content'][:100]}...")  # Pierwsze 100 znaków
 #    print("---")
 # Przykładowe użycie
 queries = ["Art. 154", "urlop wypoczynkowy", "Państwowa Inspekcja Pracy", "Art. 154 Kodeks pracy"]
 for query in queries:
    vector_search(query)
    hybrid_search(query)
 # Zamknij połączenie
 client.close()
--- a/gemma-faiss.py
+++ b/gemma-faiss.py
@ -0,0 +1,93 @@
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import faiss
 import numpy as np
 import ollama
 import gradio as gr
 import os
 import argparse
 from sentence_transformers import SentenceTransformer
 # === KONFIGURACJA ===
 model_name = "hse.ably.do:latest"  # Nazwa modelu Ollama
 faiss_index_path = "faiss_index.idx"  # Plik indeksu FAISS
 kodeks_file = "/home/ably.do/docs/kodekspracy.txt"  # Plik z treścią kodeksu pracy
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Model do embedowania tekstu
 # === KROK 1: WCZYTYWANIE KODEKSU PRACY ===
 def load_kodeks(filepath):
    """Read the code text and return its article sections.

    The file is split on blank lines; only blocks that start with ``Art.``
    after stripping are kept, each returned stripped.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    sections = []
    for block in raw_text.split("\n\n"):
        cleaned = block.strip()
        if cleaned.startswith("Art."):
            sections.append(cleaned)
    return sections
 # === KROK 2: TWORZENIE INDEKSU FAISS ===
 def create_faiss_index(sections):
    """Embed ``sections``, add them to a flat L2 FAISS index and save it.

    The index is written to ``faiss_index_path``.

    Returns:
        A ``(index, sections)`` tuple.
    """
    vectors = embedding_model.encode(sections, convert_to_numpy=True)
    vector_dim = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(vector_dim)
    flat_index.add(vectors)
    faiss.write_index(flat_index, faiss_index_path)
    return flat_index, sections
 # === KROK 3: WYSZUKIWANIE NAJBLIŻSZEGO FRAGMENTU ===
 def search_faiss(query, index, sections, top_k=3):
    """Return the ``top_k`` sections closest to ``query``, joined by blank lines.

    Args:
        query: Free-text question to embed and search for.
        index: FAISS index built over the embeddings of ``sections``.
        sections: Section texts, in the same order they were indexed.
        top_k: Number of neighbours requested from the index.

    Returns:
        The matching sections joined with ``"\\n\\n"`` (may be empty).
    """
    query_vector = embedding_model.encode([query], convert_to_numpy=True)
    _, idx = index.search(query_vector, top_k)
    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors; without the lower bound, -1 would silently select
    # sections[-1]. The upper bound guards against a stale index on disk.
    results = [sections[i] for i in idx[0] if 0 <= i < len(sections)]
    return "\n\n".join(results)
 # === KROK 4: GENEROWANIE ODPOWIEDZI Z OLLAMA ===
 def generate_response(user_query):
    """Answer ``user_query`` using the best-matching sections as context.

    The FAISS index is read from ``faiss_index_path`` and the code text is
    reloaded on every call; the retrieved sections are placed in a prompt
    sent to the Ollama model ``model_name``.

    Returns:
        The text of the model's answer, or a Polish error message when the
        index is missing, cannot be loaded, or the reply has no content.
    """
    if not os.path.exists(faiss_index_path):
        return "Błąd: Indeks FAISS nie istnieje. Uruchom aplikację z opcją --rebuild-index."
    try:
        index = faiss.read_index(faiss_index_path)
    except Exception as e:
        return f"Błąd ładowania FAISS: {str(e)}"
    sections = load_kodeks(kodeks_file)
    best_match = search_faiss(user_query, index, sections)
    # DEBUG: show what FAISS returned
    print(f"🔍 Najlepsze dopasowanie FAISS dla '{user_query}':\n{best_match}")
    prompt = f"""
    Odpowiedz na pytanie na podstawie następującego tekstu:
    {best_match}
    Pytanie: {user_query}
    Podaj dokładny tekst artykułu, jeśli go znajdziesz w treści powyżej.
    """
    response = ollama.chat(model=model_name, messages=[{"role": "user", "content": prompt}])
    print(f"📝 Odpowiedź modelu:\n{response}")  # DEBUG: show the raw Ollama reply
    # The answer text lives in response["message"]["content"]; returning the
    # whole message object would put its repr into the output textbox.
    try:
        answer = response["message"]["content"]
    except (KeyError, TypeError):
        answer = None
    if answer is None:
        return "Błąd: Nie udało się wygenerować odpowiedzi."
    return answer
 # === KROK 5: INTERFEJS WEBOWY ===
 iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Zadaj pytanie o kodeks pracy"),
    outputs=gr.Textbox(label="Odpowiedź"),
    title="Asystent Kodeksu Pracy",
    description="Wpisz pytanie, a system zwróci odpowiedni fragment kodeksu pracy."
 )
 # Script entry point: optionally (re)build the FAISS index, then start the UI.
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--rebuild-index", action="store_true", help="Odbudowanie indeksu FAISS")
    args = parser.parse_args()
    # Build the index when explicitly requested or when no index file exists.
    if args.rebuild_index or not os.path.exists(faiss_index_path):
        print("Tworzenie nowego indeksu FAISS...")
        sections = load_kodeks(kodeks_file)
        create_faiss_index(sections)
    else:
        print("Indeks FAISS już istnieje.")
    # NOTE(review): share=True presumably requests a public Gradio share
    # link; confirm this exposure is intended.
    iface.launch(share=True)
--- a/gemma.py
+++ b/gemma.py
@ -0,0 +1,119 @@
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import torch
 import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from datasets import Dataset
 from peft import LoraConfig, get_peft_model
 from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 # 1️⃣ Inicjalizacja modelu do embeddingów
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # 2️⃣ Dodanie dokumentów i embeddingów
 def read_documents_from_file(file_path):
    """Return the article blocks found in a UTF-8 text file.

    The text is split on blank lines; a block is kept (stripped) only when
    it starts with ``Art.`` after stripping.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    stripped_blocks = (block.strip() for block in raw_text.split('\n\n'))
    return [block for block in stripped_blocks if block.startswith('Art.')]
 #documents = [
 #    "Jak założyć firmę w Polsce?", 
 #    "Jak rozliczyć podatek VAT?", 
 #    "Procedura składania reklamacji w e-sklepie.",
 #    "Jakie dokumenty są potrzebne do rejestracji działalności?"
 #]
 file_path = './docs/kodekspracy.txt'  # Zmień na właściwą ścieżkę
 documents = read_documents_from_file(file_path)
 embeddings = embed_model.encode(documents)
 # 3️⃣ Inicjalizacja FAISS i dodanie wektorów
 dim = embeddings.shape[1]
 index = faiss.IndexFlatL2(dim)
 index.add(np.array(embeddings, dtype=np.float32))
 # 4️⃣ Przygotowanie danych treningowych
 def create_training_data():
    """Build a ``Dataset`` pairing each document with its embedding.

    Reads the module-level ``documents`` and ``embeddings``.
    """
    columns = {"text": documents, "embedding": embeddings.tolist()}
    return Dataset.from_dict(columns)
 dataset = create_training_data()
 # Podział danych na treningowe i ewaluacyjne
 split_dataset = dataset.train_test_split(test_size=0.25)
 train_dataset = split_dataset["train"]
 eval_dataset = split_dataset["test"]
 # 5️⃣ Ładowanie modelu Gemma 2B
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "google/gemma-2-2b"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 # 6️⃣ Konfiguracja LoRA
 lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
 )
 model = get_peft_model(model, lora_config)
 # 7️⃣ Tokenizacja danych
 max_length = 384
 def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating and padding to ``max_length``."""
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    return encoded
 tokenized_train = train_dataset.map(tokenize_function, batched=True)
 tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
 # 8️⃣ Parametry treningu
 training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",          # Ewaluacja co określoną liczbę kroków
    eval_steps=500,                 # Ewaluacja co 500 kroków
    save_strategy="steps",          # Zapis modelu co określoną liczbę kroków
    save_steps=500,                 # Zapis modelu co 500 kroków
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,    # Wczytaj najlepszy model na końcu
    metric_for_best_model="loss",   # Kryterium wyboru najlepszego modelu
    greater_is_better=False,        # Niższy loss = lepszy model
 )
 # 9️⃣ Data Collator
 data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
 )
 # 🔟 Trening modelu
 trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,    # Dodany zestaw ewaluacyjny
    data_collator=data_collator,
 )
 trainer.train()
 # 1️⃣1️⃣ Zapis modelu
 model.save_pretrained("./trained_model/gemma")
 tokenizer.save_pretrained("./trained_model/gemma")
 print("✅ Model został wytrenowany i zapisany!")
--- a/gpt.py
+++ b/gpt.py
@ -0,0 +1,118 @@
 import os
 import re
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 # Konfiguracja
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 MODEL_NAME = "gpt2-medium"
 SPECIAL_TOKENS = ["[CITATION_START]", "[CITATION_END]"]
 TEXT_FILE_PATH = "./docs/kodekspracy.txt"  # Zmień na właściwą ścieżkę
 def prepare_dataset_from_file(file_path):
    """Build causal-LM training samples from a text file of numbered articles.

    Each article found yields four samples: the formatted article itself and
    three question prompts, each followed by the same formatted article.

    Args:
        file_path: Path to a UTF-8 text file whose articles begin with
            ``Art. <number>.``.

    Returns:
        A list of ``{"text": ...}`` dicts.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Extract the articles with a regular expression
    articles = re.findall(r'Art\.\s*\d+[a-z]*\..*?(?=\s*Art\.\s*\d+[a-z]*\.|\Z)', text, flags=re.DOTALL)
    formatted_articles = []
    for article in articles:
        # Collapse redundant whitespace
        article = ' '.join(article.strip().split())
        # Separate the article number from its body
        art_match = re.match(r'Art\.\s*(\d+[a-z]*)\.?\s*(.*)', article, re.DOTALL)
        if not art_match:
            # Skip instead of reusing art_number/formatted left over from the
            # previous iteration (or hitting a NameError on the first one).
            continue
        art_number, art_text = art_match.groups()
        header = f"[CITATION_START] Kodeks Pracy, Art. {art_number} [CITATION_END]"
        # Split into paragraphs (§ n.) when present; the capturing group
        # makes re.split return [lead-in, marker, body, marker, body, ...].
        paragraphs = re.split(r'(§\s*\d+\.)', art_text)
        if len(paragraphs) > 1:
            # Keep any text that precedes the first paragraph marker; it used
            # to be dropped silently.
            lead_in = paragraphs[0].strip()
            formatted_paragraphs = [lead_in] if lead_in else []
            for i in range(1, len(paragraphs), 2):
                para_num = paragraphs[i].strip()
                para_text = paragraphs[i+1].strip()
                formatted_paragraphs.append(f"{para_num} {para_text}")
            formatted = header + "\n" + "\n".join(formatted_paragraphs)
        else:
            formatted = f"{header} {art_text}"
        formatted_articles.append({"text": formatted})
        # Add question/answer style samples for the same article
        questions = [
            f"Zacytuj artykuł {art_number} Kodeksu pracy.",
            f"Co mówi artykuł {art_number} Kodeksu pracy?",
            f"Podaj treść artykułu {art_number} Kodeksu pracy."
        ]
        for question in questions:
            formatted_articles.append({"text": f"{question}\n{formatted}"})
    return formatted_articles
 def main():
    """Fine-tune ``MODEL_NAME`` on samples built from ``TEXT_FILE_PATH``.

    Loads the tokenizer (adding ``SPECIAL_TOKENS``), builds and tokenizes the
    dataset, trains with ``Trainer`` and saves the model and tokenizer to
    ``./trained_model/gpt``.
    """
    # Initialise the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
    # The end-of-sequence token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    # Prepare the data
    data = prepare_dataset_from_file(TEXT_FILE_PATH)
    dataset = Dataset.from_dict({"text": [d["text"] for d in data]})
    # Tokenization
    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=1024,  # Raised to fit longer articles
            return_tensors="pt"
        )
        # Labels start as a copy of the input ids.
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    # Model and data collator
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    # Resize the embedding matrix to the tokenizer size after adding tokens.
    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=32,  # Increased number of epochs
        per_device_train_batch_size=2,
        learning_rate=1e-5, # learning rate
        logging_steps=10,
        weight_decay=0.01,
        report_to="none",
        save_strategy="no",
        # NOTE(review): no checkpoints are saved (save_strategy="no") and no
        # eval dataset is passed below, so there is presumably no "best"
        # model to load at the end; confirm this flag is intended.
        load_best_model_at_end=True,  # Load the best model at the end
    )
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )
    print("Rozpoczęcie treningu...")
    trainer.train()
    # Persist the fine-tuned model and tokenizer side by side.
    trainer.save_model("./trained_model/gpt")
    tokenizer.save_pretrained("./trained_model/gpt")
 if __name__ == "__main__":
    main()
--- a/herbert.py
+++ b/herbert.py
@ -0,0 +1,119 @@
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import torch
 import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from datasets import Dataset
 from peft import LoraConfig, get_peft_model
 from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 # 1️⃣ Inicjalizacja modelu do embeddingów
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # 2️⃣ Dodanie dokumentów i embeddingów
 def read_documents_from_file(file_path):
    """Return the article blocks found in a UTF-8 text file.

    The text is split on blank lines; a block is kept (stripped) only when
    it starts with ``Art.`` after stripping.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    stripped_blocks = (block.strip() for block in raw_text.split('\n\n'))
    return [block for block in stripped_blocks if block.startswith('Art.')]
 #documents = [
 #    "Jak założyć firmę w Polsce?", 
 #    "Jak rozliczyć podatek VAT?", 
 #    "Procedura składania reklamacji w e-sklepie.",
 #    "Jakie dokumenty są potrzebne do rejestracji działalności?"
 #]
 file_path = './docs/kodekspracy.txt'  # Zmień na właściwą ścieżkę
 documents = read_documents_from_file(file_path)
 embeddings = embed_model.encode(documents)
 # 3️⃣ Inicjalizacja FAISS i dodanie wektorów
 dim = embeddings.shape[1]
 index = faiss.IndexFlatL2(dim)
 index.add(np.array(embeddings, dtype=np.float32))
 # 4️⃣ Przygotowanie danych treningowych
 def create_training_data():
    """Build a ``Dataset`` pairing each document with its embedding.

    Reads the module-level ``documents`` and ``embeddings``.
    """
    columns = {"text": documents, "embedding": embeddings.tolist()}
    return Dataset.from_dict(columns)
 dataset = create_training_data()
 # Podział danych na treningowe i ewaluacyjne
 split_dataset = dataset.train_test_split(test_size=0.25)
 train_dataset = split_dataset["train"]
 eval_dataset = split_dataset["test"]
 # 5️⃣ Load the base causal LM (Lajonbot/vicuna-7b-v1.5-PL-lora_unload, not Gemma 2B)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Lajonbot/vicuna-7b-v1.5-PL-lora_unload"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 # 6️⃣ Konfiguracja LoRA
 lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
 )
 model = get_peft_model(model, lora_config)
 # 7️⃣ Tokenizacja danych
 max_length = 384
 def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating and padding to ``max_length``."""
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    return encoded
 tokenized_train = train_dataset.map(tokenize_function, batched=True)
 tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
 # 8️⃣ Parametry treningu
 training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",          # Ewaluacja co określoną liczbę kroków
    eval_steps=500,                 # Ewaluacja co 500 kroków
    save_strategy="steps",          # Zapis modelu co określoną liczbę kroków
    save_steps=500,                 # Zapis modelu co 500 kroków
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=16,
    weight_decay=0.01,
    load_best_model_at_end=True,    # Wczytaj najlepszy model na końcu
    metric_for_best_model="loss",   # Kryterium wyboru najlepszego modelu
    greater_is_better=False,        # Niższy loss = lepszy model
 )
 # 9️⃣ Data Collator
 data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
 )
 # 🔟 Trening modelu
 trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,    # Dodany zestaw ewaluacyjny
    data_collator=data_collator,
 )
 trainer.train()
 # 1️⃣1️⃣ Zapis modelu
 model.save_pretrained("./models/herbert")
 tokenizer.save_pretrained("./models/herbert")
 print("✅ Model został wytrenowany i zapisany!")
--- a/hft.py
+++ b/hft.py
@ -0,0 +1,261 @@
 import os
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 from datasets import Dataset
 import re
 import json
 import PyPDF2
 import docx2txt
 import pytesseract
 from PIL import Image
 from collections import defaultdict
 from huggingface_hub import login
 # Configuration
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # SECURITY: the Hugging Face access token used to be hard-coded on this line.
 # A token committed to source control must be treated as leaked: revoke it
 # and supply a fresh one through the HF_TOKEN environment variable instead.
 _hf_token = os.environ.get("HF_TOKEN")
 if _hf_token:
    login(token=_hf_token)
 class SourceMapper:
    """Two-way mapping between source labels and integer indices.

    Indices are assigned sequentially (0, 1, 2, ...) in first-seen order.
    """

    def __init__(self):
        # Plain dicts kept in lock-step; a lookup never creates an entry in
        # one map without the other.
        self.source_to_idx = {}
        self.idx_to_source = {}

    def add_source(self, source):
        """Register ``source`` if it is non-empty and not yet known."""
        if source and source not in self.source_to_idx:
            idx = len(self.source_to_idx)
            self.source_to_idx[source] = idx
            self.idx_to_source[idx] = source

    def get_idx(self, source):
        """Return the index for ``source``; ``-1`` for an empty source.

        An unseen source is registered first, so the returned index can
        always be resolved again with ``get_source``.
        """
        if not source:
            return -1
        self.add_source(source)
        return self.source_to_idx[source]

    def get_source(self, idx):
        """Return the source registered under ``idx``, or ``"Unknown"``."""
        return self.idx_to_source.get(idx, "Unknown")
 def load_file_catalog(catalog_path):
    """Load the JSON file catalog from ``catalog_path``.

    Any failure while opening or parsing the file is reported on stdout and
    an empty dict is returned instead.
    """
    catalog = {}
    try:
        with open(catalog_path, 'r', encoding='utf-8') as handle:
            catalog = json.load(handle)
    except Exception as err:
        print(f"Błąd wczytywania katalogu plików: {err}")
    return catalog
 def identify_legal_document(filename, file_catalog):
    """Map a file name to its catalog title.

    The lookup key is the file name without its extension, lower-cased;
    files missing from the catalog are labelled ``"Opracowanie własne"``.
    """
    stem, _extension = os.path.splitext(filename)
    return file_catalog.get(stem.lower(), "Opracowanie własne")
 def extract_text_from_file(file_path):
    """Extract plain text from a file, dispatching on its extension.

    Text/Markdown files are read as UTF-8, PDFs page by page via PyPDF2,
    Word documents via docx2txt and images via pytesseract. Unsupported
    extensions and any extraction error are reported on stdout and produce
    an empty string; a PDF failure returns the text gathered so far.
    """
    try:
        suffix = os.path.splitext(file_path)[1].lower()
        if suffix in ('.txt', '.md'):
            with open(file_path, 'r', encoding='utf-8') as handle:
                return handle.read()
        if suffix == '.pdf':
            page_texts = []
            try:
                with open(file_path, 'rb') as handle:
                    for page in PyPDF2.PdfReader(handle).pages:
                        page_texts.append(page.extract_text() or "")
            except Exception as pdf_error:
                print(f"Błąd PDF: {pdf_error}")
            return "".join(page_texts)
        if suffix in ('.doc', '.docx'):
            return docx2txt.process(file_path)
        if suffix in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff'):
            return pytesseract.image_to_string(Image.open(file_path))
        print(f"Nieobsługiwany format pliku: {suffix}")
        return ""
    except Exception as error:
        print(f"Błąd ekstrakcji tekstu: {error}")
        return ""
 def prepare_dataset(directory, catalog_path, source_mapper):
    """Walk ``directory`` and turn every readable file into training samples.

    Files listed in the catalog are split into articles; each article with at
    least 50 characters of content becomes one sample whose ``source_idx``
    comes from ``source_mapper``. All other files are whitespace-normalised
    and cut into 512-character chunks with ``source_idx`` -1. Progress and
    diagnostics are printed along the way.

    Args:
        directory: Root directory scanned recursively.
        catalog_path: Path to the JSON catalog mapping file stems to titles.
        source_mapper: Mapper that registers ``"<title>, <article>"`` labels.

    Returns:
        A list of ``{"text": str, "source_idx": int}`` dicts.
    """
    file_catalog = load_file_catalog(catalog_path)
    data = []
    print(f"\n{'='*50}\nDIAGNOSTYKA DANYCH\n{'='*50}")
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"\nPrzetwarzanie pliku: {file_path}")
            try:
                text = extract_text_from_file(file_path)
                if not text.strip():
                    print("Pominięto - brak tekstu")
                    continue
                print(f"Długość tekstu: {len(text)} znaków")
                doc_type = identify_legal_document(file, file_catalog)
                print(f"Rozpoznany typ dokumentu: {doc_type}")
                if doc_type != "Opracowanie własne":
                    # With one capturing group re.split always returns
                    # [preamble, heading, body, heading, body, ...]. Pair by
                    # position instead of filtering empty items first:
                    # filtering shifted every pair whenever the document had
                    # text before its first article.
                    parts = re.split(r'(?i)(Art[\.\s]+\d+[\.\s]?)', text)
                    fragment_count = sum(1 for part in parts if part.strip())
                    print(f"Znaleziono {fragment_count} fragmentów")
                    for heading, body in zip(parts[1::2], parts[2::2]):
                        article_number = heading.strip()
                        article_content = body.strip()
                        # Very short bodies are skipped.
                        if len(article_content) < 50:
                            continue
                        source = f"{doc_type}, {article_number}"
                        source_mapper.add_source(source)
                        data.append({
                            "text": f"{article_number} {article_content}",
                            "source_idx": source_mapper.get_idx(source)
                        })
                else:
                    clean_text = re.sub(r'\s+', ' ', text).strip()
                    chunks = [clean_text[i:i+512] for i in range(0, len(clean_text), 512)]
                    chunks = [c for c in chunks if c.strip()]
                    for chunk in chunks:
                        data.append({
                            "text": chunk,
                            "source_idx": -1
                        })
                    print(f"Dodano {len(chunks)} chunków")
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"Błąd podczas przetwarzania pliku: {str(e)}")
                continue
    print(f"\nPodsumowanie przygotowania danych:")
    print(f"Łączna liczba przykładów: {len(data)}")
    if data:
        print("Przykładowy wpis:")
        print(json.dumps(data[0], indent=2, ensure_ascii=False))
    else:
        print("BRAK DANYCH - sprawdź diagnostykę powyżej")
    return data
 class CustomModel(nn.Module):
    """Causal LM wrapper that adds a learned per-source embedding to the input.

    All base-model parameters are frozen except the output embeddings; the
    source embedding table is trained from scratch.
    """

    def __init__(self, model_name, config):
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
        # padding_idx=-1 designates the LAST row of the table as the padding
        # row; forward() routes unknown sources to that row.
        self.source_embedding = nn.Embedding(10000, config.hidden_size, padding_idx=-1)
        for param in self.base_model.parameters():
            param.requires_grad = False
        for param in self.base_model.get_output_embeddings().parameters():
            param.requires_grad = True

    def forward(self, input_ids=None, attention_mask=None, labels=None, source_idx=None, **kwargs):
        """Run the base model, adding the source embedding when one is given.

        ``source_idx`` holds one index per example; negative values mean
        "no known source".
        """
        if source_idx is not None:
            pad_row = self.source_embedding.padding_idx
            # Real sources occupy rows 0..pad_row-1. Negative (unknown)
            # indices go to the padding row; clamping them to 0 would hand
            # them the embedding of the first real source.
            lookup = torch.clamp(source_idx, max=pad_row - 1)
            lookup = torch.where(source_idx < 0, torch.full_like(source_idx, pad_row), lookup)
            # unsqueeze(1) lets the per-example vector broadcast over tokens.
            source_embeds = self.source_embedding(lookup).unsqueeze(1)
            inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
            return self.base_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                labels=labels,
                **kwargs
            )
        return self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

    def generate(self, *args, **kwargs):
        """Delegate generation to the wrapped base model."""
        return self.base_model.generate(*args, **kwargs)
 class CustomDataCollator(DataCollatorForLanguageModeling):
    """Collator that stacks pre-tokenized examples and forwards ``source_idx``."""

    def torch_call(self, examples):
        """Stack the per-example fields into batch tensors.

        Every example must provide ``input_ids``, ``attention_mask`` and
        ``labels`` of equal length; ``source_idx`` is added when present.
        """
        # Stack the basic fields
        input_ids = torch.stack([torch.tensor(ex["input_ids"]) for ex in examples])
        attention_mask = torch.stack([torch.tensor(ex["attention_mask"]) for ex in examples])
        labels = torch.stack([torch.tensor(ex["labels"]) for ex in examples])
        # Exclude padding positions from the loss (-100 is the ignore index).
        # This override bypasses the base collator's own padding mask, and the
        # attention mask is used because the pad token may equal EOS.
        labels = labels.masked_fill(attention_mask == 0, -100)
        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }
        # Add source_idx when the examples carry it
        if "source_idx" in examples[0]:
            source_idx = torch.stack([torch.tensor(ex["source_idx"]) for ex in examples])
            batch["source_idx"] = source_idx
        return batch
 def main():
    """Prepare the dataset from ``docs`` and fine-tune the custom model.

    Builds samples with ``prepare_dataset``, tokenizes them to a fixed
    length of 512, wraps ``crumb/nano-mistral`` in ``CustomModel`` and trains
    it with ``Trainer``. Returns early when no data was found.
    """
    # Local import: AutoConfig is not among the module-level imports.
    from transformers import AutoConfig
    source_mapper = SourceMapper()
    model_name = "crumb/nano-mistral"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # Prepare the data
    catalog_path = "catalog.json"
    data = prepare_dataset("docs", catalog_path, source_mapper)
    if not data:
        print("\nBrak danych do treningu!")
        return
    dataset = Dataset.from_dict({k: [d[k] for d in data] for k in data[0]})
    def tokenize_function(examples):
        # Return plain per-example lists. The previous tensor + squeeze()
        # version collapsed the batch dimension whenever a map batch held a
        # single example, giving columns of mismatched length.
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": [list(ids) for ids in tokenized["input_ids"]],
            "source_idx": examples["source_idx"]
        }
    tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)
    # Load only the configuration here; CustomModel loads the weights itself,
    # so instantiating a second full model just for .config was wasted work.
    model = CustomModel(model_name, AutoConfig.from_pretrained(model_name))
    model.source_mapper = source_mapper
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=10,
        save_strategy="steps",
        save_steps=1000,
        report_to="none",
        # Keep source_idx in the batches handed to the collator.
        remove_unused_columns=False
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=False)
    )
    print("\nRozpoczęcie treningu...")
    trainer.train()
 if __name__ == "__main__":
    main()
--- a/manual.py
+++ b/manual.py
@ -1,159 +0,0 @@
 import os
 import weaviate
 from weaviate.connect import ConnectionParams
 from weaviate.collections import Collection
 from weaviate.classes.config import Configure, Property, DataType
 from weaviate.collections.classes.filters import Filter
 import pytesseract
 from PIL import Image
 from docx import Document
 from pypdf import PdfReader
 import textract
 import hashlib
 # Configuration
 # REPO_PATH is the directory tree walked by load_all_documents().
 REPO_PATH = "/home/ably.do/docs"
 # NOTE(review): WEAVIATE_URL is not referenced anywhere in the visible code; the
 # client below is configured from explicit host/port values instead.
 WEAVIATE_URL = "http://weaviate:8080"
 # Weaviate client talking to host "weaviate" over unencrypted HTTP (8080) and gRPC (50051).
 # The connection is not opened here; client.connect() is called in the __main__ section.
 client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
 )
 def read_text_file(file_path):
    """Return the entire contents of the file at file_path, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
 def read_docx(file_path):
    """Return the text of every paragraph of a .docx file, joined with single spaces."""
    paragraphs = Document(file_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)
 def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, joined with single spaces."""
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text())
    return ' '.join(page_texts)
 def read_image(file_path):
    """Open the image at file_path and return the text pytesseract recognizes in it."""
    image = Image.open(file_path)
    return pytesseract.image_to_string(image)
 def read_file(file_path):
    """Extract text from a file, choosing the reader by its lower-cased extension.

    Returns the extracted text, or None when the extension is not one of the
    handled types.
    """
    extension = os.path.splitext(file_path.lower())[1]
    if extension in ('.txt', '.md'):
        return read_text_file(file_path)
    if extension == '.docx':
        return read_docx(file_path)
    if extension == '.pdf':
        return read_pdf(file_path)
    if extension in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if extension in ('.doc', '.rtf'):
        # textract returns bytes; decode them as UTF-8 text
        raw = textract.process(file_path)
        return raw.decode('utf-8')
    return None
 def generate_content_hash(content):
    """Return the hexadecimal SHA-256 digest of content encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
 def add_to_weaviate(file_name, content, content_hash):
    """Insert a document into the "Document" collection unless its file name is already stored.

    Any exception raised along the way is caught and reported with print();
    nothing is returned.
    """
    try:
        documents = client.collections.get("Document")
        # Look for objects already stored under the same fileName
        same_name = Filter.by_property("fileName").equal(file_name)
        found = documents.query.fetch_objects(filters=same_name)
        if not found.objects:
            # Nothing stored under this name yet: insert a new object
            new_properties = {
                "fileName": file_name,
                "content": content,
                "contentHash": content_hash,
                "contentType": "publication"
            }
            documents.data.insert(properties=new_properties)
            print(f"Dodano dokument {file_name} do Weaviate.")
        else:
            print(f"Dokument {file_name} już istnieje w bazie.")
    except Exception as e:
        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
 def process_file(file_path):
    """Read one file and hand its text to add_to_weaviate().

    Prints a message and returns when the path does not exist, when no text
    could be read, or when reading/adding raises an exception.
    """
    if not os.path.exists(file_path):
        print(f"Plik nie istnieje: {file_path}")
        return
    try:
        content = read_file(file_path)
        if not content:
            print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
        else:
            add_to_weaviate(
                os.path.basename(file_path),
                content,
                generate_content_hash(content),
            )
    except Exception as e:
        print(f"Błąd podczas przetwarzania pliku {file_path}: {str(e)}")
 def load_all_documents():
    """Walk REPO_PATH recursively and pass every file found to process_file()."""
    print("Wczytywanie wszystkich dokumentów z katalogu...")
    for directory, _subdirs, file_names in os.walk(REPO_PATH):
        full_paths = (os.path.join(directory, file_name) for file_name in file_names)
        for full_path in full_paths:
            process_file(full_path)
    print("Zakończono wczytywanie dokumentów.")
 if __name__ == "__main__":
    # Rebuild the "Document" collection and load every file under REPO_PATH into it.
    # Defined before the try block so the error message below can always use it.
    collection_name = "Document"
    client.connect()
    try:
        # NOTE: an existing collection is always dropped here. The log text mentions
        # CLEAR_COLLECTION=true, but no such setting is read anywhere in this script.
        if client.collections.exists(collection_name):
            print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
            client.collections.delete(collection_name)
            print(f"Kolekcja '{collection_name}' została usunięta.")
        else:
            print(f"Kolekcja '{collection_name}' nie istnieje.")
        # Create the collection again if it was deleted or never existed
        if not client.collections.exists(collection_name):
            print(f"Tworzenie nowej kolekcji '{collection_name}'...")
            client.collections.create(
                name=collection_name,
                properties=[
                    Property(name="content", data_type=DataType.TEXT),
                    Property(name="fileName", data_type=DataType.TEXT),
                    Property(name="contentHash", data_type=DataType.TEXT),
                    Property(name="contentType", data_type=DataType.TEXT)
                ],
                vectorizer_config=Configure.Vectorizer.text2vec_transformers()
            )
            print(f"Kolekcja '{collection_name}' została utworzona.")
            # Load the documents into the freshly created collection
            print("Wczytywanie dokumentów do nowej kolekcji...")
            load_all_documents()
            print("Wszystkie dokumenty zostały wgrane.")
        else:
            # Only reachable if the collection still exists after the delete above.
            print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")
            # Load the documents only when the existing collection is empty
            collection = client.collections.get(collection_name)
            if collection.aggregate.over_all(total_count=True).total_count == 0:
                print("Kolekcja jest pusta. Wczytywanie dokumentów...")
                load_all_documents()
                print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")
    except Exception as e:
        print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
    finally:
        # Release the connection even when the load is interrupted (e.g. Ctrl+C),
        # which "except Exception" above does not catch.
        client.close()
--- a/monitoring.py
+++ b/monitoring.py
@ -1,227 +0,0 @@
 import os
 import time
 import subprocess
 import threading
 import weaviate
 from weaviate.connect import ConnectionParams
 from weaviate.collections import Collection
 from weaviate.classes.config import Configure, Property, DataType
 from weaviate.collections.classes.filters import Filter
 import pytesseract
 from PIL import Image
 from docx import Document
 from pypdf import PdfReader
 import textract
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 #from flask import Flask, request, jsonify, cli
 from fastapi import FastAPI, Request, HTTPException
 import uvicorn
 import hmac
 import hashlib
 # Configuration
 # REPO_PATH is both the watched directory and the working directory for "git pull".
 REPO_PATH = "/home/ably.do/docs"
 # Shared secret used to verify the HMAC-SHA256 signature of incoming webhooks.
 # NOTE(review): this looks like a placeholder committed in source; presumably it
 # must be replaced per deployment — confirm.
 WEBHOOK_SECRET = "twoj_tajny_klucz"
 # Port the uvicorn server listens on.
 WEBHOOK_PORT = 5000
 # NOTE(review): WEAVIATE_URL is not referenced anywhere in the visible code; the
 # client below is configured from explicit host/port values instead.
 WEAVIATE_URL = "http://weaviate:8080"
 app = FastAPI()
 # Weaviate client talking to host "weaviate" over unencrypted HTTP (8080) and gRPC (50051).
 # The connection is not opened here; client.connect() is called in the __main__ section.
 client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
 )
 def read_text_file(file_path):
    """Return the entire contents of the file at file_path, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
 def read_docx(file_path):
    """Return the text of every paragraph of a .docx file, joined with single spaces."""
    paragraphs = Document(file_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)
 def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, joined with single spaces."""
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text())
    return ' '.join(page_texts)
 def read_image(file_path):
    """Open the image at file_path and return the text pytesseract recognizes in it."""
    image = Image.open(file_path)
    return pytesseract.image_to_string(image)
 def read_file(file_path):
    """Extract text from a file, choosing the reader by its lower-cased extension.

    Returns the extracted text, or None when the extension is not one of the
    handled types.
    """
    extension = os.path.splitext(file_path.lower())[1]
    if extension in ('.txt', '.md'):
        return read_text_file(file_path)
    if extension == '.docx':
        return read_docx(file_path)
    if extension == '.pdf':
        return read_pdf(file_path)
    if extension in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if extension in ('.doc', '.rtf'):
        # textract returns bytes; decode them as UTF-8 text
        raw = textract.process(file_path)
        return raw.decode('utf-8')
    return None
 def generate_content_hash(content):
    """Return the hexadecimal SHA-256 digest of content encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
 def add_to_weaviate(file_name, content, content_hash):
    """Insert a document into the "Document" collection unless its file name is already stored.

    Any exception raised along the way is caught and reported with print();
    nothing is returned.
    """
    try:
        documents = client.collections.get("Document")
        # Look for objects already stored under the same fileName
        same_name = Filter.by_property("fileName").equal(file_name)
        found = documents.query.fetch_objects(filters=same_name)
        if not found.objects:
            # Nothing stored under this name yet: insert a new object
            new_properties = {
                "fileName": file_name,
                "content": content,
                "contentHash": content_hash,
                "contentType": "publication"
            }
            documents.data.insert(properties=new_properties)
            print(f"Dodano dokument {file_name} do Weaviate.")
        else:
            print(f"Dokument {file_name} już istnieje w bazie.")
    except Exception as e:
        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
 def process_file(file_path):
    """Read one file and hand its text to add_to_weaviate().

    Prints a message and returns when the path does not exist, when no text
    could be read, or when reading/adding raises an exception.
    """
    if not os.path.exists(file_path):
        print(f"Plik nie istnieje: {file_path}")
        return
    try:
        content = read_file(file_path)
        if not content:
            print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
        else:
            add_to_weaviate(
                os.path.basename(file_path),
                content,
                generate_content_hash(content),
            )
    except Exception as e:
        print(f"Błąd podczas przetwarzania pliku {file_path}: {str(e)}")
 class RepoHandler(FileSystemEventHandler):
    """Watchdog handler that pulls the repository and indexes the file an event refers to."""
    # Event types produced by merely reading a file. process_file() opens files
    # itself, so reacting to these would make the handler re-trigger itself.
    _READ_ONLY_EVENTS = ("opened", "closed_no_write")
    def on_any_event(self, event):
        """Pull the repository and process event.src_path for content-changing file events.

        Directory events, read-only events and events inside a ".git" directory
        are ignored.
        """
        if event.is_directory:
            return
        if event.event_type in self._READ_ONLY_EVENTS:
            return
        # "git pull" rewrites files under .git inside the watched tree; handling
        # those events would start another pull and loop indefinitely.
        if ".git" in os.fsdecode(event.src_path).split(os.sep):
            return
        print(f"Wykryto zmianę: {event.src_path}")
        self.pull_changes()
        process_file(event.src_path)
    def pull_changes(self):
        """Run "git pull" in REPO_PATH; a non-zero exit status is printed, not raised."""
        try:
            subprocess.run(["git", "pull"], check=True, cwd=REPO_PATH)
            print("Zmiany pobrane z Gitea")
        except subprocess.CalledProcessError as e:
            print(f"Błąd podczas pobierania zmian: {e}")
 def start_file_monitor():
    """Watch REPO_PATH recursively with a RepoHandler and block until interrupted."""
    print("Rozpoczeto monitoring folderu")
    handler = RepoHandler()
    watcher = Observer()
    watcher.schedule(handler, REPO_PATH, recursive=True)
    watcher.start()
    try:
        # Keep this thread alive; the observer does its work on its own thread
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
@app.post("/webhook")
 async def webhook(request: Request):
    signature = request.headers.get("X-Gitea-Signature")
    if not signature:
        raise HTTPException(status_code=400, detail="No signature")
    payload = await request.body()
    computed_signature = hmac.new(WEBHOOK_SECRET.encode(), payload, hashlib.sha256).hexdigest()
    if hmac.compare_digest(signature, computed_signature):
        print("Otrzymano ważny webhook z Gitea")
        RepoHandler().pull_changes()
        for root, dirs, files in os.walk(REPO_PATH):
            for file in files:
                process_file(os.path.join(root, file))
        return {"message": "Zmiany pobrane i przetworzone pomyślnie"}
    else:
        raise HTTPException(status_code=401, detail="Invalid signature")
 def load_all_documents():
    """Walk REPO_PATH recursively and pass every file found to process_file()."""
    print("Wczytywanie wszystkich dokumentów z katalogu...")
    for directory, _subdirs, file_names in os.walk(REPO_PATH):
        full_paths = (os.path.join(directory, file_name) for file_name in file_names)
        for full_path in full_paths:
            process_file(full_path)
    print("Zakończono wczytywanie dokumentów.")
 if __name__ == "__main__":
    # Rebuild the "Document" collection, load every file under REPO_PATH, then
    # start the file monitor thread and the webhook server.
    # Defined before the try block so the error message below can always use it.
    collection_name = "Document"
    client.connect()
    try:
        # NOTE: an existing collection is always dropped here. The log text mentions
        # CLEAR_COLLECTION=true, but no such setting is read anywhere in this script.
        if client.collections.exists(collection_name):
            print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
            client.collections.delete(collection_name)
            print(f"Kolekcja '{collection_name}' została usunięta.")
        else:
            print(f"Kolekcja '{collection_name}' nie istnieje.")
        # Create the collection again if it was deleted or never existed
        if not client.collections.exists(collection_name):
            print(f"Tworzenie nowej kolekcji '{collection_name}'...")
            client.collections.create(
                name=collection_name,
                properties=[
                    Property(name="content", data_type=DataType.TEXT),
                    Property(name="fileName", data_type=DataType.TEXT),
                    Property(name="contentHash", data_type=DataType.TEXT),
                    Property(name="contentType", data_type=DataType.TEXT)
                ],
                vectorizer_config=Configure.Vectorizer.text2vec_transformers()
            )
            print(f"Kolekcja '{collection_name}' została utworzona.")
            # Load the documents into the freshly created collection
            print("Wczytywanie dokumentów do nowej kolekcji...")
            load_all_documents()
            print("Wszystkie dokumenty zostały wgrane.")
        else:
            # Only reachable if the collection still exists after the delete above.
            print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")
            # Load the documents only when the existing collection is empty
            collection = client.collections.get(collection_name)
            if collection.aggregate.over_all(total_count=True).total_count == 0:
                print("Kolekcja jest pusta. Wczytywanie dokumentów...")
                load_all_documents()
                print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")
    except Exception as e:
        print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
    print(client.collections.list_all())
    # Run the file monitor in a separate thread. It is a daemon thread because
    # start_file_monitor() loops forever (KeyboardInterrupt is only delivered to
    # the main thread), and a non-daemon thread would keep the process alive
    # after the web server has stopped.
    monitor_thread = threading.Thread(target=start_file_monitor, daemon=True)
    monitor_thread.start()
    # Run the FastAPI webhook app under uvicorn; this blocks until the server stops
    try:
        uvicorn.run(app, host="0.0.0.0", port=WEBHOOK_PORT)
    finally:
        client.close()
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,35 @@
  # Przeczytaj uważnie przed uruchomieniem tego repo 📝  
  To jest biblia szkolenia modeli obsługiwanych przez Ably.do 
  ## Konfiguracja Git 🔥  
  **git config --global credential.helper store** \
  Przejdź do folderu, w którym będziesz przechowywał lokalne repo. (np. **cd /home**) \
  Pobierz repo: \
  **git clone https://repo.pokash.pl/POKASH.PL/ably.do.git** \
  Za pierwszym razem zostaniesz poproszony o zalogowanie się.
  ## Konfiguracja Hugging Face Transformers  🚀  
  **huggingface-cli login** \
  Po wyświetleniu prośby wklej swój osobisty token dostępu Hugging Face (nie zapisuj tokenu w repozytorium).
  **W przypadku niektórych obrazów modeli musisz przejść przez akceptację licencji**
  ## Trenowanie modelu 🔥  
  Przejdź do folderu, w którym będziesz pobierał wiedzę z repo. (np. /home). \
  Pobierz najnowsze zmiany (**git pull**) \
  Uruchom skrypt Python, który rozpocznie trenowanie modelu: \
  **python3 hft.py**
  ## Wdrażanie modelu ✨  
  Po wytrenowaniu modelu, 
  musisz przekonwertować go do formatu GGUF, który obsługuje Ollama. \
  Konwersja do GGUF
  1.	Skorzystaj z narzędzia dostarczonego przez Ollama do konwersji: \
  **ollama convert your_model.bin --output your_model.gguf** \
  2.	Umieść przekonwertowany model w katalogu Ollama: \
  **mv your_model.gguf ~/.ollama/models/**
  Uruchomienie dostrojonego modelu \
  Użyj nazwy swojego modelu w poleceniu: \
  **ollama run your_model** *"Jakie są wymagania dotyczące ochrony słuchu?"*
--- a/requirements.txt
+++ b/requirements.txt
@ -1,10 +1,8 @@
-watchdog 
+torch>=2.0.1
-Flask 
+transformers>=4.30.2
-weaviate-client
+datasets>=2.13.1
-python-docx 
+Pillow>=9.4.0
-pytesseract 
+pytesseract>=0.3.10
-textract 
+python-docx>=0.8.11
-pillow 
+PyPDF2>=3.0.1
-pypdf
+huggingface-hub>=0.16.4
 uvicorn
 FastAPI
--- a/test.py
+++ b/test.py
@ -0,0 +1,22 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 # Load the model and tokenizer saved under ./trained_model/gpt.
 model_path = "./trained_model/gpt"
 model = AutoModelForCausalLM.from_pretrained(model_path)
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 # Reuse the end-of-sequence token for padding, in both the tokenizer and the model config.
 tokenizer.pad_token = tokenizer.eos_token
 model.config.pad_token_id = tokenizer.eos_token_id
 def generate_response(prompt, max_length=1000):
    """Generate a completion for prompt with the module-level model and tokenizer.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_length: Value passed to model.generate() as max_length. It was
            previously ignored in favour of a hard-coded 100.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        max_length=max_length
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 # Smoke test: send one fixed Polish prompt to the model and print the generated text.
 prompt = "Zacytuj paragraf pierwszy artykułu 154 Kodeksu pracy."
 response = generate_response(prompt)
 print(response)
Author	SHA1	Message	Date
l.gabrysiak	61fbc79211	mod herbert	2025-03-01 11:35:22 +01:00
l.gabrysiak	d6e1f45686	mod herbert	2025-03-01 11:32:47 +01:00
l.gabrysiak	9bbe7188ca	mod herbert	2025-03-01 11:29:07 +01:00
l.gabrysiak	3d477870ad	mod herbert	2025-03-01 09:47:44 +01:00
l.gabrysiak	a5e5401548	mod herbert	2025-03-01 00:26:32 +01:00
l.gabrysiak	4f486f021b	dodano model herbert	2025-03-01 00:23:47 +01:00
l.gabrysiak	d9541a9a28	mod allegro	2025-02-28 23:04:03 +01:00
l.gabrysiak	e3a94fa5ae	mod allegro	2025-02-28 22:41:35 +01:00
l.gabrysiak	cd5fab2206	mod allegro	2025-02-28 22:40:07 +01:00
l.gabrysiak	a47fc31bda	Merge branch 'master' of https://repo.pokash.pl/POKASH.PL/ably.do	2025-02-28 22:27:25 +01:00
l.gabrysiak	2bc3384235	mod allegro	2025-02-28 22:25:30 +01:00
Karol	049b4703a8	Przeniesiono z folderu GŁÓWNEGO do DOCS	2025-02-27 19:43:35 +01:00
Karol	4315cef3c7	Merge branch 'master' of https://repo.pokash.pl/POKASH.PL/ably.do	2025-02-27 14:13:53 +01:00
Karol	9f367c2fa4	-Rozporządzenie Ministra Gospodarki i Pracy z dnia 27 lipca 2004 r. w sprawie szkolenia w dziedzinie bezpieczeństwa i higieny pracy. -Rozporządzenie ministra pracy i polityki społecznej z dnia 26.09.1997r. w sprawie ogólnych przepisów BHP.pdf - Rozporządzenie Ministra Rodziny i Polityki Społecznej z dnia 4 listopada 2021 r. zmieniające rozporządzenie w sprawie ogólnych przepisów bezpieczeństwa i higieny pracy.pdf -Rozporządzenie z dnia 2 września 1997 r. w sprawie służby BHP.pdf - Ustawa z dnia 24 sierpnia 1991 r. o ochronie przeciwpożarowej.pdf -Ustawa z dnia 24 sierpnia 1991 r. o Państwowej Straży Pożarnej.pdf -Ustawa z dnia 30 października 2002 r. o ubezpieczeniu społecznym z tytułu wypadków przy pracy i chorób zawodowych.pdf	2025-02-26 14:43:40 +01:00