init

2025-02-27 13:10:09 +01:00 · 2025-02-27 13:10:09 +01:00 · 481cae2a61
parent c617cb3f95
commit 481cae2a61
2 changed files with 0 additions and 257 deletions
--- a/finding.py
+++ b/finding.py
@ -1,100 +0,0 @@
 import weaviate
 from weaviate.connect import ConnectionParams
 import re
 # Weaviate client configuration: talks to the host "weaviate" over plain
 # (non-TLS) HTTP on port 8080 and plain gRPC on port 50051.
 client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
 )
 # The connection is opened as soon as this module is executed.
 client.connect()
 # Fetch the "Document" collection handle used by the search functions below.
 collection = client.collections.get("Document")
 def extract_full_article(content, article_number):
    """Return the full text of one article from ``content``.

    An article starts at ``Art. <article_number>.`` and runs up to (but not
    including) the next ``Art. <digits>.`` header, or to the end of the text.

    Args:
        content: Text to search.
        article_number: Article identifier (string or int). It is matched
            literally, so regex metacharacters in it have no special meaning.

    Returns:
        The stripped article text, or ``None`` when no such article header
        is present.
    """
    # Escape the identifier so it cannot alter the pattern (or make it
    # invalid) when it contains characters such as "+", "(" or ".".
    number = re.escape(str(article_number))
    pattern = rf"Art\.\s*{number}\..*?(?=Art\.\s*\d+\.|\Z)"
    # DOTALL lets the lazy ".*?" span line breaks inside the article body.
    match = re.search(pattern, content, re.DOTALL)
    if match:
        return match.group(0).strip()
    return None
 def extract_relevant_fragment(content, query, context_size=100):
    """Pick the part of ``content`` that best illustrates ``query``.

    Resolution order:
      1. If the query starts with ``Art. <digits>`` and that article can be
         extracted from the content, the whole article is returned.
      2. Otherwise, if the query occurs in the content (case-insensitive),
         the occurrence is returned with up to ``context_size`` characters
         on each side, wrapped in ``...``.
      3. Otherwise the first 200 characters of the content followed by
         ``...`` are returned.
    """
    article_ref = re.match(r"Art\.\s*(\d+)", query)
    if article_ref is not None:
        article_text = extract_full_article(content, article_ref.group(1))
        if article_text:
            return article_text

    position = content.lower().find(query.lower())
    if position == -1:
        # Query text not present at all: fall back to the document head.
        return content[:200] + "..."

    left = max(0, position - context_size)
    right = min(len(content), position + len(query) + context_size)
    return "..." + content[left:right] + "..."
 def vector_search(query, limit=5):
    """Run a ``near_text`` query on the module-level collection and print hits.

    For every returned object the UUID, a fragment of its ``content``
    property relevant to ``query`` and its ``fileName`` property are printed.
    At most ``limit`` objects are requested.
    """
    print(f"\nWyszukiwanie wektorowe dla zapytania: '{query}'")
    results = collection.query.near_text(query=query, limit=limit)
    for hit in results.objects:
        print(f"UUID: {hit.uuid}")
        fragment = extract_relevant_fragment(hit.properties['content'], query)
        print(f"Relewantny fragment:\n{fragment}")
        print(f"Nazwa pliku: {hit.properties['fileName']}")
        print("---")
 def hybrid_search(query, limit=5, alpha=0.5):
    """Run a ``hybrid`` query on the module-level collection and print hits.

    ``alpha`` and ``limit`` are forwarded unchanged to the collection's
    ``hybrid`` query. For every returned object the UUID, a fragment of its
    ``content`` property relevant to ``query`` and its ``fileName`` property
    are printed.
    """
    print(f"\nWyszukiwanie hybrydowe dla zapytania: '{query}'")
    results = collection.query.hybrid(query=query, alpha=alpha, limit=limit)
    for hit in results.objects:
        print(f"UUID: {hit.uuid}")
        fragment = extract_relevant_fragment(hit.properties['content'], query)
        print(f"Relewantny fragment:\n{fragment}")
        print(f"Nazwa pliku: {hit.properties['fileName']}")
        print("---")
 #exists = client.collections.exists("Document")
 #print(f"Czy kolekcja 'Document' istnieje: {exists}")
 #schema = collection.config.get()
 #print(f"Nazwa kolekcji: {schema.name}")
 #print("Właściwości:")
 #for prop in schema.properties:
 #    print(f"- {prop.name}: {prop.data_type}")
 #collection = client.collections.get("Document")
 #count = collection.aggregate.over_all(total_count=True).total_count
 #print(f"Liczba obiektów w kolekcji: {count}")
 #results = collection.query.fetch_objects(limit=5)
 #for obj in results.objects:
 #    print(f"UUID: {obj.uuid}")
 #    print(f"Nazwa pliku: {obj.properties['fileName']}")
 #    print(f"Zawartość: {obj.properties['content'][:100]}...")  # Pierwsze 100 znaków
 #    print("---")
 # Example usage: run both search modes for a few sample queries.
 queries = ["Art. 154", "urlop wypoczynkowy", "Państwowa Inspekcja Pracy"]
 try:
    for query in queries:
        vector_search(query)
        hybrid_search(query)
 finally:
    # Close the connection even when one of the searches raises, so the
    # client is not left open on failure.
    client.close()
--- a/manual.py
+++ b/manual.py
@ -1,157 +0,0 @@
 import os
 import weaviate
 from weaviate.connect import ConnectionParams
 from weaviate.collections import Collection
 from weaviate.classes.config import Configure, Property, DataType
 from weaviate.collections.classes.filters import Filter
 import pytesseract
 from PIL import Image
 from docx import Document
 from pypdf import PdfReader
 import textract
 import hashlib
 # Configuration
 # Root directory that load_all_documents() walks recursively.
 REPO_PATH = "/home/ably.do/docs"
 # NOTE(review): not referenced anywhere in the visible part of this file; the
 # client below is configured from explicit host/port values instead.
 WEAVIATE_URL = "http://weaviate:8080"
 # Weaviate client: plain (non-TLS) HTTP on port 8080 and plain gRPC on port
 # 50051, both on the host "weaviate". The connection itself is opened later,
 # in the __main__ section.
 client = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8080,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    )
 )
 def read_text_file(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()
    return text
 def read_docx(file_path):
    """Return the text of all paragraphs of a .docx file, joined by spaces."""
    paragraphs = Document(file_path).paragraphs
    return ' '.join(item.text for item in paragraphs)
 def read_pdf(file_path):
    """Return the extracted text of every PDF page, joined by spaces."""
    pages = PdfReader(file_path).pages
    return ' '.join(page.extract_text() for page in pages)
 def read_image(file_path):
    """Return the text recognised by Tesseract OCR in the image at ``file_path``.

    The image is opened in a ``with`` block so the underlying file handle is
    released as soon as OCR finishes, instead of staying open until garbage
    collection.
    """
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
 def read_file(file_path):
    """Extract text from ``file_path`` using a reader chosen by extension.

    The extension is compared case-insensitively. Supported groups:
    plain text (.txt, .md), .docx, .pdf, images (.png, .jpg, .jpeg, .gif,
    .bmp; read through OCR) and .doc/.rtf (read through textract).

    Returns:
        The extracted text, or ``None`` for any other extension.
    """
    extension = os.path.splitext(file_path.lower())[1]
    if extension in ('.txt', '.md'):
        return read_text_file(file_path)
    if extension == '.docx':
        return read_docx(file_path)
    if extension == '.pdf':
        return read_pdf(file_path)
    if extension in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if extension in ('.doc', '.rtf'):
        raw = textract.process(file_path)
        return raw.decode('utf-8')
    return None
 def generate_content_hash(content):
    """Return the hex SHA-256 digest of ``content`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
 def add_to_weaviate(file_name, content, content_hash):
    """Insert a document into the "Document" collection unless one exists.

    Existence is decided by ``fileName`` alone: if any object with the same
    ``fileName`` is already stored, a message is printed and nothing is
    inserted. Any exception raised along the way is caught and reported with
    a printed message rather than propagated.
    """
    try:
        documents = client.collections.get("Document")
        # Look up objects already stored under this file name.
        same_name = Filter.by_property("fileName").equal(file_name)
        matches = documents.query.fetch_objects(filters=same_name)
        if not matches.objects:
            # Nothing stored under this name yet: insert the new document.
            record = {
                "fileName": file_name,
                "content": content,
                "contentHash": content_hash
            }
            documents.data.insert(properties=record)
            print(f"Dodano dokument {file_name} do Weaviate.")
        else:
            print(f"Dokument {file_name} już istnieje w bazie.")
    except Exception as e:
        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
 def process_file(file_path):
    """Read one file and store its text in Weaviate.

    Prints a message and does nothing else when the path does not exist or
    when no (non-empty) text could be read from it. Exceptions raised while
    reading or storing are caught and reported with a printed message.
    """
    if not os.path.exists(file_path):
        print(f"Plik nie istnieje: {file_path}")
        return
    try:
        content = read_file(file_path)
        if not content:
            # Covers both unsupported extensions (None) and empty text.
            print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
            return
        add_to_weaviate(
            os.path.basename(file_path),
            content,
            generate_content_hash(content),
        )
    except Exception as e:
        print(f"Błąd podczas przetwarzania pliku {file_path}: {e!s}")
 def load_all_documents():
    """Process every file found under ``REPO_PATH``, including subfolders."""
    print("Wczytywanie wszystkich dokumentów z katalogu...")
    # Lazy generator: each path is produced and processed one at a time, in
    # the order os.walk yields them.
    paths = (
        os.path.join(directory, file_name)
        for directory, _subdirs, file_names in os.walk(REPO_PATH)
        for file_name in file_names
    )
    for path in paths:
        process_file(path)
    print("Zakończono wczytywanie dokumentów.")
 if __name__ == "__main__":
    # Script entry point: (re)build the "Document" collection and load files.
    #
    # The deletion message below refers to CLEAR_COLLECTION=true, so the flag
    # is read from the environment here. It defaults to true, which keeps the
    # previous behaviour (always drop an existing collection) when the
    # variable is not set. Set CLEAR_COLLECTION to anything other than
    # 1/true/yes to keep an existing collection.
    clear_collection = os.environ.get("CLEAR_COLLECTION", "true").strip().lower() in ("1", "true", "yes")
    collection_name = "Document"
    client.connect()
    try:
        # Drop the existing collection only when asked to.
        collection_exists = client.collections.exists(collection_name)
        if collection_exists and clear_collection:
            print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
            client.collections.delete(collection_name)
            print(f"Kolekcja '{collection_name}' została usunięta.")
        elif not collection_exists:
            print(f"Kolekcja '{collection_name}' nie istnieje.")
        # Create the collection from scratch if it was removed or never existed.
        if not client.collections.exists(collection_name):
            print(f"Tworzenie nowej kolekcji '{collection_name}'...")
            client.collections.create(
                name=collection_name,
                properties=[
                    Property(name="content", data_type=DataType.TEXT),
                    Property(name="fileName", data_type=DataType.TEXT),
                    Property(name="contentHash", data_type=DataType.TEXT)
                ],
                vectorizer_config=Configure.Vectorizer.text2vec_transformers()
            )
            print(f"Kolekcja '{collection_name}' została utworzona.")
            # Load the documents into the freshly created collection.
            print("Wczytywanie dokumentów do nowej kolekcji...")
            load_all_documents()
            print("Wszystkie dokumenty zostały wgrane.")
        else:
            print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")
            # An existing but empty collection still gets the documents loaded.
            collection = client.collections.get(collection_name)
            if collection.aggregate.over_all(total_count=True).total_count == 0:
                print("Kolekcja jest pusta. Wczytywanie dokumentów...")
                load_all_documents()
                print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")
    except Exception as e:
        print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
    finally:
        # Always release the connection, including on interrupts that the
        # "except Exception" clause above does not catch.
        client.close()