init

2025-02-27 13:10:09 +01:00 · 2025-02-27 13:10:09 +01:00 · 481cae2a61
parent c617cb3f95
commit 481cae2a61
2 changed files with 0 additions and 257 deletions
--- a/finding.py
+++ b/finding.py
@@ -1,100 +0,0 @@
-import weaviate
-from weaviate.connect import ConnectionParams
-import re
-
-# Konfiguracja klienta Weaviate
-client = weaviate.WeaviateClient(
-    connection_params=ConnectionParams.from_params(
-        http_host="weaviate",
-        http_port=8080,
-        http_secure=False,
-        grpc_host="weaviate",
-        grpc_port=50051,
-        grpc_secure=False,
-    )
-)
-client.connect()
-
-# Pobierz kolekcję
-collection = client.collections.get("Document")
-
-def extract_full_article(content, article_number):
-    pattern = rf"Art\.\s*{article_number}\..*?(?=Art\.\s*\d+\.|\Z)"
-    match = re.search(pattern, content, re.DOTALL)
-    if match:
-        return match.group(0).strip()
-    return None
-
-def extract_relevant_fragment(content, query, context_size=100):
-    article_match = re.match(r"Art\.\s*(\d+)", query)
-    if article_match:
-        article_number = article_match.group(1)
-        full_article = extract_full_article(content, article_number)
-        if full_article:
-            return full_article
-
-    index = content.lower().find(query.lower())
-    if index != -1:
-        start = max(0, index - context_size)
-        end = min(len(content), index + len(query) + context_size)
-        return f"...{content[start:end]}..."
-    return content[:200] + "..."
-
-def vector_search(query, limit=5):
-    print(f"\nWyszukiwanie wektorowe dla zapytania: '{query}'")
-    response = collection.query.near_text(
-        query=query,
-        limit=limit
-    )
-    for obj in response.objects:
-        print(f"UUID: {obj.uuid}")
-        relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
-        print(f"Relewantny fragment:\n{relevant_fragment}")
-        print(f"Nazwa pliku: {obj.properties['fileName']}")
-        print("---")
-
-def hybrid_search(query, limit=5, alpha=0.5):
-    print(f"\nWyszukiwanie hybrydowe dla zapytania: '{query}'")
-    response = collection.query.hybrid(
-        query=query,
-        alpha=alpha,
-        limit=limit
-    )
-    for obj in response.objects:
-        print(f"UUID: {obj.uuid}")
-        relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
-        print(f"Relewantny fragment:\n{relevant_fragment}")
-        print(f"Nazwa pliku: {obj.properties['fileName']}")
-        print("---")
-
-
-#exists = client.collections.exists("Document")
-#print(f"Czy kolekcja 'Document' istnieje: {exists}")
-
-#schema = collection.config.get()
-#print(f"Nazwa kolekcji: {schema.name}")
-#print("Właściwości:")
-#for prop in schema.properties:
-#    print(f"- {prop.name}: {prop.data_type}")
-
-#collection = client.collections.get("Document")
-#count = collection.aggregate.over_all(total_count=True).total_count
-#print(f"Liczba obiektów w kolekcji: {count}")
-
-#results = collection.query.fetch_objects(limit=5)
-#for obj in results.objects:
-#    print(f"UUID: {obj.uuid}")
-#    print(f"Nazwa pliku: {obj.properties['fileName']}")
-#    print(f"Zawartość: {obj.properties['content'][:100]}...")  # Pierwsze 100 znaków
-#    print("---")
-
-
-# Przykładowe użycie
-queries = ["Art. 154", "urlop wypoczynkowy", "Państwowa Inspekcja Pracy"]
-
-for query in queries:
-    vector_search(query)
-    hybrid_search(query)
-
-# Zamknij połączenie
-client.close()
--- a/manual.py
+++ b/manual.py
@@ -1,157 +0,0 @@
-import os
-import weaviate
-from weaviate.connect import ConnectionParams
-from weaviate.collections import Collection
-from weaviate.classes.config import Configure, Property, DataType
-from weaviate.collections.classes.filters import Filter
-import pytesseract
-from PIL import Image
-from docx import Document
-from pypdf import PdfReader
-import textract
-import hashlib
-
-# Konfiguracja
-REPO_PATH = "/home/ably.do/docs"
-WEAVIATE_URL = "http://weaviate:8080"
-
-client = weaviate.WeaviateClient(
-    connection_params=ConnectionParams.from_params(
-        http_host="weaviate",
-        http_port=8080,
-        http_secure=False,
-        grpc_host="weaviate",
-        grpc_port=50051,
-        grpc_secure=False,
-    )
-)
-
-def read_text_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return file.read()
-
-def read_docx(file_path):
-    doc = Document(file_path)
-    return ' '.join([paragraph.text for paragraph in doc.paragraphs])
-
-def read_pdf(file_path):
-    reader = PdfReader(file_path)
-    return ' '.join([page.extract_text() for page in reader.pages])
-
-def read_image(file_path):
-    return pytesseract.image_to_string(Image.open(file_path))
-
-def read_file(file_path):
-    _, ext = os.path.splitext(file_path.lower())
-    if ext in ['.txt', '.md']:
-        return read_text_file(file_path)
-    elif ext == '.docx':
-        return read_docx(file_path)
-    elif ext == '.pdf':
-        return read_pdf(file_path)
-    elif ext in ['.png', '.jpg', '.jpeg', '.gif', '.bmp']:
-        return read_image(file_path)
-    elif ext in ['.doc', '.rtf']:
-        return textract.process(file_path).decode('utf-8')
-    else:
-        return None
-
-def generate_content_hash(content):
-    return hashlib.sha256(content.encode('utf-8')).hexdigest()
-
-def add_to_weaviate(file_name, content, content_hash):
-    try:
-        collection = client.collections.get("Document")
-        
-        # Poprawne użycie klasy Filter
-        filters = Filter.by_property("fileName").equal(file_name)
-
-        # Sprawdzenie, czy dokument już istnieje
-        existing_docs = collection.query.fetch_objects(filters=filters)
-
-        if existing_docs.objects:
-            print(f"Dokument {file_name} już istnieje w bazie.")
-            return
-        
-        # Dodanie nowego dokumentu
-        collection.data.insert(
-            properties={
-                "fileName": file_name,
-                "content": content,
-                "contentHash": content_hash
-            }
-        )
-        print(f"Dodano dokument {file_name} do Weaviate.")
-    
-    except Exception as e:
-        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
-
-def process_file(file_path):
-    if not os.path.exists(file_path):
-        print(f"Plik nie istnieje: {file_path}")
-        return
-
-    try:
-        content = read_file(file_path)
-        if content:
-            file_name = os.path.basename(file_path)
-            content_hash = generate_content_hash(content)
-            add_to_weaviate(file_name, content, content_hash)
-        else:
-            print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
-    except Exception as e:
-        print(f"Błąd podczas przetwarzania pliku {file_path}: {str(e)}")
-
-def load_all_documents():
-    print("Wczytywanie wszystkich dokumentów z katalogu...")
-    for root, dirs, files in os.walk(REPO_PATH):
-        for file in files:
-            process_file(os.path.join(root, file))
-    print("Zakończono wczytywanie dokumentów.")
-
-if __name__ == "__main__":
-    # Upewnij się, że kolekcja "Document" istnieje w Weaviate
-    client.connect()
-    try:
-        # Sprawdzenie, czy kolekcja istnieje i czy należy ją usunąć
-        collection_name = "Document"
-        if client.collections.exists(collection_name):
-            print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
-            client.collections.delete(collection_name)
-            print(f"Kolekcja '{collection_name}' została usunięta.")
-        else:
-            print(f"Kolekcja '{collection_name}' nie istnieje.")
-
-        # Tworzenie kolekcji od nowa, jeśli została usunięta lub nie istniała
-        if not client.collections.exists(collection_name):
-            print(f"Tworzenie nowej kolekcji '{collection_name}'...")
-            client.collections.create(
-                name=collection_name,
-                properties=[
-                    Property(name="content", data_type=DataType.TEXT),
-                    Property(name="fileName", data_type=DataType.TEXT),
-                    Property(name="contentHash", data_type=DataType.TEXT)  # Nowe pole
-                ],
-                vectorizer_config=Configure.Vectorizer.text2vec_transformers()
-            )
-            print(f"Kolekcja '{collection_name}' została utworzona.")
-
-            # Wczytanie dokumentów po utworzeniu nowej kolekcji
-            print("Wczytywanie dokumentów do nowej kolekcji...")
-            load_all_documents()
-            print("Wszystkie dokumenty zostały wgrane.")
-
-        else:
-            print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")
-            
-            # Sprawdzenie, czy kolekcja jest pusta i ewentualne wczytanie dokumentów
-            collection = client.collections.get(collection_name)
-            if collection.aggregate.over_all(total_count=True).total_count == 0:
-                print("Kolekcja jest pusta. Wczytywanie dokumentów...")
-                load_all_documents()
-                print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")
-
-    except Exception as e:
-        print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
-        
-    client.close()