2025-02-27 07:09:02 -05:00
|
|
|
import weaviate
|
|
|
|
|
from weaviate.connect import ConnectionParams
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
# Weaviate client configuration: the HTTP and gRPC endpoints both point at the
# "weaviate" host and both have the "secure" flag turned off.
_connection_params = ConnectionParams.from_params(
    http_host="weaviate",
    http_port=8080,
    http_secure=False,
    grpc_host="weaviate",
    grpc_port=50051,
    grpc_secure=False,
)
client = weaviate.WeaviateClient(connection_params=_connection_params)
client.connect()

# Fetch the collection that the search helpers below query.
collection = client.collections.get("Document")
|
|
|
|
|
|
|
|
|
|
def extract_full_article(content, article_number):
    """Return the full text of one article from *content*.

    An article starts at ``Art. <article_number>.`` and runs up to (but not
    including) the next ``Art. <digits>.`` marker, or to the end of the text
    when no further marker follows.

    Args:
        content: Text to search. ``None`` or an empty string yields ``None``.
        article_number: Article number, as a string or an integer.

    Returns:
        The matched article with surrounding whitespace stripped, or ``None``
        when the article is not present.
    """
    if not content:
        return None

    # Escape the number so that it is always matched literally; otherwise a
    # value such as ".*" would be interpreted as a regular expression.
    number = re.escape(str(article_number))
    pattern = rf"Art\.\s*{number}\..*?(?=Art\.\s*\d+\.|\Z)"

    # DOTALL lets the article body span multiple lines.
    match = re.search(pattern, content, re.DOTALL)
    if match:
        return match.group(0).strip()
    return None
|
|
|
|
|
|
|
|
|
|
def extract_relevant_fragment(content, query, context_size=100):
    """Pick the part of *content* that is most relevant to *query*.

    The strategies are tried in order:

    1. If the query starts with ``Art. <digits>``, return that whole article
       (via ``extract_full_article``) when it is found in the content.
    2. Otherwise look for the query text, ignoring case, and return the hit
       with up to ``context_size`` characters on each side, wrapped in
       ``...``.
    3. Otherwise return the first 200 characters followed by ``...``.

    Args:
        content: Text of the document. ``None`` is treated as an empty string.
        query: The search query.
        context_size: Number of characters kept on each side of a text hit.

    Returns:
        The selected fragment as a string.
    """
    if content is None:
        content = ""

    article_match = re.match(r"Art\.\s*(\d+)", query)
    if article_match:
        article_number = article_match.group(1)
        full_article = extract_full_article(content, article_number)
        if full_article:
            return full_article

    # Search the original string rather than a lower-cased copy: lower-casing
    # can change the length of the text (e.g. "İ" becomes two code points),
    # which would shift the offsets used for slicing below.
    hit = re.search(re.escape(query), content, re.IGNORECASE)
    if hit:
        start = max(0, hit.start() - context_size)
        end = min(len(content), hit.end() + context_size)
        return f"...{content[start:end]}..."

    return content[:200] + "..."
|
|
|
|
|
|
|
|
|
|
def _print_results(response, query):
    """Print the UUID, relevant fragment and file name of every returned object."""
    for obj in response.objects:
        print(f"UUID: {obj.uuid}")
        # A missing or null "content" property is treated as empty text so one
        # incomplete object does not abort the whole listing.
        # NOTE(review): assumes obj.properties is dict-like (supports .get) — confirm.
        content = obj.properties.get('content') or ""
        relevant_fragment = extract_relevant_fragment(content, query)
        print(f"Relewantny fragment:\n{relevant_fragment}")
        print(f"Nazwa pliku: {obj.properties.get('fileName')}")
        print("---")


def vector_search(query, limit=5):
    """Run a near-text query on the module-level collection and print the hits.

    Args:
        query: Text passed to ``collection.query.near_text``.
        limit: Maximum number of objects requested from the query.
    """
    print(f"\nWyszukiwanie wektorowe dla zapytania: '{query}'")
    response = collection.query.near_text(
        query=query,
        limit=limit,
    )
    _print_results(response, query)


def hybrid_search(query, limit=5, alpha=0.5):
    """Run a hybrid query on the module-level collection and print the hits.

    Args:
        query: Text passed to ``collection.query.hybrid``.
        limit: Maximum number of objects requested from the query.
        alpha: Forwarded unchanged to the hybrid query (presumably the
            vector/keyword weighting — confirm against the Weaviate docs).
    """
    print(f"\nWyszukiwanie hybrydowe dla zapytania: '{query}'")
    response = collection.query.hybrid(
        query=query,
        alpha=alpha,
        limit=limit,
    )
    _print_results(response, query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#exists = client.collections.exists("Document")
|
|
|
|
|
#print(f"Czy kolekcja 'Document' istnieje: {exists}")
|
|
|
|
|
|
|
|
|
|
#schema = collection.config.get()
|
|
|
|
|
#print(f"Nazwa kolekcji: {schema.name}")
|
|
|
|
|
#print("Właściwości:")
|
|
|
|
|
#for prop in schema.properties:
|
|
|
|
|
# print(f"- {prop.name}: {prop.data_type}")
|
|
|
|
|
|
|
|
|
|
#collection = client.collections.get("Document")
|
|
|
|
|
#count = collection.aggregate.over_all(total_count=True).total_count
|
|
|
|
|
#print(f"Liczba obiektów w kolekcji: {count}")
|
|
|
|
|
|
|
|
|
|
#results = collection.query.fetch_objects(limit=5)
|
|
|
|
|
#for obj in results.objects:
|
|
|
|
|
# print(f"UUID: {obj.uuid}")
|
|
|
|
|
# print(f"Nazwa pliku: {obj.properties['fileName']}")
|
|
|
|
|
# print(f"Zawartość: {obj.properties['content'][:100]}...") # Pierwsze 100 znaków
|
|
|
|
|
# print("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Example usage: run every sample query through both search modes.
queries = ["Art. 154", "urlop wypoczynkowy", "Państwowa Inspekcja Pracy", "Art. 154 Kodeks pracy"]

try:
    for query in queries:
        vector_search(query)
        hybrid_search(query)
finally:
    # Close the connection even when one of the searches raises.
    client.close()
|