From 38c7ab89563211611f2b9e8a5b8acb809ae47c43 Mon Sep 17 00:00:00 2001 From: "l.gabrysiak" Date: Sun, 2 Mar 2025 21:13:50 +0100 Subject: [PATCH] Wyszukiwanie ONLINE --- ollama_service.py | 65 +++++++++++++++++++++++++++++++++++++++++++++-- requirements.txt | 4 ++- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/ollama_service.py b/ollama_service.py index 16da7de..a390162 100644 --- a/ollama_service.py +++ b/ollama_service.py @@ -14,6 +14,12 @@ import asyncio import os from elasticsearch import Elasticsearch from datetime import datetime +#### +import aiohttp +import asyncio +from bs4 import BeautifulSoup +import urllib.parse +import hashlib app = FastAPI() @@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434" ES_BASE_URL = "http://elastic:9200" WEAVIATE_URL = "http://weaviate:8080" PROMPT_DIR_PATCH = "./prompts" +SEARXNG_BASE_URL = "http://searxng:8080" # Inicjalizacja klientów ollama_client = ollama.Client(host=OLLAMA_BASE_URL) @@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient( weaviate_client.connect() collection = weaviate_client.collections.get("Document") files_content = None + class Message(BaseModel): role: str content: str @@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100): start = max(0, index - context_size) end = min(len(content), index + len(query) + context_size) return f"...{content[start:end]}..." - return content[:200] + "..." + return content[:1000] + "..." def hybrid_search(keywords, limit=100, alpha=0.5): if isinstance(keywords, str): @@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5): #print(f"UUID: {obj.uuid}") relevant_fragment = extract_relevant_fragment(obj.properties['content'], query) #print(f"Relewantny fragment:\n{relevant_fragment}") - #print(f"Nazwa pliku: {obj.properties['fileName']}") + print(f"Nazwa pliku: {obj.properties['fileName']}") #print("---") # Zmieniamy warunek na 'any' zamiast 'all' #if any(term.lower() in relevant_fragment.lower() for term in keywords): @@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5): "uuid": obj.uuid, "relevant_fragment": relevant_fragment, "file_name": obj.properties['fileName'], + "content_type": obj.properties['contentType'], "keyword": query }) #print(f"Dodano do wyników: {obj.uuid}") @@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5): break return results[:limit] +async def fetch_json(session, url): + async with session.get(url) as response: + return await response.json() + +async def fetch_text(session, url): + async with session.get(url) as response: + html = await response.text() + soup = BeautifulSoup(html, "html.parser") + return soup.get_text() + +async def process_search_results(query): + search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json" + async with aiohttp.ClientSession() as session: + data = await fetch_json(session, search_url) + + results = data.get("results", []) + results_sorted = sorted(results, key=lambda x: x.get("score", float('inf')))[:10] + + tasks = [fetch_text(session, result["url"]) for result in results_sorted] + texts = await asyncio.gather(*tasks) + + save_to_weaviate([{ + "fileName": result["url"], + "content": json.dumps({ + "prompt": query, + "completion": text + }), + "contentHash": generate_content_hash(text) + } for result, text in zip(results_sorted, texts)]) + +def generate_content_hash(content): + return hashlib.sha256(content.encode('utf-8')).hexdigest() + +def save_to_weaviate(data): + try: + collection = weaviate_client.collections.get("Document") + for item in data: + filters = Filter.by_property("fileName").equal(item["fileName"]) + existing_docs = collection.query.fetch_objects(filters=filters) + if existing_docs.objects: + return + + collection.data.insert({ + "fileName": item["fileName"], + "content": item["content"], + "contentHash": item["contentHash"], + "contentType": "website" + }) + except Exception as e: + print(f"Błąd podczas dodawania informacji do bazy. Error: {e}") + @app.get("/api/tags") async def tags_proxy(): async with httpx.AsyncClient() as client: @@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest): raise KeyError(f"Nie znaleziono promptu o nazwie '{prompt_data}'.") query = request.messages[-1].content if request.messages else "" + asyncio.run(process_search_results(query)) keywords = analyze_query(prompt_seach.format(query=query)) weaviate_results = hybrid_search(keywords) prompt_data += "\n".join([f"Źródło: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results]) diff --git a/requirements.txt b/requirements.txt index eda4e31..ed6f997 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ uvicorn ollama weaviate-client unidecode -elasticsearch \ No newline at end of file +elasticsearch +aiohttp +beautifulsoup4 \ No newline at end of file