ONLINE search
This commit is contained in:
parent
f83da185f8
commit
38c7ab8956
@@ -14,6 +14,12 @@ import asyncio
import os
from elasticsearch import Elasticsearch
from datetime import datetime
####
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import urllib.parse
import hashlib

app = FastAPI()

@@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434"
ES_BASE_URL = "http://elastic:9200"
WEAVIATE_URL = "http://weaviate:8080"
PROMPT_DIR_PATCH = "./prompts"
SEARXNG_BASE_URL = "http://searxng:8080"

# Initialize clients
ollama_client = ollama.Client(host=OLLAMA_BASE_URL)

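For reference, SEARXNG_BASE_URL points at SearXNG's JSON search API, which process_search_results below queries with format=json. A minimal standalone sketch of that request, not part of this commit (assumes a reachable SearXNG instance with the JSON output format enabled):

import json
import urllib.parse
import urllib.request

query = "example query"
url = f"http://searxng:8080/search?q={urllib.parse.quote(query)}&categories=general&format=json"
with urllib.request.urlopen(url) as resp:  # plain blocking request, for illustration only
    data = json.load(resp)
print([r.get("url") for r in data.get("results", [])][:5])  # first five result URLs
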
@@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient(
weaviate_client.connect()
collection = weaviate_client.collections.get("Document")
files_content = None

class Message(BaseModel):
    role: str
    content: str

@@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100):
        start = max(0, index - context_size)
        end = min(len(content), index + len(query) + context_size)
        return f"...{content[start:end]}..."
-    return content[:200] + "..."
+    return content[:1000] + "..."

def hybrid_search(keywords, limit=100, alpha=0.5):
    if isinstance(keywords, str):
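To make the one-line change above concrete: when the query does not occur in the content, the function now falls back to a 1000-character preview instead of 200. A quick sketch of that behaviour (assumes the not-found branch returns the preview, as the committed fallback suggests):

text = "x" * 5000
fragment = extract_relevant_fragment(text, "not present")
print(len(fragment))  # 1003: the 1000-character preview plus the "..." suffix
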
@@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
            #print(f"UUID: {obj.uuid}")
            relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
            #print(f"Relevant fragment:\n{relevant_fragment}")
-            #print(f"File name: {obj.properties['fileName']}")
+            print(f"File name: {obj.properties['fileName']}")
            #print("---")
            # Change the condition to 'any' instead of 'all'
            #if any(term.lower() in relevant_fragment.lower() for term in keywords):
@@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
                "uuid": obj.uuid,
                "relevant_fragment": relevant_fragment,
                "file_name": obj.properties['fileName'],
                "content_type": obj.properties['contentType'],
                "keyword": query
            })
            #print(f"Added to results: {obj.uuid}")
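The fragments above post-process objects returned by Weaviate's hybrid query; the call that produces them sits outside these hunks, so the following is only an assumption of what it looks like with the v4 client (alpha blends BM25 keyword scoring with vector similarity):

response = collection.query.hybrid(
    query=query,  # one keyword produced by analyze_query
    alpha=alpha,  # 0.0 = pure BM25, 1.0 = pure vector search
    limit=limit,
)
for obj in response.objects:
    ...           # the filtering shown in the hunks above
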
@@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
            break
    return results[:limit]

async def fetch_json(session, url):
    async with session.get(url) as response:
        return await response.json()

async def fetch_text(session, url):
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text()

async def process_search_results(query):
    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
    async with aiohttp.ClientSession() as session:
        data = await fetch_json(session, search_url)

        results = data.get("results", [])
        # Keep the 10 best hits; SearXNG scores are higher-is-better,
        # so sort descending (ascending order would pick the worst results).
        results_sorted = sorted(results, key=lambda x: x.get("score", 0), reverse=True)[:10]

        tasks = [fetch_text(session, result["url"]) for result in results_sorted]
        texts = await asyncio.gather(*tasks)

        save_to_weaviate([{
            "fileName": result["url"],
            "content": json.dumps({
                "prompt": query,
                "completion": text
            }),
            "contentHash": generate_content_hash(text)
        } for result, text in zip(results_sorted, texts)])

def generate_content_hash(content):
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def save_to_weaviate(data):
    try:
        collection = weaviate_client.collections.get("Document")
        for item in data:
            filters = Filter.by_property("fileName").equal(item["fileName"])
            existing_docs = collection.query.fetch_objects(filters=filters)
            if existing_docs.objects:
                # Skip documents already stored; 'continue' rather than
                # 'return', so the remaining items still get inserted.
                continue

            collection.data.insert({
                "fileName": item["fileName"],
                "content": item["content"],
                "contentHash": item["contentHash"],
                "contentType": "website"
            })
    except Exception as e:
        print(f"Error while adding data to the database. Error: {e}")

@app.get("/api/tags")
async def tags_proxy():
    async with httpx.AsyncClient() as client:
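Taken together, these helpers form the new online-search pipeline: query SearXNG, fetch and strip the top pages with BeautifulSoup, and insert each page into Weaviate keyed by URL with a SHA-256 content hash for deduplication. A minimal sketch of driving the pipeline outside the endpoint (assumes the module imports cleanly and the SearXNG and Weaviate services above are reachable):

import asyncio

async def main():
    # Fetches search results for the query and persists the page texts
    # to the "Document" collection via save_to_weaviate().
    await process_search_results("fastapi streaming responses")

asyncio.run(main())  # fine here: no event loop is running yet
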
@@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest):
        raise KeyError(f"No prompt named '{prompt_data}' was found.")

    query = request.messages[-1].content if request.messages else ""
    # Await directly: asyncio.run() cannot be called from inside
    # FastAPI's already-running event loop.
    await process_search_results(query)
    keywords = analyze_query(prompt_seach.format(query=query))
    weaviate_results = hybrid_search(keywords)
    prompt_data += "\n".join([f"Source: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results])

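With the endpoint wired up, each chat request now triggers an online search before the prompt context is assembled from the Weaviate hits. A sketch of exercising it over HTTP (the path /api/chat and port 8000 are assumptions, since the route decorator and the full ChatRequest schema sit outside this hunk; only the messages field is confirmed above):

import httpx

payload = {"messages": [{"role": "user", "content": "What is SearXNG?"}]}
resp = httpx.post("http://localhost:8000/api/chat", json=payload, timeout=120.0)
print(resp.status_code)
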
@@ -3,4 +3,6 @@ uvicorn
ollama
weaviate-client
unidecode
elasticsearch
aiohttp
beautifulsoup4