Online search
parent f83da185f8
commit 38c7ab8956

@@ -14,6 +14,12 @@ import asyncio
 import os
 from elasticsearch import Elasticsearch
 from datetime import datetime
+####
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+import urllib.parse
+import hashlib
 
 app = FastAPI()
 
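Note: the hunk header shows that import asyncio is already present near the top of the file, so the asyncio import added here is a harmless duplicate; the remaining new imports (aiohttp, BeautifulSoup, urllib.parse, hashlib) back the online-search helpers introduced further down in this commit.
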
@@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434"
 ES_BASE_URL = "http://elastic:9200"
 WEAVIATE_URL = "http://weaviate:8080"
 PROMPT_DIR_PATCH = "./prompts"
+SEARXNG_BASE_URL = "http://searxng:8080"
 
 # Inicjalizacja klientów
 ollama_client = ollama.Client(host=OLLAMA_BASE_URL)
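For orientation, the new SEARXNG_BASE_URL is the only configuration this feature needs; the request built against it later in the commit looks like the sketch below. The example query string is made up, and a SearXNG instance typically has to allow the json output format in its settings before this endpoint returns anything but an error.

    import urllib.parse

    SEARXNG_BASE_URL = "http://searxng:8080"
    query = "example query"  # placeholder, not from the commit
    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
    print(search_url)
    # http://searxng:8080/search?q=example%20query&categories=general&format=json
    # The JSON body carries a "results" list whose items expose "url" and "score",
    # which is exactly what process_search_results() below consumes.
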
@@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient(
 weaviate_client.connect()
 collection = weaviate_client.collections.get("Document")
 files_content = None
+
 class Message(BaseModel):
     role: str
     content: str
@@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100):
         start = max(0, index - context_size)
         end = min(len(content), index + len(query) + context_size)
         return f"...{content[start:end]}..."
-    return content[:200] + "..."
+    return content[:1000] + "..."
 
 def hybrid_search(keywords, limit=100, alpha=0.5):
     if isinstance(keywords, str):
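The only functional change in this hunk is the fallback branch: when the query string is not found in the document, the returned snippet grows from 200 to 1000 characters. A minimal sketch of the effect, assuming the three context lines above sit inside an if-branch that handles the found case:

    content = "x" * 5000
    snippet = content[:1000] + "..."   # fallback after this commit (was content[:200] + "...")
    print(len(snippet))                # 1003; previously 203
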
@@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
         #print(f"UUID: {obj.uuid}")
         relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
         #print(f"Relewantny fragment:\n{relevant_fragment}")
-        #print(f"Nazwa pliku: {obj.properties['fileName']}")
+        print(f"Nazwa pliku: {obj.properties['fileName']}")
         #print("---")
         # Zmieniamy warunek na 'any' zamiast 'all'
         #if any(term.lower() in relevant_fragment.lower() for term in keywords):
@@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
             "uuid": obj.uuid,
             "relevant_fragment": relevant_fragment,
             "file_name": obj.properties['fileName'],
+            "content_type": obj.properties['contentType'],
             "keyword": query
         })
         #print(f"Dodano do wyników: {obj.uuid}")
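With the added key, every entry appended to results inside hybrid_search() now also records the document's contentType. A sketch of one such entry, with purely illustrative values:

    {
        "uuid": "00000000-0000-0000-0000-000000000000",  # Weaviate object id
        "relevant_fragment": "...matching fragment...",
        "file_name": "example-file.txt",
        "content_type": "website",                        # new in this commit
        "keyword": "example keyword"
    }
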
@@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
             break
     return results[:limit]
 
+async def fetch_json(session, url):
+    async with session.get(url) as response:
+        return await response.json()
+
+async def fetch_text(session, url):
+    async with session.get(url) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        return soup.get_text()
+
+async def process_search_results(query):
+    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
+    async with aiohttp.ClientSession() as session:
+        data = await fetch_json(session, search_url)
+
+        results = data.get("results", [])
+        results_sorted = sorted(results, key=lambda x: x.get("score", float('inf')))[:10]
+
+        tasks = [fetch_text(session, result["url"]) for result in results_sorted]
+        texts = await asyncio.gather(*tasks)
+
+        save_to_weaviate([{
+            "fileName": result["url"],
+            "content": json.dumps({
+                "prompt": query,
+                "completion": text
+            }),
+            "contentHash": generate_content_hash(text)
+        } for result, text in zip(results_sorted, texts)])
+
+def generate_content_hash(content):
+    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+def save_to_weaviate(data):
+    try:
+        collection = weaviate_client.collections.get("Document")
+        for item in data:
+            filters = Filter.by_property("fileName").equal(item["fileName"])
+            existing_docs = collection.query.fetch_objects(filters=filters)
+            if existing_docs.objects:
+                return
+
+            collection.data.insert({
+                "fileName": item["fileName"],
+                "content": item["content"],
+                "contentHash": item["contentHash"],
+                "contentType": "website"
+            })
+    except Exception as e:
+        print(f"Błąd podczas dodawania informacji do bazy. Error: {e}")
+
 @app.get("/api/tags")
 async def tags_proxy():
     async with httpx.AsyncClient() as client:
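One behavioural detail in the save_to_weaviate() added above: the bare return inside the for loop exits the whole function the moment any item's fileName is already indexed, so later items in the same batch are silently dropped (the contentHash is stored but not used for the duplicate check). If per-item deduplication is the intent, continue would be the usual choice; a sketch of that variant, not what the commit does:

    for item in data:
        filters = Filter.by_property("fileName").equal(item["fileName"])
        existing_docs = collection.query.fetch_objects(filters=filters)
        if existing_docs.objects:
            continue  # skip only this duplicate, keep processing the rest of the batch
        collection.data.insert({
            "fileName": item["fileName"],
            "content": item["content"],
            "contentHash": item["contentHash"],
            "contentType": "website"
        })
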
@@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest):
         raise KeyError(f"Nie znaleziono promptu o nazwie '{prompt_data}'.")
 
     query = request.messages[-1].content if request.messages else ""
+    asyncio.run(process_search_results(query))
    keywords = analyze_query(prompt_seach.format(query=query))
     weaviate_results = hybrid_search(keywords)
     prompt_data += "\n".join([f"Źródło: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results])
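A caveat on the added call: chat_endpoint() is an async def handler, and asyncio.run() raises RuntimeError when invoked from a thread whose event loop is already running, which is the normal situation inside a FastAPI request. Inside an async endpoint the coroutine would usually just be awaited; a minimal sketch using the names from the diff above:

    query = request.messages[-1].content if request.messages else ""
    await process_search_results(query)   # instead of asyncio.run(process_search_results(query))
    keywords = analyze_query(prompt_seach.format(query=query))
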
@@ -4,3 +4,5 @@ ollama
 weaviate-client
 unidecode
 elasticsearch
+aiohttp
+beautifulsoup4
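The two new entries cover the HTTP client and the HTML parser used by the online-search path. A throwaway check (not part of the commit) that the dependencies resolve after installation:

    import aiohttp
    from bs4 import BeautifulSoup

    print(aiohttp.__version__)
    print(BeautifulSoup("<p>ok</p>", "html.parser").get_text())  # -> "ok"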