Wyszukiwanie ONLINE

This commit is contained in:
l.gabrysiak 2025-03-02 21:13:50 +01:00
parent f83da185f8
commit 38c7ab8956
2 changed files with 66 additions and 3 deletions

View File

@ -14,6 +14,12 @@ import asyncio
import os
from elasticsearch import Elasticsearch
from datetime import datetime
####
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import urllib.parse
import hashlib
app = FastAPI()
@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434"
ES_BASE_URL = "http://elastic:9200"
WEAVIATE_URL = "http://weaviate:8080"
PROMPT_DIR_PATCH = "./prompts"
SEARXNG_BASE_URL = "http://searxng:8080"
# Inicjalizacja klientów
ollama_client = ollama.Client(host=OLLAMA_BASE_URL)
@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient(
weaviate_client.connect()
collection = weaviate_client.collections.get("Document")
files_content = None
class Message(BaseModel):
role: str
content: str
@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100):
start = max(0, index - context_size)
end = min(len(content), index + len(query) + context_size)
return f"...{content[start:end]}..."
return content[:200] + "..."
return content[:1000] + "..."
def hybrid_search(keywords, limit=100, alpha=0.5):
if isinstance(keywords, str):
@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
#print(f"UUID: {obj.uuid}")
relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
#print(f"Relewantny fragment:\n{relevant_fragment}")
#print(f"Nazwa pliku: {obj.properties['fileName']}")
print(f"Nazwa pliku: {obj.properties['fileName']}")
#print("---")
# Zmieniamy warunek na 'any' zamiast 'all'
#if any(term.lower() in relevant_fragment.lower() for term in keywords):
@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
"uuid": obj.uuid,
"relevant_fragment": relevant_fragment,
"file_name": obj.properties['fileName'],
"content_type": obj.properties['contentType'],
"keyword": query
})
#print(f"Dodano do wyników: {obj.uuid}")
@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
break
return results[:limit]
async def fetch_json(session, url):
    """GET *url* via the given aiohttp session and return the decoded JSON body."""
    async with session.get(url) as response:
        # Body must be read while the response context is still open.
        payload = await response.json()
    return payload
async def fetch_text(session, url):
    """Download *url* and return its visible text with all HTML markup stripped."""
    async with session.get(url) as response:
        # Read the raw HTML while the response context is still open.
        markup = await response.text()
    return BeautifulSoup(markup, "html.parser").get_text()
async def process_search_results(query):
    """Run *query* against SearXNG, scrape the top hits and persist them.

    Queries the SearXNG JSON API, keeps the ten highest-scored results,
    downloads each page's text concurrently, and stores one Weaviate
    document per page whose content is a JSON-encoded
    {"prompt": query, "completion": page_text} pair.

    Args:
        query: free-text search query typed by the user.
    """
    search_url = (
        f"{SEARXNG_BASE_URL}/search"
        f"?q={urllib.parse.quote(query)}&categories=general&format=json"
    )
    async with aiohttp.ClientSession() as session:
        data = await fetch_json(session, search_url)
        results = data.get("results", [])
        # SearXNG scores are higher-is-better: keep the 10 most relevant
        # hits. (The original ascending sort with a float('inf') default
        # selected the *least* relevant results.)
        top_results = sorted(
            results, key=lambda r: r.get("score", 0.0), reverse=True
        )[:10]
        tasks = [fetch_text(session, r["url"]) for r in top_results]
        # return_exceptions=True: one unreachable/broken page must not
        # abort the whole batch; failed fetches are filtered out below.
        texts = await asyncio.gather(*tasks, return_exceptions=True)
    save_to_weaviate([
        {
            "fileName": result["url"],
            "content": json.dumps({
                "prompt": query,
                "completion": text
            }),
            "contentHash": generate_content_hash(text)
        }
        for result, text in zip(top_results, texts)
        if not isinstance(text, BaseException)
    ])
def generate_content_hash(content):
    """Return the hexadecimal SHA-256 digest of *content* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
def save_to_weaviate(data):
    """Insert scraped web documents into the Weaviate "Document" collection.

    Args:
        data: list of dicts, each with "fileName" (source URL), "content"
            (JSON-encoded prompt/completion pair) and "contentHash" keys.

    Documents whose fileName already exists in the collection are skipped
    (deduplication by URL). Best-effort: any failure is logged and
    swallowed so a storage error never breaks the request flow.
    """
    try:
        collection = weaviate_client.collections.get("Document")
        for item in data:
            filters = Filter.by_property("fileName").equal(item["fileName"])
            existing_docs = collection.query.fetch_objects(filters=filters)
            if existing_docs.objects:
                # BUG FIX: the original `return` aborted the whole batch on
                # the first duplicate; only this item should be skipped.
                continue
            collection.data.insert({
                "fileName": item["fileName"],
                "content": item["content"],
                "contentHash": item["contentHash"],
                "contentType": "website"
            })
    except Exception as e:
        print(f"Błąd podczas dodawania informacji do bazy. Error: {e}")
@app.get("/api/tags")
async def tags_proxy():
async with httpx.AsyncClient() as client:
@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest):
raise KeyError(f"Nie znaleziono promptu o nazwie '{prompt_data}'.")
query = request.messages[-1].content if request.messages else ""
asyncio.run(process_search_results(query))
keywords = analyze_query(prompt_seach.format(query=query))
weaviate_results = hybrid_search(keywords)
prompt_data += "\n".join([f"Źródło: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results])

View File

@ -3,4 +3,6 @@ uvicorn
ollama
weaviate-client
unidecode
elasticsearch
elasticsearch
aiohttp
beautifulsoup4