ONLINE search
This commit is contained in:
parent
f83da185f8
commit
38c7ab8956
@@ -14,6 +14,12 @@ import asyncio
import os
from elasticsearch import Elasticsearch
from datetime import datetime
####
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import urllib.parse
import hashlib

app = FastAPI()

@@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434"
ES_BASE_URL = "http://elastic:9200"
WEAVIATE_URL = "http://weaviate:8080"
PROMPT_DIR_PATCH = "./prompts"
SEARXNG_BASE_URL = "http://searxng:8080"

# Initialize clients
ollama_client = ollama.Client(host=OLLAMA_BASE_URL)

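For reference, SEARXNG_BASE_URL points at SearXNG's JSON search API, which process_search_results below queries with format=json. A minimal standalone sketch of that request, not part of this commit (assumes a reachable SearXNG instance with the JSON output format enabled):

import json
import urllib.parse
import urllib.request

query = "example query"
url = f"http://searxng:8080/search?q={urllib.parse.quote(query)}&categories=general&format=json"
with urllib.request.urlopen(url) as resp:  # plain blocking request, for illustration only
    data = json.load(resp)
print([r.get("url") for r in data.get("results", [])][:5])  # first five result URLs
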
@@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient(
weaviate_client.connect()
collection = weaviate_client.collections.get("Document")
files_content = None

class Message(BaseModel):
    role: str
    content: str

@@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100):
        start = max(0, index - context_size)
        end = min(len(content), index + len(query) + context_size)
        return f"...{content[start:end]}..."
-    return content[:200] + "..."
+    return content[:1000] + "..."

def hybrid_search(keywords, limit=100, alpha=0.5):
    if isinstance(keywords, str):
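To make the one-line change above concrete: when the query does not occur in the content, the function now falls back to a 1000-character preview instead of 200. A quick sketch of that behaviour (assumes the not-found branch returns the preview, as the committed fallback suggests):

text = "x" * 5000
fragment = extract_relevant_fragment(text, "not present")
print(len(fragment))  # 1003: the 1000-character preview plus the "..." suffix
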
@@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
            #print(f"UUID: {obj.uuid}")
            relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
            #print(f"Relevant fragment:\n{relevant_fragment}")
-            #print(f"File name: {obj.properties['fileName']}")
+            print(f"File name: {obj.properties['fileName']}")
            #print("---")
            # Change the condition to 'any' instead of 'all'
            #if any(term.lower() in relevant_fragment.lower() for term in keywords):
@@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
                "uuid": obj.uuid,
                "relevant_fragment": relevant_fragment,
                "file_name": obj.properties['fileName'],
                "content_type": obj.properties['contentType'],
                "keyword": query
            })
            #print(f"Added to results: {obj.uuid}")
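The fragments above post-process objects returned by Weaviate's hybrid query; the call that produces them sits outside these hunks, so the following is only an assumption of what it looks like with the v4 client (alpha blends BM25 keyword scoring with vector similarity):

response = collection.query.hybrid(
    query=query,  # one keyword produced by analyze_query
    alpha=alpha,  # 0.0 = pure BM25, 1.0 = pure vector search
    limit=limit,
)
for obj in response.objects:
    ...           # the filtering shown in the hunks above
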
@@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
            break
    return results[:limit]

async def fetch_json(session, url):
    async with session.get(url) as response:
        return await response.json()

async def fetch_text(session, url):
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text()

async def process_search_results(query):
    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
    async with aiohttp.ClientSession() as session:
        data = await fetch_json(session, search_url)

        results = data.get("results", [])
        # Keep the 10 best hits; SearXNG scores are higher-is-better,
        # so sort descending (ascending order would pick the worst results).
        results_sorted = sorted(results, key=lambda x: x.get("score", 0), reverse=True)[:10]

        tasks = [fetch_text(session, result["url"]) for result in results_sorted]
        texts = await asyncio.gather(*tasks)

        save_to_weaviate([{
            "fileName": result["url"],
            "content": json.dumps({
                "prompt": query,
                "completion": text
            }),
            "contentHash": generate_content_hash(text)
        } for result, text in zip(results_sorted, texts)])

def generate_content_hash(content):
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def save_to_weaviate(data):
    try:
        collection = weaviate_client.collections.get("Document")
        for item in data:
            filters = Filter.by_property("fileName").equal(item["fileName"])
            existing_docs = collection.query.fetch_objects(filters=filters)
            if existing_docs.objects:
                # Skip documents already stored; 'continue' rather than
                # 'return', so the remaining items still get inserted.
                continue

            collection.data.insert({
                "fileName": item["fileName"],
                "content": item["content"],
                "contentHash": item["contentHash"],
                "contentType": "website"
            })
    except Exception as e:
        print(f"Error while adding data to the database. Error: {e}")

@app.get("/api/tags")
async def tags_proxy():
    async with httpx.AsyncClient() as client:
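Taken together, these helpers form the new online-search pipeline: query SearXNG, fetch and strip the top pages with BeautifulSoup, and insert each page into Weaviate keyed by URL with a SHA-256 content hash for deduplication. A minimal sketch of driving the pipeline outside the endpoint (assumes the module imports cleanly and the SearXNG and Weaviate services above are reachable):

import asyncio

async def main():
    # Fetches search results for the query and persists the page texts
    # to the "Document" collection via save_to_weaviate().
    await process_search_results("fastapi streaming responses")

asyncio.run(main())  # fine here: no event loop is running yet
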
@@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest):
        raise KeyError(f"No prompt named '{prompt_data}' was found.")

    query = request.messages[-1].content if request.messages else ""
    # Await directly: asyncio.run() cannot be called from inside
    # FastAPI's already-running event loop.
    await process_search_results(query)
    keywords = analyze_query(prompt_seach.format(query=query))
    weaviate_results = hybrid_search(keywords)
    prompt_data += "\n".join([f"Source: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results])

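With the endpoint wired up, each chat request now triggers an online search before the prompt context is assembled from the Weaviate hits. A sketch of exercising it over HTTP (the path /api/chat and port 8000 are assumptions, since the route decorator and the full ChatRequest schema sit outside this hunk; only the messages field is confirmed above):

import httpx

payload = {"messages": [{"role": "user", "content": "What is SearXNG?"}]}
resp = httpx.post("http://localhost:8000/api/chat", json=payload, timeout=120.0)
print(resp.status_code)
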
@@ -3,4 +3,6 @@ uvicorn
ollama
weaviate-client
unidecode
elasticsearch
aiohttp
beautifulsoup4