Online search
parent f83da185f8
commit 38c7ab8956

@@ -14,6 +14,12 @@ import asyncio
 import os
 from elasticsearch import Elasticsearch
 from datetime import datetime
+####
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+import urllib.parse
+import hashlib
 
 app = FastAPI()
 
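Note: the hunk header shows that import asyncio is already present near the top of the file, so the asyncio import added here is a harmless duplicate; the remaining new imports (aiohttp, BeautifulSoup, urllib.parse, hashlib) back the online-search helpers introduced further down in this commit.
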
@@ -21,6 +27,7 @@ OLLAMA_BASE_URL = "http://ollama:11434"
 ES_BASE_URL = "http://elastic:9200"
 WEAVIATE_URL = "http://weaviate:8080"
 PROMPT_DIR_PATCH = "./prompts"
+SEARXNG_BASE_URL = "http://searxng:8080"
 
 # Inicjalizacja klientów
 ollama_client = ollama.Client(host=OLLAMA_BASE_URL)
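For orientation, the new SEARXNG_BASE_URL is the only configuration this feature needs; the request built against it later in the commit looks like the sketch below. The example query string is made up, and a SearXNG instance typically has to allow the json output format in its settings before this endpoint returns anything but an error.

    import urllib.parse

    SEARXNG_BASE_URL = "http://searxng:8080"
    query = "example query"  # placeholder, not from the commit
    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
    print(search_url)
    # http://searxng:8080/search?q=example%20query&categories=general&format=json
    # The JSON body carries a "results" list whose items expose "url" and "score",
    # which is exactly what process_search_results() below consumes.
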
@@ -38,6 +45,7 @@ weaviate_client = weaviate.WeaviateClient(
 weaviate_client.connect()
 collection = weaviate_client.collections.get("Document")
 files_content = None
+
 class Message(BaseModel):
     role: str
     content: str
@@ -109,7 +117,7 @@ def extract_relevant_fragment(content, query, context_size=100):
         start = max(0, index - context_size)
         end = min(len(content), index + len(query) + context_size)
         return f"...{content[start:end]}..."
-    return content[:200] + "..."
+    return content[:1000] + "..."
 
 def hybrid_search(keywords, limit=100, alpha=0.5):
     if isinstance(keywords, str):
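The only functional change in this hunk is the fallback branch: when the query string is not found in the document, the returned snippet grows from 200 to 1000 characters. A minimal sketch of the effect, assuming the three context lines above sit inside an if-branch that handles the found case:

    content = "x" * 5000
    snippet = content[:1000] + "..."   # fallback after this commit (was content[:200] + "...")
    print(len(snippet))                # 1003; previously 203
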
@@ -130,7 +138,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
         #print(f"UUID: {obj.uuid}")
         relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
         #print(f"Relewantny fragment:\n{relevant_fragment}")
-        #print(f"Nazwa pliku: {obj.properties['fileName']}")
+        print(f"Nazwa pliku: {obj.properties['fileName']}")
         #print("---")
         # Zmieniamy warunek na 'any' zamiast 'all'
         #if any(term.lower() in relevant_fragment.lower() for term in keywords):
@@ -138,6 +146,7 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
             "uuid": obj.uuid,
             "relevant_fragment": relevant_fragment,
             "file_name": obj.properties['fileName'],
+            "content_type": obj.properties['contentType'],
             "keyword": query
         })
         #print(f"Dodano do wyników: {obj.uuid}")
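With the added key, every entry appended to results inside hybrid_search() now also records the document's contentType. A sketch of one such entry, with purely illustrative values:

    {
        "uuid": "00000000-0000-0000-0000-000000000000",  # Weaviate object id
        "relevant_fragment": "...matching fragment...",
        "file_name": "example-file.txt",
        "content_type": "website",                        # new in this commit
        "keyword": "example keyword"
    }
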
@@ -146,6 +155,57 @@ def hybrid_search(keywords, limit=100, alpha=0.5):
             break
     return results[:limit]
 
+async def fetch_json(session, url):
+    async with session.get(url) as response:
+        return await response.json()
+
+async def fetch_text(session, url):
+    async with session.get(url) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        return soup.get_text()
+
+async def process_search_results(query):
+    search_url = f"{SEARXNG_BASE_URL}/search?q={urllib.parse.quote(query)}&categories=general&format=json"
+    async with aiohttp.ClientSession() as session:
+        data = await fetch_json(session, search_url)
+
+        results = data.get("results", [])
+        results_sorted = sorted(results, key=lambda x: x.get("score", float('inf')))[:10]
+
+        tasks = [fetch_text(session, result["url"]) for result in results_sorted]
+        texts = await asyncio.gather(*tasks)
+
+        save_to_weaviate([{
+            "fileName": result["url"],
+            "content": json.dumps({
+                "prompt": query,
+                "completion": text
+            }),
+            "contentHash": generate_content_hash(text)
+        } for result, text in zip(results_sorted, texts)])
+
+def generate_content_hash(content):
+    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+def save_to_weaviate(data):
+    try:
+        collection = weaviate_client.collections.get("Document")
+        for item in data:
+            filters = Filter.by_property("fileName").equal(item["fileName"])
+            existing_docs = collection.query.fetch_objects(filters=filters)
+            if existing_docs.objects:
+                return
+
+            collection.data.insert({
+                "fileName": item["fileName"],
+                "content": item["content"],
+                "contentHash": item["contentHash"],
+                "contentType": "website"
+            })
+    except Exception as e:
+        print(f"Błąd podczas dodawania informacji do bazy. Error: {e}")
+
 @app.get("/api/tags")
 async def tags_proxy():
     async with httpx.AsyncClient() as client:
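One behavioural detail in the save_to_weaviate() added above: the bare return inside the for loop exits the whole function the moment any item's fileName is already indexed, so later items in the same batch are silently dropped (the contentHash is stored but not used for the duplicate check). If per-item deduplication is the intent, continue would be the usual choice; a sketch of that variant, not what the commit does:

    for item in data:
        filters = Filter.by_property("fileName").equal(item["fileName"])
        existing_docs = collection.query.fetch_objects(filters=filters)
        if existing_docs.objects:
            continue  # skip only this duplicate, keep processing the rest of the batch
        collection.data.insert({
            "fileName": item["fileName"],
            "content": item["content"],
            "contentHash": item["contentHash"],
            "contentType": "website"
        })
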
@@ -254,6 +314,7 @@ async def chat_endpoint(request: ChatRequest):
         raise KeyError(f"Nie znaleziono promptu o nazwie '{prompt_data}'.")
 
     query = request.messages[-1].content if request.messages else ""
+    asyncio.run(process_search_results(query))
    keywords = analyze_query(prompt_seach.format(query=query))
     weaviate_results = hybrid_search(keywords)
     prompt_data += "\n".join([f"Źródło: {doc['file_name']}\n{doc['relevant_fragment']}\n\n" for doc in weaviate_results])
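A caveat on the added call: chat_endpoint() is an async def handler, and asyncio.run() raises RuntimeError when invoked from a thread whose event loop is already running, which is the normal situation inside a FastAPI request. Inside an async endpoint the coroutine would usually just be awaited; a minimal sketch using the names from the diff above:

    query = request.messages[-1].content if request.messages else ""
    await process_search_results(query)   # instead of asyncio.run(process_search_results(query))
    keywords = analyze_query(prompt_seach.format(query=query))
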
@@ -4,3 +4,5 @@ ollama
 weaviate-client
 unidecode
 elasticsearch
+aiohttp
+beautifulsoup4
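The two new entries cover the HTTP client and the HTML parser used by the online-search path. A throwaway check (not part of the commit) that the dependencies resolve after installation:

    import aiohttp
    from bs4 import BeautifulSoup

    print(aiohttp.__version__)
    print(BeautifulSoup("<p>ok</p>", "html.parser").get_text())  # -> "ok"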