code ready to deploy
This commit is contained in:
parent
b68ffa3112
commit
c617cb3f95
|
|
@ -0,0 +1,30 @@
|
|||
# Use the official Python image as the base
FROM --platform=linux/amd64 python:3.9-slim

# Set the working directory inside the container
WORKDIR /app

# Install git, a few diagnostic tools and Tesseract OCR in a single layer.
# The package index is refreshed and removed within the same layer, so an
# install never runs against a stale cached index and the lists do not
# end up in the final image.
RUN apt-get update \
    && apt-get install -y git nano wget curl iputils-ping tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file on its own first, so the dependency layer is
# rebuilt only when requirements.txt changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source code into the container
COPY . .
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Start the application
ENTRYPOINT ["/entrypoint.sh"]
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
# Container entrypoint: configures git from the environment, clones the
# documentation repository into /home and starts the monitoring service.
# Reads GIT_USERNAME, GIT_EMAIL, GIT_TOKEN, GIT_HOST and GIT_BRANCH.

# Stop at the first failing command instead of starting the service
# against a missing or half-configured checkout
set -e

git config --global credential.helper store
# Quote the expansions so values containing spaces stay a single argument
git config --global user.name "${GIT_USERNAME}"
git config --global user.email "${GIT_EMAIL}"

# The credentials file holds the access token; the subshell umask makes it
# readable by its owner only
(umask 077 && echo "https://${GIT_USERNAME}:${GIT_TOKEN}@${GIT_HOST}" > ~/.git-credentials)

cd /home
# Clone only when the checkout is not there yet; after a container restart
# the directory already exists and "git clone" would fail
if [ ! -d ably.do/.git ]; then
    git clone --single-branch --branch "${GIT_BRANCH}" https://repo.pokash.pl/POKASH.PL/ably.do.git
fi

# Replace the shell with the Python process so it receives signals directly
exec python /app/monitoring.py
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
import weaviate
|
||||
from weaviate.connect import ConnectionParams
|
||||
import re
|
||||
|
||||
# Weaviate client configuration: unencrypted HTTP and gRPC endpoints on the
# "weaviate" host
connection_params = ConnectionParams.from_params(
    http_host="weaviate",
    http_port=8080,
    http_secure=False,
    grpc_host="weaviate",
    grpc_port=50051,
    grpc_secure=False,
)
client = weaviate.WeaviateClient(connection_params=connection_params)
client.connect()

# Fetch the handle of the "Document" collection
collection = client.collections.get("Document")
|
||||
|
||||
def extract_full_article(content, article_number):
    """Return the full text of one numbered article found in *content*.

    An article starts at ``Art. <article_number>.`` and runs up to, but not
    including, the next ``Art. <digits>.`` heading or the end of *content*.

    Args:
        content: Text to search in.
        article_number: Article number; converted with ``str()`` and matched
            literally.

    Returns:
        The matched article text with surrounding whitespace stripped, or
        ``None`` when no matching heading is found.
    """
    # Escape the number so characters such as "." are matched literally
    # instead of being interpreted as regex syntax.
    number = re.escape(str(article_number))
    pattern = rf"Art\.\s*{number}\..*?(?=Art\.\s*\d+\.|\Z)"
    # DOTALL lets the article body span multiple lines.
    match = re.search(pattern, content, re.DOTALL)
    return match.group(0).strip() if match else None
|
||||
|
||||
def extract_relevant_fragment(content, query, context_size=100):
    """Return the part of *content* that is most relevant to *query*.

    Three strategies are tried in order:

    1. If *query* starts with ``Art. <digits>``, the whole matching article
       is returned when ``extract_full_article`` finds it.
    2. Otherwise the first case-insensitive occurrence of *query* is
       returned together with up to *context_size* characters on each
       side, wrapped in ``...``.
    3. If *query* does not occur at all, the first 200 characters of
       *content* followed by ``...`` are returned.

    Args:
        content: Document text to extract from.
        query: Search phrase.
        context_size: Number of characters kept before and after the hit.

    Returns:
        The extracted fragment as a string.
    """
    article_match = re.match(r"Art\.\s*(\d+)", query)
    if article_match:
        full_article = extract_full_article(content, article_match.group(1))
        if full_article:
            return full_article

    # Search the original text case-insensitively. Looking the query up in
    # content.lower() and reusing that index on content is unsafe, because
    # lower-casing can change the length of a string (e.g. U+0130) and the
    # slice would then be shifted.
    hit = re.search(re.escape(query), content, re.IGNORECASE)
    if hit:
        start = max(0, hit.start() - context_size)
        end = min(len(content), hit.end() + context_size)
        return f"...{content[start:end]}..."
    return content[:200] + "..."
|
||||
|
||||
def _print_search_results(response, query):
    """Print UUID, relevant fragment and file name of each object in *response*."""
    for obj in response.objects:
        print(f"UUID: {obj.uuid}")
        relevant_fragment = extract_relevant_fragment(obj.properties['content'], query)
        print(f"Relewantny fragment:\n{relevant_fragment}")
        print(f"Nazwa pliku: {obj.properties['fileName']}")
        print("---")


def vector_search(query, limit=5):
    """Run a ``near_text`` query on the module-level collection and print the hits.

    Args:
        query: Search phrase passed to ``collection.query.near_text``.
        limit: Maximum number of objects requested.
    """
    print(f"\nWyszukiwanie wektorowe dla zapytania: '{query}'")
    response = collection.query.near_text(
        query=query,
        limit=limit,
    )
    _print_search_results(response, query)


def hybrid_search(query, limit=5, alpha=0.5):
    """Run a ``hybrid`` query on the module-level collection and print the hits.

    Args:
        query: Search phrase passed to ``collection.query.hybrid``.
        limit: Maximum number of objects requested.
        alpha: Forwarded unchanged as the ``alpha`` argument of the hybrid
            query.
    """
    print(f"\nWyszukiwanie hybrydowe dla zapytania: '{query}'")
    response = collection.query.hybrid(
        query=query,
        alpha=alpha,
        limit=limit,
    )
    _print_search_results(response, query)
|
||||
|
||||
|
||||
#exists = client.collections.exists("Document")
|
||||
#print(f"Czy kolekcja 'Document' istnieje: {exists}")
|
||||
|
||||
#schema = collection.config.get()
|
||||
#print(f"Nazwa kolekcji: {schema.name}")
|
||||
#print("Właściwości:")
|
||||
#for prop in schema.properties:
|
||||
# print(f"- {prop.name}: {prop.data_type}")
|
||||
|
||||
#collection = client.collections.get("Document")
|
||||
#count = collection.aggregate.over_all(total_count=True).total_count
|
||||
#print(f"Liczba obiektów w kolekcji: {count}")
|
||||
|
||||
#results = collection.query.fetch_objects(limit=5)
|
||||
#for obj in results.objects:
|
||||
# print(f"UUID: {obj.uuid}")
|
||||
# print(f"Nazwa pliku: {obj.properties['fileName']}")
|
||||
# print(f"Zawartość: {obj.properties['content'][:100]}...") # Pierwsze 100 znaków
|
||||
# print("---")
|
||||
|
||||
|
||||
# Example usage: run every sample query through both search modes
queries = ["Art. 154", "urlop wypoczynkowy", "Państwowa Inspekcja Pracy"]

try:
    for query in queries:
        vector_search(query)
        hybrid_search(query)
finally:
    # Close the connection even when one of the searches raises
    client.close()
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
import os
|
||||
import weaviate
|
||||
from weaviate.connect import ConnectionParams
|
||||
from weaviate.collections import Collection
|
||||
from weaviate.classes.config import Configure, Property, DataType
|
||||
from weaviate.collections.classes.filters import Filter
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from pypdf import PdfReader
|
||||
import textract
|
||||
import hashlib
|
||||
|
||||
# Configuration
REPO_PATH = "/home/ably.do/docs"
WEAVIATE_URL = "http://weaviate:8080"

# Client for the "weaviate" host using unencrypted HTTP and gRPC endpoints;
# creating it here does not call connect()
_connection_params = ConnectionParams.from_params(
    http_host="weaviate",
    http_port=8080,
    http_secure=False,
    grpc_host="weaviate",
    grpc_port=50051,
    grpc_secure=False,
)
client = weaviate.WeaviateClient(connection_params=_connection_params)
|
||||
|
||||
def read_text_file(file_path):
    """Return the whole content of the UTF-8 text file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text
|
||||
|
||||
def read_docx(file_path):
    """Return the text of every paragraph of a .docx file, joined by single spaces."""
    paragraphs = Document(file_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)
|
||||
|
||||
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, joined by single spaces."""
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text())
    return ' '.join(page_texts)
|
||||
|
||||
def read_image(file_path):
    """Return the text recognised by Tesseract OCR in the image at *file_path*."""
    # Open the image through a context manager so it is closed once OCR has
    # finished instead of being left open.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
|
||||
|
||||
def read_file(file_path):
    """Read *file_path* with the reader that matches its extension.

    The extension is compared case-insensitively. Returns the extracted
    text, or ``None`` for an extension that is not handled here.
    """
    extension = os.path.splitext(file_path.lower())[1]

    if extension in ('.txt', '.md'):
        return read_text_file(file_path)
    if extension == '.docx':
        return read_docx(file_path)
    if extension == '.pdf':
        return read_pdf(file_path)
    if extension in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if extension in ('.doc', '.rtf'):
        # textract returns bytes; decode them as UTF-8
        raw = textract.process(file_path)
        return raw.decode('utf-8')
    return None
|
||||
|
||||
def generate_content_hash(content):
    """Return the SHA-256 hex digest of *content* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
|
||||
|
||||
def add_to_weaviate(file_name, content, content_hash):
    """Insert a document into the "Document" collection or refresh a changed one.

    The collection is searched for an object whose ``fileName`` equals
    *file_name*:

    * no match: a new object is inserted;
    * match with the same ``contentHash``: nothing is written;
    * match with a different ``contentHash``: the stored object is updated
      with the new content and hash.

    Any exception is caught and reported on stdout, so a single failing
    document does not abort the caller.

    Args:
        file_name: Value stored in, and looked up by, the ``fileName`` property.
        content: Extracted document text.
        content_hash: Hash of *content*, stored as ``contentHash``.
    """
    try:
        collection = client.collections.get("Document")

        # Look up a previously stored document with the same file name
        filters = Filter.by_property("fileName").equal(file_name)
        existing_docs = collection.query.fetch_objects(filters=filters)

        properties = {
            "fileName": file_name,
            "content": content,
            "contentHash": content_hash,
        }

        if existing_docs.objects:
            existing = existing_docs.objects[0]
            if existing.properties.get("contentHash") == content_hash:
                print(f"Dokument {file_name} już istnieje w bazie.")
                return
            # Same file name but different content: without this update a
            # modified file would keep its outdated text in the collection.
            collection.data.update(uuid=existing.uuid, properties=properties)
            print(f"Zaktualizowano dokument {file_name} w Weaviate.")
            return

        # No stored object with this file name yet: add a new one
        collection.data.insert(properties=properties)
        print(f"Dodano dokument {file_name} do Weaviate.")

    except Exception as e:
        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
|
||||
|
||||
def process_file(file_path):
    """Read one file and hand its text to ``add_to_weaviate``.

    A missing file, an empty or unreadable file and any exception raised
    while processing are each reported on stdout; nothing is raised to the
    caller.
    """
    if not os.path.exists(file_path):
        print(f"Plik nie istnieje: {file_path}")
        return

    try:
        content = read_file(file_path)
        if not content:
            print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
            return
        # Documents are stored under the bare file name, without directories
        file_name = os.path.basename(file_path)
        add_to_weaviate(file_name, content, generate_content_hash(content))
    except Exception as e:
        print(f"Błąd podczas przetwarzania pliku {file_path}: {str(e)}")
|
||||
|
||||
def load_all_documents():
    """Walk ``REPO_PATH`` recursively and process every file found in it."""
    print("Wczytywanie wszystkich dokumentów z katalogu...")
    for directory, _subdirs, file_names in os.walk(REPO_PATH):
        for file_name in file_names:
            process_file(os.path.join(directory, file_name))
    print("Zakończono wczytywanie dokumentów.")
|
||||
|
||||
if __name__ == "__main__":
    # Make sure the "Document" collection exists in Weaviate and is populated
    collection_name = "Document"

    # The existing collection is dropped only when CLEAR_COLLECTION is
    # "true". The default stays "true", so without the variable the script
    # rebuilds the collection exactly as it did before; set it to anything
    # else to keep the stored documents.
    clear_collection = os.getenv("CLEAR_COLLECTION", "true").strip().lower() == "true"

    client.connect()
    try:
        # Check whether the collection exists and whether it should be removed
        if client.collections.exists(collection_name):
            if clear_collection:
                print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
                client.collections.delete(collection_name)
                print(f"Kolekcja '{collection_name}' została usunięta.")
        else:
            print(f"Kolekcja '{collection_name}' nie istnieje.")

        # Create the collection when it was removed or never existed
        if not client.collections.exists(collection_name):
            print(f"Tworzenie nowej kolekcji '{collection_name}'...")
            client.collections.create(
                name=collection_name,
                properties=[
                    Property(name="content", data_type=DataType.TEXT),
                    Property(name="fileName", data_type=DataType.TEXT),
                    Property(name="contentHash", data_type=DataType.TEXT),
                ],
                vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
            )
            print(f"Kolekcja '{collection_name}' została utworzona.")

            # Load the documents into the freshly created collection
            print("Wczytywanie dokumentów do nowej kolekcji...")
            load_all_documents()
            print("Wszystkie dokumenty zostały wgrane.")
        else:
            print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")

            # An existing but empty collection still needs its documents
            collection = client.collections.get(collection_name)
            if collection.aggregate.over_all(total_count=True).total_count == 0:
                print("Kolekcja jest pusta. Wczytywanie dokumentów...")
                load_all_documents()
                print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")

    except Exception as e:
        print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
    finally:
        # Release the connection on every exit path
        client.close()
|
||||
164
monitoring.py
164
monitoring.py
|
|
@ -1,27 +1,43 @@
|
|||
import time
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import threading
|
||||
import weaviate
|
||||
from weaviate.connect import ConnectionParams
|
||||
from weaviate.collections import Collection
|
||||
from weaviate.classes.config import Configure, Property, DataType
|
||||
from weaviate.collections.classes.filters import Filter
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from PyPDF2 import PdfReader
|
||||
from pypdf import PdfReader
|
||||
import textract
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from flask import Flask, request, jsonify
|
||||
#from flask import Flask, request, jsonify, cli
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
import uvicorn
|
||||
import hmac
|
||||
import hashlib
|
||||
|
||||
# Konfiguracja
|
||||
REPO_PATH = "./docs"
|
||||
REPO_PATH = "/home/ably.do/docs"
|
||||
WEBHOOK_SECRET = "twoj_tajny_klucz"
|
||||
WEBHOOK_PORT = 5000
|
||||
WEAVIATE_URL = "http://weaviate:8080"
|
||||
|
||||
app = Flask(__name__)
|
||||
client = weaviate.Client(WEAVIATE_URL)
|
||||
app = FastAPI()
|
||||
|
||||
client = weaviate.WeaviateClient(
|
||||
connection_params=ConnectionParams.from_params(
|
||||
http_host="weaviate",
|
||||
http_port=8080,
|
||||
http_secure=False,
|
||||
grpc_host="weaviate",
|
||||
grpc_port=50051,
|
||||
grpc_secure=False,
|
||||
)
|
||||
)
|
||||
|
||||
def read_text_file(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
|
|
@ -53,21 +69,51 @@ def read_file(file_path):
|
|||
else:
|
||||
return None
|
||||
|
||||
def add_to_weaviate(content, file_name):
|
||||
data_object = {
|
||||
"content": content,
|
||||
"fileName": file_name
|
||||
}
|
||||
client.data_object.create("Document", data_object)
|
||||
print(f"Dodano {file_name} do Weaviate")
|
||||
def generate_content_hash(content):
    """Return the SHA-256 hex digest of *content* encoded as UTF-8."""
    encoded = content.encode('utf-8')
    hasher = hashlib.sha256(encoded)
    return hasher.hexdigest()
|
||||
|
||||
def add_to_weaviate(file_name, content, content_hash):
    """Insert a document into the "Document" collection or refresh a changed one.

    The collection is searched for an object whose ``fileName`` equals
    *file_name*:

    * no match: a new object is inserted;
    * match with the same ``contentHash``: nothing is written;
    * match with a different ``contentHash``: the stored object is updated
      with the new content and hash.

    Any exception is caught and reported on stdout, so a single failing
    document does not abort the caller.

    Args:
        file_name: Value stored in, and looked up by, the ``fileName`` property.
        content: Extracted document text.
        content_hash: Hash of *content*, stored as ``contentHash``.
    """
    try:
        collection = client.collections.get("Document")

        # Look up a previously stored document with the same file name
        filters = Filter.by_property("fileName").equal(file_name)
        existing_docs = collection.query.fetch_objects(filters=filters)

        properties = {
            "fileName": file_name,
            "content": content,
            "contentHash": content_hash,
        }

        if existing_docs.objects:
            existing = existing_docs.objects[0]
            if existing.properties.get("contentHash") == content_hash:
                print(f"Dokument {file_name} już istnieje w bazie.")
                return
            # Same file name but different content: without this update a
            # modified file would keep its outdated text in the collection.
            collection.data.update(uuid=existing.uuid, properties=properties)
            print(f"Zaktualizowano dokument {file_name} w Weaviate.")
            return

        # No stored object with this file name yet: add a new one
        collection.data.insert(properties=properties)
        print(f"Dodano dokument {file_name} do Weaviate.")

    except Exception as e:
        print(f"Błąd podczas dodawania {file_name} do Weaviate: {e}")
|
||||
|
||||
def process_file(file_path):
|
||||
content = read_file(file_path)
|
||||
if content:
|
||||
file_name = os.path.basename(file_path)
|
||||
add_to_weaviate(content, file_name)
|
||||
else:
|
||||
print(f"Nie można odczytać {file_path}")
|
||||
if not os.path.exists(file_path):
|
||||
print(f"Plik nie istnieje: {file_path}")
|
||||
return
|
||||
|
||||
try:
|
||||
content = read_file(file_path)
|
||||
if content:
|
||||
file_name = os.path.basename(file_path)
|
||||
content_hash = generate_content_hash(content)
|
||||
add_to_weaviate(file_name, content, content_hash)
|
||||
else:
|
||||
print(f"Plik jest pusty lub nie można go odczytać: {file_path}")
|
||||
except Exception as e:
|
||||
print(f"Błąd podczas przetwarzania pliku {file_path}: {str(e)}")
|
||||
|
||||
class RepoHandler(FileSystemEventHandler):
|
||||
def on_any_event(self, event):
|
||||
|
|
@ -84,6 +130,7 @@ class RepoHandler(FileSystemEventHandler):
|
|||
print(f"Błąd podczas pobierania zmian: {e}")
|
||||
|
||||
def start_file_monitor():
|
||||
print(f"Rozpoczeto monitoring folderu")
|
||||
event_handler = RepoHandler()
|
||||
observer = Observer()
|
||||
observer.schedule(event_handler, REPO_PATH, recursive=True)
|
||||
|
|
@ -95,39 +142,84 @@ def start_file_monitor():
|
|||
observer.stop()
|
||||
observer.join()
|
||||
|
||||
@app.route('/webhook', methods=['POST'])
|
||||
def webhook():
|
||||
signature = request.headers.get('X-Gitea-Signature')
|
||||
@app.post("/webhook")
|
||||
async def webhook(request: Request):
|
||||
signature = request.headers.get("X-Gitea-Signature")
|
||||
if not signature:
|
||||
return jsonify({"error": "No signature"}), 400
|
||||
raise HTTPException(status_code=400, detail="No signature")
|
||||
|
||||
payload = request.data
|
||||
payload = await request.body()
|
||||
computed_signature = hmac.new(WEBHOOK_SECRET.encode(), payload, hashlib.sha256).hexdigest()
|
||||
|
||||
if hmac.compare_digest(signature, computed_signature):
|
||||
print("Otrzymano ważny webhook z Gitea")
|
||||
RepoHandler().pull_changes()
|
||||
# Po pobraniu zmian, przetwórz wszystkie pliki w repozytorium
|
||||
for root, dirs, files in os.walk(REPO_PATH):
|
||||
for file in files:
|
||||
process_file(os.path.join(root, file))
|
||||
return jsonify({"message": "Zmiany pobrane i przetworzone pomyślnie"}), 200
|
||||
return {"message": "Zmiany pobrane i przetworzone pomyślnie"}
|
||||
else:
|
||||
return jsonify({"error": "Invalid signature"}), 401
|
||||
raise HTTPException(status_code=401, detail="Invalid signature")
|
||||
|
||||
def load_all_documents():
    """Walk ``REPO_PATH`` recursively and process every file found in it."""
    print("Wczytywanie wszystkich dokumentów z katalogu...")
    for current_dir, _dir_names, names in os.walk(REPO_PATH):
        for name in names:
            process_file(os.path.join(current_dir, name))
    print("Zakończono wczytywanie dokumentów.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Upewnij się, że kolekcja "Document" istnieje w Weaviate
|
||||
client.schema.create_class({
|
||||
"class": "Document",
|
||||
"properties": [
|
||||
{"name": "content", "dataType": ["text"]},
|
||||
{"name": "fileName", "dataType": ["string"]}
|
||||
]
|
||||
})
|
||||
client.connect()
|
||||
try:
|
||||
collection_name = "Document"
|
||||
# Sprawdzenie, czy kolekcja istnieje i czy należy ją usunąć
|
||||
if client.collections.exists(collection_name):
|
||||
print(f"Usuwanie istniejącej kolekcji '{collection_name}' (CLEAR_COLLECTION=true)...")
|
||||
client.collections.delete(collection_name)
|
||||
print(f"Kolekcja '{collection_name}' została usunięta.")
|
||||
else:
|
||||
print(f"Kolekcja '{collection_name}' nie istnieje.")
|
||||
|
||||
# Tworzenie kolekcji od nowa, jeśli została usunięta lub nie istniała
|
||||
if not client.collections.exists(collection_name):
|
||||
print(f"Tworzenie nowej kolekcji '{collection_name}'...")
|
||||
client.collections.create(
|
||||
name=collection_name,
|
||||
properties=[
|
||||
Property(name="content", data_type=DataType.TEXT),
|
||||
Property(name="fileName", data_type=DataType.TEXT),
|
||||
Property(name="contentHash", data_type=DataType.TEXT) # Nowe pole
|
||||
],
|
||||
vectorizer_config=Configure.Vectorizer.text2vec_transformers()
|
||||
)
|
||||
print(f"Kolekcja '{collection_name}' została utworzona.")
|
||||
|
||||
# Wczytanie dokumentów po utworzeniu nowej kolekcji
|
||||
print("Wczytywanie dokumentów do nowej kolekcji...")
|
||||
load_all_documents()
|
||||
print("Wszystkie dokumenty zostały wgrane.")
|
||||
|
||||
else:
|
||||
print("Kolekcja już istnieje. Pominięto jej ponowne tworzenie.")
|
||||
|
||||
# Sprawdzenie, czy kolekcja jest pusta i ewentualne wczytanie dokumentów
|
||||
collection = client.collections.get(collection_name)
|
||||
if collection.aggregate.over_all(total_count=True).total_count == 0:
|
||||
print("Kolekcja jest pusta. Wczytywanie dokumentów...")
|
||||
load_all_documents()
|
||||
print("Wszystkie dokumenty zostały wgrane do istniejącej kolekcji.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Wystąpił błąd podczas operacji na kolekcji '{collection_name}': {e}")
|
||||
|
||||
print(client.collections.list_all())
|
||||
|
||||
# Uruchom monitorowanie plików w osobnym wątku
|
||||
monitor_thread = threading.Thread(target=start_file_monitor)
|
||||
monitor_thread.start()
|
||||
|
||||
# Start the FastAPI webhook server (served by uvicorn)
|
||||
app.run(port=WEBHOOK_PORT)
|
||||
try:
|
||||
uvicorn.run(app, host="0.0.0.0", port=WEBHOOK_PORT)
|
||||
finally:
|
||||
client.close()
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
watchdog
|
||||
Flask
|
||||
weaviate-client
|
||||
python-docx
|
||||
pytesseract
|
||||
textract
|
||||
pillow
|
||||
pypdf
|
||||
uvicorn
|
||||
FastAPI
|
||||
Loading…
Reference in New Issue