ably.do/monitoring.py

134 lines
4.0 KiB
Python
Raw Normal View History

2025-02-26 14:38:27 -05:00
import time
import os
import subprocess
import threading
import weaviate
import pytesseract
from PIL import Image
from docx import Document
from PyPDF2 import PdfReader
import textract
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from flask import Flask, request, jsonify
import hmac
import hashlib
# Configuration
# Local checkout that is watched for changes and walked on webhook delivery.
REPO_PATH = "./docs"
# Shared secret used to verify the HMAC signature of incoming webhooks.
# Taken from the WEBHOOK_SECRET environment variable when it is set, so the
# real secret does not have to be committed; the original placeholder value
# remains the fallback to keep existing deployments working.
WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "twoj_tajny_klucz")
# Port the Flask webhook server listens on.
WEBHOOK_PORT = 5000
# Base URL of the Weaviate instance documents are written to.
WEAVIATE_URL = "http://weaviate:8080"
app = Flask(__name__)
client = weaviate.Client(WEAVIATE_URL)
def read_text_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()
    return text
def read_docx(file_path):
    """Return the text of a .docx file with paragraphs joined by spaces.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        One string containing the text of every paragraph in the document,
        separated by single spaces.
    """
    paragraphs = Document(file_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)
def read_pdf(file_path):
    """Return the extracted text of a PDF with pages joined by spaces.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        One string containing the text extracted from each page, separated
        by single spaces.
    """
    pages = PdfReader(file_path).pages
    return ' '.join(page.extract_text() for page in pages)
def read_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image to open.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
def read_file(file_path):
    """Extract text from a file, choosing the reader by file extension.

    The extension is compared case-insensitively. Plain text and Markdown
    are read directly, .docx and .pdf go through their dedicated readers,
    common raster images are OCR-ed, and .doc/.rtf are handed to textract.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or None when the extension is not supported.
    """
    suffix = os.path.splitext(file_path.lower())[1]

    if suffix in ('.txt', '.md'):
        return read_text_file(file_path)
    if suffix == '.docx':
        return read_docx(file_path)
    if suffix == '.pdf':
        return read_pdf(file_path)
    if suffix in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if suffix in ('.doc', '.rtf'):
        raw_bytes = textract.process(file_path)
        return raw_bytes.decode('utf-8')

    # Unsupported extension (or no extension at all).
    return None
def add_to_weaviate(content, file_name):
    """Store one document in the Weaviate "Document" class.

    Args:
        content: Extracted text of the document.
        file_name: Name stored in the object's "fileName" property.
    """
    data_object = {
        "content": content,
        "fileName": file_name,
    }
    # The v3 client's signature is create(data_object, class_name, ...).
    # Passing the class name first, as the positional call did, swaps the two
    # arguments; keyword arguments make the intent unambiguous.
    client.data_object.create(data_object=data_object, class_name="Document")
    print(f"Dodano {file_name} do Weaviate")
def process_file(file_path):
    """Read a file and, if it yields any text, add it to Weaviate.

    Files that cannot be read, have an unsupported extension, or produce
    empty text are reported on stdout and skipped.

    Args:
        file_path: Path of the file to index.
    """
    try:
        content = read_file(file_path)
    except Exception as error:
        # The readers wrap several libraries with unrelated exception types
        # (decoding errors, corrupt documents, files that vanished). One bad
        # file must not abort the caller, which is either the file-watcher
        # callback or the loop over the whole repository.
        print(f"Nie można odczytać {file_path}: {error}")
        return

    if not content:
        print(f"Nie można odczytać {file_path}")
        return

    add_to_weaviate(content, os.path.basename(file_path))
class RepoHandler(FileSystemEventHandler):
    """Watchdog handler that pulls the repository and indexes changed files."""

    # Only these event types mean a file has new content to index. Deletions
    # leave nothing to read, and watchdog versions that report file
    # open/close events would otherwise re-trigger the handler every time it
    # reads a file itself.
    _CONTENT_EVENT_TYPES = ("created", "modified", "moved")

    def on_any_event(self, event):
        """Pull the latest changes and index the file the event refers to.

        Directory events, events that do not signal new content, and events
        for paths inside the ``.git`` directory are ignored.

        Args:
            event: The watchdog file system event.
        """
        if event.is_directory:
            return
        if event.event_type not in self._CONTENT_EVENT_TYPES:
            return

        # After a move the file lives at the destination path.
        if event.event_type == "moved":
            changed_path = os.fsdecode(event.dest_path)
        else:
            changed_path = os.fsdecode(event.src_path)

        # ``git pull`` writes inside .git on every run; reacting to those
        # writes would make the handler trigger itself indefinitely.
        if ".git" in os.path.normpath(changed_path).split(os.sep):
            return

        print(f"Wykryto zmianę: {changed_path}")
        self.pull_changes()

        # The pull may have removed or replaced the file in the meantime.
        if not os.path.isfile(changed_path):
            return
        process_file(changed_path)

    def pull_changes(self):
        """Run ``git pull`` inside REPO_PATH and report the outcome on stdout.

        Failures are printed rather than raised so callers keep running.
        """
        try:
            subprocess.run(["git", "pull"], check=True, cwd=REPO_PATH)
            print("Zmiany pobrane z Gitea")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing git executable or a missing REPO_PATH.
            print(f"Błąd podczas pobierania zmian: {e}")
def start_file_monitor():
    """Watch REPO_PATH recursively and block until interrupted.

    A RepoHandler is attached to a watchdog Observer, which is started and
    then kept alive by sleeping in one-second steps. On KeyboardInterrupt
    the observer is stopped and joined before the function returns.
    """
    watcher = Observer()
    watcher.schedule(RepoHandler(), REPO_PATH, recursive=True)
    watcher.start()

    try:
        # Nothing to do here; the observer works in its own thread.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()

    watcher.join()
@app.route('/webhook', methods=['POST'])
def webhook():
    """Handle a Gitea webhook: verify its signature, pull, and re-index.

    The request must carry an ``X-Gitea-Signature`` header equal to the
    hex HMAC-SHA256 of the raw request body keyed with WEBHOOK_SECRET.

    Returns:
        A JSON response with status 400 when the header is missing, 401 when
        the signature does not match, and 200 after the repository has been
        pulled and its files processed.
    """
    signature = request.headers.get('X-Gitea-Signature')
    if not signature:
        return jsonify({"error": "No signature"}), 400

    # get_data() returns the raw body bytes for every content type, whereas
    # request.data can be empty when the body is form-encoded.
    payload = request.get_data()
    computed_signature = hmac.new(
        WEBHOOK_SECRET.encode(), payload, hashlib.sha256
    ).hexdigest()

    # Compare as bytes: compare_digest raises TypeError for str arguments
    # containing non-ASCII characters, which a client controls via the header.
    signatures_match = hmac.compare_digest(
        signature.encode("utf-8"), computed_signature.encode("utf-8")
    )
    if not signatures_match:
        return jsonify({"error": "Invalid signature"}), 401

    print("Otrzymano ważny webhook z Gitea")
    RepoHandler().pull_changes()

    # After pulling, process every file in the repository.
    for root, dirs, files in os.walk(REPO_PATH):
        # Prune git's internal directory in place so os.walk skips it.
        dirs[:] = [name for name in dirs if name != ".git"]
        for file in files:
            process_file(os.path.join(root, file))

    return jsonify({"message": "Zmiany pobrane i przetworzone pomyślnie"}), 200
if __name__ == "__main__":
    # Make sure the "Document" class exists in Weaviate. It is created only
    # when missing, because creating a class that already exists is rejected
    # and would stop the script on every start after the first.
    current_schema = client.schema.get() or {}
    existing_classes = {
        schema_class.get("class")
        for schema_class in (current_schema.get("classes") or [])
    }
    if "Document" not in existing_classes:
        client.schema.create_class({
            "class": "Document",
            "properties": [
                {"name": "content", "dataType": ["text"]},
                {"name": "fileName", "dataType": ["string"]}
            ]
        })

    # Run the file monitor in a separate thread. It is a daemon thread so it
    # cannot keep the process alive once the Flask server below has stopped;
    # KeyboardInterrupt is delivered only to the main thread.
    monitor_thread = threading.Thread(target=start_file_monitor, daemon=True)
    monitor_thread.start()

    # Run the Flask server that receives the webhook.
    app.run(port=WEBHOOK_PORT)