134 lines
4.0 KiB
Python
134 lines
4.0 KiB
Python
|
|
import time
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import threading
|
||
|
|
import weaviate
|
||
|
|
import pytesseract
|
||
|
|
from PIL import Image
|
||
|
|
from docx import Document
|
||
|
|
from PyPDF2 import PdfReader
|
||
|
|
import textract
|
||
|
|
from watchdog.observers import Observer
|
||
|
|
from watchdog.events import FileSystemEventHandler
|
||
|
|
from flask import Flask, request, jsonify
|
||
|
|
import hmac
|
||
|
|
import hashlib
|
||
|
|
|
||
|
|
# Configuration
# Local path of the git working tree that is watched and indexed.
REPO_PATH = "./docs"
# Shared secret used to verify the HMAC signature of incoming webhooks.
# It can be supplied through the WEBHOOK_SECRET environment variable so the
# real value does not have to live in the source; the placeholder below is
# kept as the fallback for backward compatibility.
WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "twoj_tajny_klucz")
# TCP port the Flask webhook server listens on.
WEBHOOK_PORT = 5000
# Base URL of the Weaviate instance that stores the documents.
WEAVIATE_URL = "http://weaviate:8080"

app = Flask(__name__)
client = weaviate.Client(WEAVIATE_URL)
|
||
|
|
|
||
|
|
def read_text_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    # Text mode is open()'s default; only the encoding needs to be pinned.
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()
    return text
|
||
|
|
|
||
|
|
def read_docx(file_path):
    """Extract the paragraph text of a .docx file.

    Args:
        file_path: Path of the .docx document.

    Returns:
        The text of every paragraph, joined with single spaces.
    """
    paragraphs = Document(file_path).paragraphs
    return ' '.join(para.text for para in paragraphs)
|
||
|
|
|
||
|
|
def read_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF document.

    Returns:
        The extracted text of every page, joined with single spaces.
    """
    pages = PdfReader(file_path).pages
    page_texts = (page.extract_text() for page in pages)
    return ' '.join(page_texts)
|
||
|
|
|
||
|
|
def read_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so the underlying file handle is
    # closed once OCR finishes, instead of staying open until the Image
    # object happens to be garbage-collected.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
|
||
|
|
|
||
|
|
def read_file(file_path):
    """Read a file with the reader that matches its extension.

    The extension is compared case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or ``None`` when the extension is not one of
        the supported types.
    """
    suffix = os.path.splitext(file_path.lower())[1]

    if suffix in ('.txt', '.md'):
        return read_text_file(file_path)
    if suffix == '.docx':
        return read_docx(file_path)
    if suffix == '.pdf':
        return read_pdf(file_path)
    if suffix in ('.png', '.jpg', '.jpeg', '.gif', '.bmp'):
        return read_image(file_path)
    if suffix in ('.doc', '.rtf'):
        # textract returns bytes; decode them to text.
        raw = textract.process(file_path)
        return raw.decode('utf-8')
    # Unsupported extension (or no extension at all).
    return None
|
||
|
|
|
||
|
|
def add_to_weaviate(content, file_name):
    """Store one document in the Weaviate ``Document`` class.

    Args:
        content: Extracted text of the document.
        file_name: Base name of the source file, stored as ``fileName``.
    """
    data_object = {
        "content": content,
        "fileName": file_name,
    }
    # The client expects the object first and the class name second; the
    # arguments are passed by keyword so they cannot be swapped.
    client.data_object.create(data_object=data_object, class_name="Document")
    print(f"Dodano {file_name} do Weaviate")
|
||
|
|
|
||
|
|
def process_file(file_path):
    """Read a file and, if it yields any text, add it to Weaviate.

    Files that produce no content (unsupported type or empty text) are
    reported on stdout and skipped.

    Args:
        file_path: Path of the file to index.
    """
    content = read_file(file_path)
    if not content:
        print(f"Nie można odczytać {file_path}")
        return
    add_to_weaviate(content, os.path.basename(file_path))
|
||
|
|
|
||
|
|
class RepoHandler(FileSystemEventHandler):
    """Watchdog handler that pulls the repository and indexes changed files."""

    # Only these event types mean a file has (new) content to index.
    # Deleted files cannot be read, and "opened"/"closed" events are produced
    # by this handler's own reads, which would re-trigger it endlessly.
    _CONTENT_EVENT_TYPES = ("created", "modified", "moved")

    def on_any_event(self, event):
        """Pull the repository and index the file an event refers to.

        Directory events, events that carry no new content, and events for
        paths inside ``.git`` are ignored.

        Args:
            event: The watchdog file system event.
        """
        if event.is_directory:
            return
        if event.event_type not in self._CONTENT_EVENT_TYPES:
            return

        # After a move the content lives at the destination path.
        if event.event_type == "moved":
            changed_path = event.dest_path
        else:
            changed_path = event.src_path
        changed_path = os.fsdecode(changed_path)

        # "git pull" itself writes into .git; reacting to those writes would
        # make every pull trigger another pull.
        if ".git" in os.path.normpath(changed_path).split(os.sep):
            return

        print(f"Wykryto zmianę: {changed_path}")
        self.pull_changes()
        try:
            process_file(changed_path)
        except Exception as e:
            # Handler boundary: report the failure so one unreadable file
            # does not propagate an exception into watchdog's dispatch thread.
            print(f"Błąd podczas przetwarzania {changed_path}: {e}")

    def pull_changes(self):
        """Run ``git pull`` in ``REPO_PATH`` and report the outcome on stdout.

        Failures (non-zero exit status, missing ``git`` executable or missing
        repository directory) are printed and not re-raised.
        """
        try:
            subprocess.run(["git", "pull"], check=True, cwd=REPO_PATH)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Błąd podczas pobierania zmian: {e}")
        else:
            print("Zmiany pobrane z Gitea")
|
||
|
|
|
||
|
|
def start_file_monitor():
    """Watch ``REPO_PATH`` recursively and block until interrupted.

    A ``RepoHandler`` is registered on a watchdog observer, after which the
    function sleeps in one-second steps. On ``KeyboardInterrupt`` the
    observer is stopped; the function then waits for it to finish.
    """
    watcher = Observer()
    watcher.schedule(RepoHandler(), REPO_PATH, recursive=True)
    watcher.start()

    try:
        # Keep this thread alive while the observer works in the background.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()

    watcher.join()
|
||
|
|
|
||
|
|
@app.route('/webhook', methods=['POST'])
def webhook():
    """Handle a Gitea webhook: verify it, pull the repo and re-index files.

    The ``X-Gitea-Signature`` header must equal the hex HMAC-SHA256 of the
    raw request body keyed with ``WEBHOOK_SECRET``.

    Returns:
        A ``(json, status)`` pair: 400 when the signature header is missing,
        401 when it does not match, 200 after the repository was pulled and
        its files were processed.
    """
    signature = request.headers.get('X-Gitea-Signature')
    if not signature:
        return jsonify({"error": "No signature"}), 400

    payload = request.data
    computed_signature = hmac.new(
        WEBHOOK_SECRET.encode(), payload, hashlib.sha256
    ).hexdigest()

    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, which would turn a bogus header into
    # a 500 instead of a clean 401.
    if not hmac.compare_digest(signature.encode(), computed_signature.encode()):
        return jsonify({"error": "Invalid signature"}), 401

    print("Otrzymano ważny webhook z Gitea")
    RepoHandler().pull_changes()

    # After pulling, process every file in the repository working tree.
    for root, dirs, files in os.walk(REPO_PATH):
        # Prune git's internal directory in place so os.walk does not
        # descend into it and its objects are not treated as documents.
        dirs[:] = [d for d in dirs if d != ".git"]
        for file in files:
            process_file(os.path.join(root, file))

    return jsonify({"message": "Zmiany pobrane i przetworzone pomyślnie"}), 200
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Make sure the "Document" class exists in Weaviate. It is created only
    # when missing, because creating a class that already exists fails and
    # would abort every start after the first one.
    document_class = {
        "class": "Document",
        "properties": [
            {"name": "content", "dataType": ["text"]},
            {"name": "fileName", "dataType": ["string"]},
        ],
    }
    existing_classes = {
        cls.get("class") for cls in (client.schema.get().get("classes") or [])
    }
    if "Document" not in existing_classes:
        client.schema.create_class(document_class)

    # Run the file monitor in a separate thread. It is a daemon thread
    # because its loop never ends on its own; a non-daemon thread would keep
    # the process alive after the Flask server has stopped.
    monitor_thread = threading.Thread(target=start_file_monitor, daemon=True)
    monitor_thread.start()

    # Run the Flask server that receives the webhook.
    app.run(port=WEBHOOK_PORT)
|