# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
"""Flask web front end for the book scraper.

Exposes the dashboard HTML pages, the action routes (init / start / abort /
hide), JSON API routes, and a handful of debug views.
"""

import os
from urllib.parse import urlparse

from dotenv import load_dotenv

# Called before the remaining imports so that any module reading environment
# variables at import time sees the values from the .env file.
load_dotenv()

import redis
from flask import (
    Flask,
    render_template,
    request,
    jsonify,
    send_from_directory,
    redirect,
    url_for,
)

print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from celery.result import AsyncResult

from db.db import init_db
from db.repository import (
    get_registered_books,
    fetch_book,
    fetch_all_books,
    get_progress,
)

from scraper.logger import log_debug
from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis, inspect_books_state
from scraper.services.init_service import InitService

# INIT DB
init_db()

app = Flask(__name__)

# =====================================================
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")


@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
    """Serve a file from ``OUTPUT_ROOT`` inline (not as an attachment).

    ``filename`` uses the ``path`` converter so files in sub-directories of
    the output root can be addressed.
    """
    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)


# =====================================================
# DASHBOARD RENDERING HELPERS
# =====================================================
def _visible_registered_books():
    """Return the registered books whose status is not ``"hidden"``.

    Hidden books are filtered out for the GUI only; the stored records are
    not touched.
    """
    return [b for b in get_registered_books() if b.get("status") != "hidden"]


def _render_dashboard(**context):
    """Render the dashboard template with the standard context.

    The template always receives ``books`` (active books from Redis),
    ``registered`` (registered books, hidden ones filtered out) and ``logs``
    (UI log lines, never ``None``). Extra keyword arguments such as
    ``message``, ``error`` or ``scraping_task_id`` are passed straight
    through to the template.
    """
    return render_template(
        "dashboard/dashboard.html",
        books=list_active_books(),  # Redis
        registered=_visible_registered_books(),  # SQLite (filtered)
        logs=get_ui_logs() or [],
        **context,
    )


# =====================================================
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
@logcall
def index():
    """Redirect the site root to the dashboard."""
    return redirect(url_for("dashboard"))


@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
    """Render the main dashboard page."""
    return _render_dashboard()


@app.route("/book/<book_id>")
@logcall
def book_detail(book_id):
    """Render the detail page for one book.

    The title is read from the ``book:<book_id>:title`` state key and falls
    back to the book id when no title is stored.
    """
    title = r.get(f"book:{book_id}:title") or book_id
    return render_template(
        "dashboard/book_detail.html",
        book_id=book_id,
        title=title,
        logs=get_ui_logs(),
    )


# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
# =====================================================
@app.route("/init", methods=["POST"])
@logcall
def init_book():
    """INIT flow: register a book from a user-supplied URL.

    Reads ``url`` from the submitted form and hands it to
    ``InitService.execute``, which is expected to fetch the metadata and
    store the book as 'registered'. The dashboard is re-rendered in every
    case, carrying either a success ``message`` or an ``error``.
    """
    url = request.form.get("url", "").strip()
    if not url:
        return _render_dashboard(error="Geen URL opgegeven.")

    try:
        result = InitService.execute(url)
        msg = f"Boek geregistreerd: {result.get('title')}"
    except Exception as e:
        # Request boundary: report any failure to the user instead of a 500.
        log_debug(f"[INIT] ERROR: {e}")
        return _render_dashboard(error=f"INIT mislukt: {e}")

    return _render_dashboard(message=msg)


@app.route("/hide/<book_id>", methods=["POST"])
@logcall
def hide_registered_book(book_id):
    """Soft-delete / hide a registered book for the GUI (not implemented).

    The database is meant to stay intact. The original implementation was
    commented out, which left the view returning ``None`` (an invalid Flask
    response); an explicit 501 response is returned instead.
    """
    # TODO: the disabled implementation called hide_book(book_id) and then
    # redirected to /dashboard; hide_book is not imported in this module.
    return (
        jsonify(
            {
                "status": "error",
                "message": "Verbergen is nog niet geïmplementeerd",
                "book_id": book_id,
            }
        ),
        501,
    )


@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
    """Start a FULL scrape from a registered INIT record.

    Expects ``book_id`` in the submitted form. Responds with a JSON error
    when the id is missing (400), the book is unknown (404) or the record
    has no ``book_url`` (500). Otherwise the UI logs are reset, the scrape
    task is sent to the ``scraping`` queue and the dashboard is rendered
    with the new task id.
    """
    book_id = request.form.get("book_id")
    if not book_id:
        return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400

    book = fetch_book(book_id)
    if not book:
        return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404

    url = book.get("book_url")
    if not url:
        return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500

    reset_ui_logs()
    log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}")

    async_result = celery_app.send_task(
        "scraper.tasks.scraping.start_scrape_book",
        args=[url],
        queue="scraping",
    )

    return _render_dashboard(scraping_task_id=async_result.id)


@app.route("/abort/<book_id>", methods=["POST"])
@logcall
def abort_download(book_id):
    """Set the abort flag for ``book_id`` and acknowledge with JSON."""
    log_debug(f"[WEB] Abort requested for book: {book_id}")
    set_abort(book_id)
    return jsonify({"status": "ok", "aborted": book_id})


# =====================================================
# SECTION 3 — API ROUTES (JSON)
# =====================================================
@app.route("/api/books")
@logcall
def api_books():
    """Return the status of every active book as a JSON list."""
    return jsonify(list_active_books())


@app.route("/api/book/<book_id>/status")
@logcall
def api_book_status(book_id):
    """Return the status summary of one book as JSON."""
    return jsonify(getStatus(book_id))


@app.route("/api/book/<book_id>/logs")
@logcall
def api_book_logs(book_id):
    """Return all entries of the ``logs:<book_id>`` list as JSON."""
    logs = r.lrange(f"logs:{book_id}", 0, -1) or []
    return jsonify(logs)


@app.route("/progress/<book_id>")
@logcall
def progress(book_id):
    """Return ``get_progress(book_id)`` as JSON."""
    return jsonify(get_progress(book_id))


@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
    """Report whether a Celery task has finished.

    Returns ``{"ready": True, "result": ...}`` on success,
    ``{"ready": True, "error": "failed"}`` on failure and
    ``{"ready": False}`` while the task is in any other state.
    """
    result = AsyncResult(task_id, app=celery_app)
    if result.successful():
        return jsonify({"ready": True, "result": result.get()})
    if result.failed():
        return jsonify({"ready": True, "error": "failed"})
    return jsonify({"ready": False})


@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
    """Reset the UI log buffer."""
    reset_ui_logs()
    return jsonify({"status": "ok"})


@app.route("/logs", methods=["GET"])
@logcall
def logs():
    """Return the UI log lines the client has not seen yet.

    The client passes ``last_index``, the index of the last line it already
    has (default ``-1`` on the first call; unparsable values are treated as
    ``-1``). The response holds the newer ``lines`` and ``last``, the index
    of the newest line returned, or the unchanged ``last_index`` when there
    is nothing new.
    """
    try:
        last_index = int(request.args.get("last_index", -1))
    except (TypeError, ValueError):
        last_index = -1

    # Full current log list.
    all_logs = list(get_ui_logs() or [])

    # Delta: every line whose index is greater than last_index. The start is
    # clamped to 0 so a negative last_index yields the full list rather than
    # a tail slice.
    start = max(last_index + 1, 0)
    new_lines = all_logs[start:]
    new_last = len(all_logs) - 1 if new_lines else last_index

    return jsonify({"lines": new_lines, "last": new_last})


# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
    """Run ``sync_books_from_redis`` and return its result as JSON."""
    results = sync_books_from_redis()
    return jsonify({"status": "ok", "synced": results})


@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
    """
    Shows:
    - raw SQLite values,
    - raw Redis values,
    - what the merged result WOULD be.

    No writes happen.
    """
    results = inspect_books_state()
    return render_template("debug/inspect_state.html", results=results)


@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
    """Dump every key in the state store with its value, as JSON.

    Keys whose value cannot be read with ``get`` are reported with an empty
    string (presumably keys that do not hold a plain string value).
    """
    cursor = 0
    results = {}

    while True:
        cursor, keys = r.scan(cursor, match="*", count=200)
        for k in keys:
            try:
                results[k] = r.get(k)
            except Exception:
                # Best effort: one unreadable key must not abort the dump.
                results[k] = ""
        # A cursor of 0 marks the end of the scan.
        if cursor == 0:
            break

    return jsonify(results)


# =====================================================
# DB DEBUG
# =====================================================
@app.route("/api/db/books")
@logcall
def api_db_books():
    """Return every book from ``fetch_all_books`` as JSON, or a 500 error."""
    try:
        books = fetch_all_books()
        return jsonify({"status": "ok", "books": books})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
_DEBUG_QUEUE_NAMES = ("scraping", "controller", "download", "parse", "save", "audio")


def _queue_snapshot(broker, name):
    """Describe one Celery queue: its Redis key, length and leading items.

    ``broker`` is a Redis client, or ``None`` when no broker is configured.
    Any failure is reported inside the entry (``length == "ERR"`` and the
    message in ``items``) instead of being raised.
    """
    key = f"celery:{name}"
    entry = {"name": name, "redis_key": key}

    if broker is None:
        entry.update(length="ERR", items=["REDIS_BROKER is not set"])
        return entry

    try:
        # LRANGE bounds are inclusive: indices 0..30 are up to 31 entries.
        entry.update(length=broker.llen(key), items=broker.lrange(key, 0, 30))
    except Exception as e:
        entry.update(length="ERR", items=[str(e)])
    return entry


@app.route("/debug/queues")
def debug_queues():
    """Render an HTML overview of Celery workers and queue contents.

    Worker information comes from ``celery_app.control.inspect()``; queue
    contents are read from the Redis instance named by the ``REDIS_BROKER``
    environment variable. When that variable is unset every queue is shown
    with an error entry.
    """
    insp = celery_app.control.inspect()

    workers_active = insp.active() or {}
    workers_scheduled = insp.scheduled() or {}
    workers_reserved = insp.reserved() or {}

    # ---- Redis connection ----
    # Named `broker` so it does not shadow the module-level state client `r`.
    redis_url = os.getenv("REDIS_BROKER")
    broker = None
    if redis_url:
        parsed = urlparse(redis_url)
        broker = redis.Redis(
            host=parsed.hostname,
            port=parsed.port,
            db=int(parsed.path.strip("/") or 0),
            decode_responses=True,
        )

    queues = [_queue_snapshot(broker, name) for name in _DEBUG_QUEUE_NAMES]

    return render_template(
        "debug/queues.html",
        queues=queues,
        workers_active=workers_active,
        workers_reserved=workers_reserved,
        workers_scheduled=workers_scheduled,
    )


# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
@logcall
def getStatus(book_id):
    """Build the status summary of one book from its state hash.

    Reads the ``book:<book_id>:state`` hash. Missing or empty counters count
    as 0, a missing status becomes ``"unknown"`` and a missing title falls
    back to the book id. ``audio_total`` mirrors ``chapters_total``.
    """
    state = r.hgetall(f"book:{book_id}:state") or {}

    def counter(field):
        # Treat an absent or empty value as 0.
        return int(state.get(field) or 0)

    dl_total = counter("chapters_total")

    return {
        "book_id": book_id,
        "title": state.get("title") or book_id,
        "status": state.get("status") or "unknown",
        "download_done": counter("chapters_download_done"),
        "download_skipped": counter("chapters_download_skipped"),
        "download_total": dl_total,
        "audio_done": counter("audio_done"),
        "audio_total": dl_total,
    }


@logcall
def list_active_books():
    """Return the status summary of every book that has a state hash.

    Scans for ``book:*:state`` keys and takes the book id from the segment
    between the first and second colon of each key.
    """
    return [
        getStatus(key.split(":", 2)[1])
        for key in r.scan_iter(match="book:*:state", count=1000)
    ]


# =====================================================
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "5000"))

    log_debug(f"[WEB] Starting Flask server on {host}:{port}, debug={debug}")
    app.run(host=host, port=port, debug=debug)