# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
"""Flask web front-end for the book scraper.

Provides the HTML dashboard pages, the action routes (init / start / abort /
hide), the JSON API routes and a handful of debug views.
"""
from dotenv import load_dotenv

# Load .env first; presumably the project modules imported below read their
# configuration from the environment at import time.
load_dotenv()

import os
from urllib.parse import urlparse

import redis
from flask import (
    Flask,
    render_template,
    request,
    jsonify,
    send_from_directory,
    redirect,
    url_for,
)

print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from celery.result import AsyncResult

from db.db import init_db
from db.repository import (
    get_registered_books,
    fetch_book,
    fetch_all_books,
    get_progress,
)

from logbus.publisher import log
from scraper.logger import log_debug

from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import inspect_books_state, sync_books_from_redis
from scraper.services.init_service import InitService

# INIT DB
init_db()

app = Flask(__name__)


# =====================================================
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")


@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
    """Serve a file from OUTPUT_ROOT inline (not as an attachment)."""
    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)


# =====================================================
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
@logcall
def index():
    """Redirect the site root to the dashboard."""
    return redirect(url_for("dashboard"))


@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
    """Render the dashboard with active books, registered books and UI logs."""
    logs_list = get_ui_logs() or []

    registered_books = get_registered_books()
    log(f"[WEB] Registered books: {registered_books}")

    # NOTE(review): unlike the other views, this one passes the registered
    # books unfiltered (hidden books included) — confirm this is intended.
    return render_template(
        "dashboard/dashboard.html",
        books=list_active_books(),
        registered=registered_books,
        logs=logs_list,
    )


@app.route("/book/<book_idx>")
@logcall
def book_detail(book_idx):
    """Render the detail page for one book; the title falls back to its id."""
    title = r.get(f"book:{book_idx}:title") or book_idx
    return render_template(
        "dashboard/book_detail.html",
        book_id=book_idx,
        title=title,
        logs=get_ui_logs(),
    )


# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
# =====================================================
@app.route("/init", methods=["POST"])
@logcall
def init_book():
    """Register one or more books from the submitted URL(s).

    Reads the ``urls`` form field (one URL per line) or the legacy single
    ``url`` field. Each URL is handled independently: already-known books
    are skipped, failures are recorded per URL, and the dashboard is
    re-rendered with a summary message.
    """
    # -------------------------------------------------
    # Accept single URL (legacy) OR multi-line URLs
    # -------------------------------------------------
    raw_urls = request.form.get("urls") or request.form.get("url") or ""
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]

    if not urls:
        return render_template(
            "dashboard/dashboard.html",
            error="Geen URL(s) opgegeven.",
            books=list_active_books(),
            registered=get_registered_books(),
            logs=get_ui_logs(),
        )

    # -------------------------------------------------
    # Duplicate check: existing book_ids
    # -------------------------------------------------
    existing_books = {b["book_idx"] for b in fetch_all_books()}
    results = []

    # -------------------------------------------------
    # Process each URL independently
    # -------------------------------------------------
    for url in urls:
        try:
            book_id = InitService.derive_book_id(url)

            if book_id in existing_books:
                results.append(
                    {
                        "url": url,
                        "status": "skipped",
                        "book_id": book_id,
                        "message": "Al geregistreerd",
                    }
                )
                continue

            result = InitService.execute(url)
            # Remember the id so the same URL repeated later in this
            # submission is skipped instead of being registered twice.
            existing_books.add(book_id)
            results.append(
                {
                    "url": url,
                    "status": "registered",
                    "book_id": result.get("book_id"),
                    "title": result.get("title"),
                }
            )
        except Exception as e:
            # Best-effort per URL: one failure must not abort the batch.
            log_debug(f"[INIT] ERROR for url={url}: {e}")
            results.append(
                {
                    "url": url,
                    "status": "error",
                    "error": str(e),
                }
            )

    # -------------------------------------------------
    # Summary message
    # -------------------------------------------------
    ok = sum(1 for item in results if item["status"] == "registered")
    skipped = sum(1 for item in results if item["status"] == "skipped")
    failed = sum(1 for item in results if item["status"] == "error")

    message = f"Geregistreerd: {ok}, overgeslagen: {skipped}, fouten: {failed}"

    return render_template(
        "dashboard/dashboard.html",
        message=message,
        init_results=results,  # optional, for display in the UI
        books=list_active_books(),
        registered=_visible_registered_books(),
        logs=get_ui_logs(),
    )


@app.route("/hide/<book_idx>", methods=["POST"])
@logcall
def hide_registered_book(book_idx):
    """Placeholder for hiding a registered book; intentionally disabled.

    Returns an explicit 501 response: a view that returns ``None`` makes
    Flask fail with a 500 error.
    """
    return jsonify({"status": "error", "message": "hide is disabled"}), 501


@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
    """Dispatch the full-scrape controller task for the submitted book.

    Returns a JSON error (400 / 404 / 500) when the ``book_idx`` form field
    is missing, the book cannot be fetched or has no ``book_url``, or the
    task cannot be dispatched; otherwise re-renders the dashboard with the
    id of the dispatched task.
    """
    # 1) Form field: book_idx
    book_idx = request.form.get("book_idx")
    log(f"[WEB][START] Received start request for book_idx={book_idx}")

    if not book_idx:
        msg = "book_idx ontbreekt in formulier"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 400

    # 2) Fetch the book from SQLite
    try:
        book = fetch_book(book_idx)
        log(f"[WEB][START] Fetched book from DB: {book}")
    except Exception as e:
        log(f"[WEB][START] DB ERROR: {e}")
        return jsonify({"status": "error", "message": "DB fout"}), 500

    if not book:
        msg = f"Boek '{book_idx}' niet gevonden in DB"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 404

    # 3) The book must have a URL
    url = book.get("book_url")
    if not url:
        msg = f"Boek '{book_idx}' heeft geen book_url in DB"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 500

    # 4) Reset UI logs
    reset_ui_logs()

    # 5) Logging
    log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
    log_debug(f"[WEB][START] DEBUG: book data = {book}")

    # 6) Start the Celery controller task
    try:
        async_result = celery_app.send_task(
            "scraper.tasks.controller_tasks.start_full_scrape",
            args=[book_idx],
            queue="controller",
        )
    except Exception as e:
        log(f"[WEB][START] Celery ERROR: {e}")
        return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500

    # 7) Successfully dispatched task
    log(f"[WEB][START] Task dispatched: {async_result.id}")

    return render_template(
        "dashboard/dashboard.html",
        scraping_task_id=async_result.id,
        books=list_active_books(),
        registered=_visible_registered_books(),
        logs=get_ui_logs(),
    )


@app.route("/abort/<book_idx>", methods=["POST"])
@logcall
def abort_download(book_idx):
    """Set the abort flag for a book and acknowledge with JSON."""
    log_debug(f"[WEB] Abort requested for book: {book_idx}")
    set_abort(book_idx)
    return jsonify({"status": "ok", "aborted": book_idx})


# =====================================================
# SECTION 3 — API ROUTES (JSON)
# =====================================================
@app.route("/api/state/all", methods=["GET"])
@logcall
def api_state_all():
    """
    Returns the merged SQL + Redis state for all books
    (same logic as /debug/inspect_state but JSON-only).
    """
    return jsonify(inspect_books_state())


@app.route("/api/books")
@logcall
def api_books():
    """Return the status of every active book as JSON."""
    return jsonify(list_active_books())


@app.route("/api/book/<book_idx>/status")
@logcall
def api_book_status(book_idx):
    """Return the status summary of one book as JSON."""
    return jsonify(getStatus(book_idx))


@app.route("/api/book/<book_idx>/logs")
@logcall
def api_book_logs(book_idx):
    """Return the full ``logs:<book_idx>`` list as JSON."""
    book_logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
    return jsonify(book_logs)


@app.route("/progress/<book_idx>")
@logcall
def progress(book_idx):
    """Return ``get_progress(book_idx)`` as JSON."""
    return jsonify(get_progress(book_idx))


@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
    """Report whether a Celery task finished, with its result or a failure marker."""
    result = AsyncResult(task_id, app=celery_app)
    if result.successful():
        return jsonify({"ready": True, "result": result.get()})
    if result.failed():
        return jsonify({"ready": True, "error": "failed"})
    return jsonify({"ready": False})


@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
    """Reset the UI logs."""
    reset_ui_logs()
    return jsonify({"status": "ok"})


@app.route("/logs", methods=["GET"])
@logcall
def logs():
    """Return the UI log lines that follow the ``last_index`` query argument.

    The response holds the new lines plus the index of the last line
    returned; when nothing is new, ``last`` echoes the requested index.
    """
    try:
        last_index = int(request.args.get("last_index", -1))
    except (TypeError, ValueError):
        # Unparsable index: start from the beginning.
        last_index = -1

    all_logs = get_ui_logs() or []

    # Clamp so an index below -1 cannot turn into a negative (tail) slice.
    start = max(last_index + 1, 0)
    new_lines = all_logs[start:]
    new_last = len(all_logs) - 1 if new_lines else last_index

    return jsonify({"lines": new_lines, "last": new_last})


# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
    """Run ``sync_books_from_redis`` and return its result."""
    results = sync_books_from_redis()
    return {"status": "ok", "synced": results}


@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
    """Render the result of ``inspect_books_state`` as HTML."""
    results = inspect_books_state()
    return render_template("debug/inspect_state.html", results=results)


@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
    """Dump every key of the state store with its value as JSON.

    Keys whose value cannot be read with ``get`` are reported as an empty
    string.
    """
    results = {}
    for k in r.scan_iter(match="*", count=200):
        try:
            results[k] = r.get(k)
        except Exception:
            # Best-effort dump: keep the key, blank the unreadable value.
            results[k] = ""
    return jsonify(results)


# =====================================================
# DB DEBUG
# =====================================================
@app.route("/api/db/books")
@logcall
def api_db_books():
    """Return all books from the database, or a JSON error with status 500."""
    try:
        books = fetch_all_books()
        return jsonify({"status": "ok", "books": books})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
@app.route("/debug/queues")
def debug_queues():
    """Render worker activity and the contents of the known queues.

    Worker data comes from Celery's inspect API; queue lengths and the
    first items are read from the Redis instance named by ``REDIS_BROKER``.
    """
    insp = celery_app.control.inspect()

    workers_active = insp.active() or {}
    workers_scheduled = insp.scheduled() or {}
    workers_reserved = insp.reserved() or {}

    # An unset REDIS_BROKER must not crash URL parsing; fall back to the
    # conventional local Redis address.
    parsed = urlparse(os.getenv("REDIS_BROKER") or "")
    r2 = redis.Redis(
        host=parsed.hostname or "localhost",
        port=parsed.port or 6379,
        db=int(parsed.path.strip("/") or 0),
        decode_responses=True,
    )

    queue_names = ["scraping", "controller", "download", "parse", "save", "audio"]

    queues = []
    try:
        for q in queue_names:
            key = f"celery:{q}"
            try:
                queues.append(
                    {
                        "name": q,
                        "redis_key": key,
                        "length": r2.llen(key),
                        "items": r2.lrange(key, 0, 30),
                    }
                )
            except Exception as e:
                # Show the failure in the table instead of breaking the page.
                queues.append(
                    {
                        "name": q,
                        "redis_key": key,
                        "length": "ERR",
                        "items": [str(e)],
                    }
                )
    finally:
        # This client is created per request; release its connections.
        r2.connection_pool.disconnect()

    return render_template(
        "debug/queues.html",
        queues=queues,
        workers_active=workers_active,
        workers_reserved=workers_reserved,
        workers_scheduled=workers_scheduled,
    )


# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
def _visible_registered_books():
    """Return the registered books whose status is not ``"hidden"``."""
    return [b for b in get_registered_books() if b.get("status") != "hidden"]


@logcall
def getStatus(book_idx):
    """Build the status summary dict for one book from its state hash.

    Missing or empty counters count as 0, a missing status becomes
    ``"unknown"`` and a missing title falls back to the book id.
    ``audio_total`` mirrors ``download_total``.
    """
    fields = r.hgetall(f"book:{book_idx}:state") or {}

    dl_total = int(fields.get("chapters_total") or 0)

    return {
        "book_id": book_idx,
        "title": fields.get("title") or book_idx,
        "status": fields.get("status") or "unknown",
        "download_done": int(fields.get("chapters_download_done") or 0),
        "download_skipped": int(fields.get("chapters_download_skipped") or 0),
        "download_total": dl_total,
        "audio_done": int(fields.get("audio_done") or 0),
        "audio_total": dl_total,
    }


@logcall
def list_active_books():
    """Return the status summary of every book that has a state key."""
    books = []
    for key in r.scan_iter(match="book:*:state", count=1000):
        # Key layout is "book:<book_idx>:state"; take the middle segment.
        book_idx = key.split(":", 2)[1]
        books.append(getStatus(book_idx))
    return books


# =====================================================
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "5000"))

    log_debug(f"[WEB] Starting Flask server on {host}:{port}, debug={debug}")
    app.run(host=host, port=port, debug=debug)