diff --git a/bookscraper/README.md b/bookscraper/README.md index 103eded..ab2775e 100644 --- a/bookscraper/README.md +++ b/bookscraper/README.md @@ -135,6 +135,16 @@ docker compose down docker compose build docker compose up +docker compose up -d + +docker compose build web && docker compose up web + +docker compose build worker_download && docker compose up worker_download + +docker compose up web +docker compose build web +docker compose restart web + tar \ --exclude="**pycache**" \ --exclude="_/**pycache**/_" \ diff --git a/bookscraper/app.py b/bookscraper/app.py index 0367408..ce713d6 100644 --- a/bookscraper/app.py +++ b/bookscraper/app.py @@ -6,29 +6,43 @@ from dotenv import load_dotenv load_dotenv() import os -import redis -from flask import Flask, render_template, request, jsonify, send_from_directory +from flask import ( + Flask, + render_template, + request, + jsonify, + send_from_directory, + redirect, + url_for, +) print(">>> [WEB] Importing celery_app …") from celery_app import celery_app -from db.db import init_db from celery.result import AsyncResult +from db.db import init_db +from db.repository import ( + get_registered_books, + fetch_book, + fetch_all_books, + get_progress, +) + from scraper.logger import log_debug from scraper.abort import set_abort -from scraper.progress import get_progress from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta from scraper.state import state as r - +from scraper.logger_decorators import logcall +from scraper.utils.state_sync import sync_books_from_redis from scraper.services.init_service import InitService -from db.repository import get_registered_books # INIT DB init_db() app = Flask(__name__) + # ===================================================== # STATIC FILE SERVING # ===================================================== @@ -36,6 +50,7 @@ OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") @app.route("/output/") +@logcall def serve_output(filename): return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False) @@ -46,22 +61,29 @@ def serve_output(filename): @app.route("/", methods=["GET"]) +@logcall def index(): - return render_template("index.html") + return redirect(url_for("dashboard")) @app.route("/dashboard", methods=["GET"]) +@logcall def dashboard(): logs_list = get_ui_logs() or [] + + # Filter hidden books ONLY for GUI + reg = [b for b in get_registered_books() if b.get("status") != "hidden"] + return render_template( "dashboard/dashboard.html", books=list_active_books(), # Redis - registered=get_registered_books(), # SQLite INIT results + registered=reg, # SQLite (filtered) logs=logs_list, ) @app.route("/book/") +@logcall def book_detail(book_id): title = r.get(f"book:{book_id}:title") or book_id return render_template( @@ -73,20 +95,19 @@ def book_detail(book_id): # ===================================================== -# SECTION 2 — ACTION ROUTES (INIT, START, ABORT) +# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE) # ===================================================== -# CORRECT PATH — services/ is root-level - @app.route("/init", methods=["POST"]) +@logcall def init_book(): """ INIT-flow: - user enters URL - - lightweight metadata fetch + - metadata fetch - insert into SQLite as 'registered' - - return dashboard HTML (NOT JSON) + - return dashboard """ url = request.form.get("url", "").strip() @@ -103,39 +124,63 @@ def init_book(): result = InitService.execute(url) msg = f"Boek geregistreerd: {result.get('title')}" + reg = [b for b in get_registered_books() if 
b.get("status") != "hidden"] + return render_template( "dashboard/dashboard.html", message=msg, - books=list_active_books(), # Redis - registered=get_registered_books(), # SQLite INIT results + books=list_active_books(), + registered=reg, logs=get_ui_logs(), ) except Exception as e: log_debug(f"[INIT] ERROR: {e}") + reg = [b for b in get_registered_books() if b.get("status") != "hidden"] return render_template( "dashboard/dashboard.html", error=f"INIT mislukt: {e}", books=list_active_books(), - registered=get_registered_books(), + registered=reg, logs=get_ui_logs(), ) +@app.route("/hide/", methods=["POST"]) +@logcall +def hide_registered_book(book_id): + """ + Soft-delete/hide voor GUI. + De DB blijft intact. + """ + # try: + # hide_book(book_id) + # return redirect("/dashboard") + # # return jsonify({"status": "ok", "hidden": book_id}) + # except Exception as e: + # return jsonify({"status": "error", "message": str(e)}), 500 + + @app.route("/start", methods=["POST"]) +@logcall def start_scraping(): - url = request.form.get("url", "").strip() + """ + Start FULL scraping vanuit een geregistreerd INIT-record. + """ + book_id = request.form.get("book_id") + if not book_id: + return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400 + + book = fetch_book(book_id) + if not book: + return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404 + + url = book.get("book_url") if not url: - return render_template( - "dashboard/dashboard.html", - error="Geen URL opgegeven.", - books=list_active_books(), - registered=get_registered_books(), - logs=get_ui_logs(), - ) + return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500 reset_ui_logs() - log_debug(f"[WEB] Scraping via Celery: {url}") + log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}") async_result = celery_app.send_task( "scraper.tasks.scraping.start_scrape_book", @@ -143,16 +188,19 @@ def start_scraping(): queue="scraping", ) + reg = [b for b in get_registered_books() if b.get("status") != "hidden"] + return render_template( "dashboard/dashboard.html", scraping_task_id=async_result.id, books=list_active_books(), - registered=get_registered_books(), + registered=reg, logs=get_ui_logs(), ) @app.route("/abort/", methods=["POST"]) +@logcall def abort_download(book_id): log_debug(f"[WEB] Abort requested for book: {book_id}") set_abort(book_id) @@ -165,27 +213,32 @@ def abort_download(book_id): @app.route("/api/books") +@logcall def api_books(): return jsonify(list_active_books()) @app.route("/api/book//status") +@logcall def api_book_status(book_id): return jsonify(getStatus(book_id)) @app.route("/api/book//logs") +@logcall def api_book_logs(book_id): logs = r.lrange(f"logs:{book_id}", 0, -1) or [] return jsonify(logs) @app.route("/progress/") +@logcall def progress(book_id): return jsonify(get_progress(book_id)) @app.route("/celery-result/") +@logcall def celery_result(task_id): result = AsyncResult(task_id, app=celery_app) if result.successful(): @@ -196,32 +249,66 @@ def celery_result(task_id): @app.route("/clear-logs", methods=["POST"]) +@logcall def clear_logs(): reset_ui_logs() - return jsonify({"status": "ok", "message": "UI logs cleared"}) + return jsonify({"status": "ok"}) @app.route("/logs", methods=["GET"]) +@logcall def logs(): + # LAST_LOG_INDEX vanuit de client (default = -1 bij eerste call) try: last_index = int(request.args.get("last_index", -1)) except: last_index = -1 - new_lines, total = get_ui_logs_delta(last_index) - return jsonify({"lines": new_lines, "total": 
total}) + # Haal volledige huidige loglijst op + all_logs = get_ui_logs() or [] + + # Delta: alle regels met index > last_index + new_lines = [] + new_last = last_index + + for idx, line in enumerate(all_logs): + if idx > last_index: + new_lines.append(line) + new_last = idx + + return jsonify({"lines": new_lines, "last": new_last}) # ===================================================== # SECTION 4 — DEBUG ROUTES # ===================================================== +@app.route("/debug/sync_state", methods=["GET"]) +def debug_sync_state(): + results = sync_books_from_redis() + return {"status": "ok", "synced": results} + + +from scraper.utils.state_sync import inspect_books_state + + +@app.route("/debug/inspect_state", methods=["GET"]) +def debug_inspect_state(): + """ + Shows: + - raw SQLite values, + - raw Redis values, + - what the merged result WOULD be. + No writes happen. + """ + results = inspect_books_state() + return render_template("debug/inspect_state.html", results=results) @app.route("/debug/redis-keys") +@logcall def debug_redis_keys(): cursor = 0 results = {} - while True: cursor, keys = r.scan(cursor, match="*", count=200) for k in keys: @@ -231,22 +318,17 @@ def debug_redis_keys(): results[k] = "" if cursor == 0: break - return jsonify(results) # ===================================================== -# DB DEBUG: LIST ALL BOOKS FROM SQLITE +# DB DEBUG # ===================================================== -from db.repository import fetch_all_books @app.route("/api/db/books") +@logcall def api_db_books(): - """ - Return ALL books stored in SQLite — including INIT-only entries. - Useful to verify that /init wrote correct metadata. - """ try: books = fetch_all_books() return jsonify({"status": "ok", "books": books}) @@ -254,11 +336,74 @@ def api_db_books(): return jsonify({"status": "error", "message": str(e)}), 500 +# ============================================= +# DEBUG QUEUE VIEW (HTML) +# ============================================= +from flask import render_template +from urllib.parse import urlparse +import redis +import os +from celery_app import celery_app + + +@app.route("/debug/queues") +def debug_queues(): + insp = celery_app.control.inspect() + + workers_active = insp.active() or {} + workers_scheduled = insp.scheduled() or {} + workers_reserved = insp.reserved() or {} + + # ---- Redis connection ---- + redis_url = os.getenv("REDIS_BROKER") + parsed = urlparse(redis_url) + + r = redis.Redis( + host=parsed.hostname, + port=parsed.port, + db=int(parsed.path.strip("/") or 0), + decode_responses=True, + ) + + queue_names = ["scraping", "controller", "download", "parse", "save", "audio"] + + queues = [] + for q in queue_names: + key = f"celery:{q}" + try: + queues.append( + { + "name": q, + "redis_key": key, + "length": r.llen(key), + "items": r.lrange(key, 0, 30), # first 30 entries + } + ) + except Exception as e: + queues.append( + { + "name": q, + "redis_key": key, + "length": "ERR", + "items": [str(e)], + } + ) + + return render_template( + "debug/queues.html", + queues=queues, + workers_active=workers_active, + workers_reserved=workers_reserved, + workers_scheduled=workers_scheduled, + ) + + # ===================================================== # SECTION 5 — INTERNAL HELPERS # ===================================================== +@logcall def getStatus(book_id): state = r.hgetall(f"book:{book_id}:state") status = state.get("status") or "unknown" @@ -280,6 +425,7 @@ def getStatus(book_id): } +@logcall def list_active_books(): books = [] for key in 
r.scan_iter(match="book:*:state", count=1000): diff --git a/bookscraper/audio_worker_local.py b/bookscraper/audio_worker_local.py index 4affa93..229e2f0 100644 --- a/bookscraper/audio_worker_local.py +++ b/bookscraper/audio_worker_local.py @@ -54,6 +54,7 @@ def main(): "-l", "INFO", "--pool=prefork", + "--concurrency=2", ] print("[AUDIO-LOCAL] Launching Celery via subprocess…") diff --git a/bookscraper/celery_app.py b/bookscraper/celery_app.py index adf08bd..2aa97ba 100644 --- a/bookscraper/celery_app.py +++ b/bookscraper/celery_app.py @@ -32,6 +32,17 @@ celery_app = Celery( ], ) +# >>>>> PLAATS DIT HIER <<<<< +celery_app.conf.update( + worker_redirect_stdouts_level="WARNING", + task_send_sent_event=False, + resultrepr_maxsize=0, + worker_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s", + worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s", +) +# >>>>> TOT HIER <<<<< + + celery_app.conf.task_routes = { "scraper.tasks.scraping.*": {"queue": "scraping"}, "scraper.tasks.controller_tasks.*": {"queue": "controller"}, diff --git a/bookscraper/db/db.py b/bookscraper/db/db.py index 4706d0e..f9d4363 100644 --- a/bookscraper/db/db.py +++ b/bookscraper/db/db.py @@ -60,9 +60,10 @@ def init_db(): book_id TEXT PRIMARY KEY, title TEXT, author TEXT, - + description TEXT, cover_url TEXT, cover_path TEXT, + book_url TEXT, chapters_total INTEGER, @@ -72,6 +73,7 @@ def init_db(): audio_done INTEGER DEFAULT 0, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + processdate DATETIME, last_update DATETIME ); """ @@ -79,14 +81,38 @@ def init_db(): conn.commit() # -------------------------------------------------------- - # SCHEMA UPGRADE: add description column if missing + # SCHEMA UPGRADE UTIL # -------------------------------------------------------- + def add_column(name, type_): + try: + conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};") + except: + pass # column already exists + cols = conn.execute("PRAGMA table_info(books);").fetchall() colnames = [c[1] for c in cols] - if "description" not in colnames: - conn.execute("ALTER TABLE books ADD COLUMN description TEXT;") - conn.commit() + # Existing: ensure new metadata fields exist + add_column("description", "TEXT") + add_column("cover_path", "TEXT") + add_column("book_url", "TEXT") + + # -------------------------------------------------------- + # NEW FIELDS — MATCH REDIS STATE MODEL (future-proof) + # These do NOT change logic, but enable repository snapshot sync. + # -------------------------------------------------------- + + # Download counters + add_column("chapters_download_done", "INTEGER DEFAULT 0") + add_column("chapters_download_skipped", "INTEGER DEFAULT 0") + + # Audio counters + add_column("audio_skipped", "INTEGER DEFAULT 0") + + # Optional future fields + add_column("audio_total", "INTEGER DEFAULT 0") + + conn.commit() # ------------------------------------------------------------ @@ -124,6 +150,5 @@ def _raw_get_book(book_id): def _raw_get_all_books(): conn = get_db() - # unchanged cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;") return [dict(row) for row in cur.fetchall()] diff --git a/bookscraper/db/repository.py b/bookscraper/db/repository.py index b34d871..faefc94 100644 --- a/bookscraper/db/repository.py +++ b/bookscraper/db/repository.py @@ -1,60 +1,164 @@ # ============================================================ # File: db/repository.py # Purpose: -# High-level BookScraper database interface. 
-#     This is the ONLY module Celery tasks and Flask should use.
+#     Unified façade for BookScraper database state.
 #
-# New additions for INIT-flow:
-#     - register_book()
-#     - update_book_after_full_scrape()
-#     - get_registered_books()
-#     - get_active_books()
-#
-# Existing functions remain unchanged for backward compatibility.
+# Responsibilities:
+#     - Route metadata  → SQLite
+#     - Route counters  → Redis (live) + SQLite (snapshot)
+#     - Provide a clean API for tasks and Flask UI
 # ============================================================

-from db.db import (
-    upsert_book,
-    _raw_get_book,
-    _raw_get_all_books,
-    get_db,
+from scraper.logger_decorators import logcall
+from logbus.publisher import log
+
+import redis
+import os
+import time
+
+# ============================================================
+# SQL low-level engines (snapshot storage)
+# ============================================================
+from db.state_sql import (
+    sql_fetch_book,
+    sql_fetch_all_books,
+    sql_set_status,
+    sql_set_chapters_total,
+    sql_register_book,
+    sql_update_book,
+    sql_inc_downloaded,
+    sql_inc_parsed,
+    sql_inc_audio_done,
+    sql_inc_audio_skipped,
+)
+
+# ============================================================
+# REDIS low-level engines (live counters)
+# ============================================================
+from db.state_redis import (
+    redis_set_status,
+    redis_set_chapters_total,
+    redis_inc_download_done,
+    redis_inc_download_skipped,
+    redis_inc_parsed_done,
+    redis_inc_audio_done,
+    redis_inc_audio_skipped,
 )

+# ============================================================
+# Redis setup for legacy progress paths
+# ============================================================
+REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
+_r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
+
+
+# ============================================================
+# INTERNAL — legacy progress helpers
+# ============================================================
+def _legacy_set_total(book_id, total):
+    _r.set(f"progress:{book_id}:total", total)
+
+
+def _legacy_inc_completed(book_id):
+    _r.incr(f"progress:{book_id}:completed")
+
+
+def _legacy_inc_skipped(book_id):
+    _r.incr(f"progress:{book_id}:skipped")
+
+
+def _legacy_inc_failed(book_id):
+    _r.incr(f"progress:{book_id}:failed")

-# ------------------------------------------------------------
-# FETCH OPERATIONS
-# ------------------------------------------------------------
+
+def _legacy_add_failed_chapter(book_id, chapter, reason):
+    entry = f"Chapter {chapter}: {reason}"
+    _r.rpush(f"progress:{book_id}:failed_list", entry)
+
+
+def _legacy_get_failed_list(book_id):
+    return _r.lrange(f"progress:{book_id}:failed_list", 0, -1)
+
+
+def _legacy_get_progress(book_id):
+    total = int(_r.get(f"progress:{book_id}:total") or 0)
+    completed = int(_r.get(f"progress:{book_id}:completed") or 0)
+    skipped = int(_r.get(f"progress:{book_id}:skipped") or 0)
+    failed = int(_r.get(f"progress:{book_id}:failed") or 0)
+    abort = _r.exists(f"abort:{book_id}") == 1
+    failed_list = _legacy_get_failed_list(book_id)
+
+    return {
+        "book_id": book_id,
+        "total": total,
+        "completed": completed,
+        "skipped": skipped,
+        "failed": failed,
+        "failed_list": failed_list,
+        "abort": abort,
+    }
+
+
+# ============================================================
+# PUBLIC — UI-ready legacy progress access
+# ============================================================
+@logcall
+def get_progress(book_id):
+    return _legacy_get_progress(book_id)
+
+
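+# Illustrative example of the dict produced by get_progress(); this is what the
+# Flask progress endpoint returns as JSON. The keys mirror _legacy_get_progress()
+# above, and the sample values below are invented:
+#
+#     get_progress("12345") == {
+#         "book_id": "12345",
+#         "total": 120,          # progress:12345:total
+#         "completed": 87,       # progress:12345:completed
+#         "skipped": 3,          # progress:12345:skipped
+#         "failed": 1,           # progress:12345:failed
+#         "failed_list": ["Chapter 44: HTTP 500"],
+#         "abort": False,        # True while the abort:12345 key exists
+#     }
+
+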
+@logcall +def add_failed_chapter(book_id, chapter, reason): + _legacy_add_failed_chapter(book_id, chapter, reason) + + +@logcall +def get_failed_list(book_id): + return _legacy_get_failed_list(book_id) + + +# ============================================================ +# FETCH OPERATIONS (SQLite snapshot) +# ============================================================ +@logcall def fetch_book(book_id): - """Return a single book dict or None.""" - return _raw_get_book(book_id) + return sql_fetch_book(book_id) +@logcall def fetch_all_books(): - """Return all books ordered newest → oldest.""" - return _raw_get_all_books() + return sql_fetch_all_books() # ============================================================ -# NEW — INIT-FLOW SUPPORT +# INIT-FLOW (SQLite metadata only) # ============================================================ +@logcall +def register_book( + book_id, + title, + author=None, + description=None, + cover_url=None, + cover_path=None, + book_url=None, +): - -def register_book(book_id, title, author=None, description=None, cover_url=None): - """ - Create a new book entry with initial metadata. - Called when user enters a URL and presses INIT. - """ fields = { "title": title, "author": author, "description": description, "cover_url": cover_url, + "cover_path": cover_path, + "book_url": book_url, "chapters_total": 0, "status": "registered", } - upsert_book(book_id, **fields) + log(f"[DB] Registering new book={book_id} title='{title}'") + sql_register_book(book_id, fields) +@logcall def update_book_after_full_scrape( book_id, title=None, @@ -63,10 +167,6 @@ def update_book_after_full_scrape( cover_url=None, chapters_total=None, ): - """ - Called after a FULL scrape when chapters are known. - Moves the book into 'active' state. - """ fields = {} if title is not None: @@ -82,94 +182,122 @@ def update_book_after_full_scrape( fields["status"] = "active" - upsert_book(book_id, **fields) + log(f"[DB] update full scrape metadata book={book_id}") + sql_update_book(book_id, fields) +# ============================================================ +# ACTIVE BOOK LISTS +# ============================================================ +@logcall def get_registered_books(): - """ - Return books registered but not yet scraped. - """ - conn = get_db() - cur = conn.execute( - """SELECT * FROM books WHERE status='registered' - ORDER BY created_at DESC""" - ) - return [dict(row) for row in cur.fetchall()] + all_books = sql_fetch_all_books() + return [b for b in all_books if b.get("status") == "registered"] +@logcall def get_active_books(): - """ - Return books currently in progress. 
- """ - conn = get_db() - cur = conn.execute( - """SELECT * FROM books - WHERE status IN ('active', 'downloading') - ORDER BY created_at DESC""" - ) - return [dict(row) for row in cur.fetchall()] - - -# ------------------------------------------------------------ -# BOOK CREATION / METADATA (existing) -# ------------------------------------------------------------ -def create_or_update_book( - book_id, - title=None, - author=None, - chapters_total=None, - cover_url=None, - cover_path=None, - status=None, -): - fields = {} + all_books = sql_fetch_all_books() + log(f"[DB] Fetched all books for active filter, total={len(all_books)}") + return [b for b in all_books if b.get("status") in ("active", "downloading")] - if title is not None: - fields["title"] = title - if author is not None: - fields["author"] = author - if chapters_total is not None: - fields["chapters_total"] = chapters_total - if cover_url is not None: - fields["cover_url"] = cover_url - if cover_path is not None: - fields["cover_path"] = cover_path - if status is not None: - fields["status"] = status - if fields: - upsert_book(book_id, **fields) +# ============================================================ +# STATUS MANAGEMENT +# ============================================================ +@logcall +def set_status(book_id, status): + log(f"[DB] Setting status for {book_id} to '{status}'") + redis_set_status(book_id, status) + sql_set_status(book_id, status) -# ------------------------------------------------------------ -# STATUS MANAGEMENT (existing) -# ------------------------------------------------------------ -def set_status(book_id, status): - upsert_book(book_id, status=status) +# ============================================================ +# CHAPTER TOTALS +# ============================================================ +@logcall +def set_chapters_total(book_id, total): + log(f"[DB] Setting chapter total for {book_id} to {total}") + redis_set_chapters_total(book_id, total) + sql_set_chapters_total(book_id, total) + _legacy_set_total(book_id, total) # integrate legacy progress + + +# ============================================================ +# COUNTERS — DOWNLOAD +# ============================================================ +@logcall +def inc_download_done(book_id, amount=1): + log(f"[DB] Incrementing download done for {book_id} by {amount}") + redis_inc_download_done(book_id, amount) + sql_inc_downloaded(book_id, amount) + _legacy_inc_completed(book_id) + + +@logcall +def inc_download_skipped(book_id, amount=1): + log(f"[DB] Incrementing download skipped for {book_id} by {amount}") + redis_inc_download_skipped(book_id, amount) + _legacy_inc_skipped(book_id) + + +# ============================================================ +# COUNTERS — PARSE +# ============================================================ +@logcall +def inc_parsed_done(book_id, amount=1): + log(f"[DB] Incrementing parsed done for {book_id} by {amount}") + redis_inc_parsed_done(book_id, amount) + sql_inc_parsed(book_id, amount) -# ------------------------------------------------------------ -# INCREMENTING COUNTERS (existing — backward compat only) -# ------------------------------------------------------------ +# ============================================================ +# COUNTERS — AUDIO +# ============================================================ +# ============================================================ +# COUNTERS — AUDIO SKIPPED +# ============================================================ +@logcall +def 
inc_audio_skipped(book_id, amount=1): + log(f"[DB] Incrementing audio skipped for {book_id} by {amount}") + # Redis live counter (maak deze functie in state_redis wanneer nodig) + sql_inc_audio_skipped(book_id, amount) + redis_inc_audio_skipped(book_id, amount) + # Geen SQLite kolom? Dan overslaan. + + +@logcall +def inc_audio_done(book_id, amount=1): + log(f"[DB] Incrementing audio done for {book_id} by {amount}") + redis_inc_audio_done(book_id, amount) + sql_inc_audio_done(book_id, amount) + + +# ============================================================ +# BACKWARDS COMPATIBILITY SHIMS (old task API) +# ============================================================ + + +@logcall def inc_downloaded(book_id, amount=1): - book = _raw_get_book(book_id) - if not book: - return - cur = book.get("downloaded", 0) or 0 - upsert_book(book_id, downloaded=cur + amount) + """ + Old name used by older tasks. + Redirects to new unified counter. + """ + return inc_download_done(book_id, amount) +@logcall def inc_parsed(book_id, amount=1): - book = _raw_get_book(book_id) - if not book: - return - cur = book.get("parsed", 0) or 0 - upsert_book(book_id, parsed=cur + amount) + """ + Old name used by older tasks. + """ + return inc_parsed_done(book_id, amount) -def inc_audio_done(book_id, amount=1): - book = _raw_get_book(book_id) - if not book: - return - cur = book.get("audio_done", 0) or 0 - upsert_book(book_id, audio_done=cur + amount) +@logcall +def inc_audio_done_legacy(book_id, amount=1): + """ + Old audio name used by older tasks. + """ + return inc_audio_done(book_id, amount) diff --git a/bookscraper/db/state_redis.py b/bookscraper/db/state_redis.py new file mode 100644 index 0000000..232d7af --- /dev/null +++ b/bookscraper/db/state_redis.py @@ -0,0 +1,79 @@ +# ============================================================ +# File: db/state_redis.py +# Purpose: +# Low-level Redis counters/state for BookScraper. +# Used ONLY by db.repository façade. 
+# ============================================================
+
+import os
+import time
+import redis
+
+from logbus.publisher import log
+
+REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
+r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
+
+
+# ------------------------------------------------------------
+# STATUS
+# ------------------------------------------------------------
+def redis_set_status(book_id: str, status: str):
+    key = f"book:{book_id}:state"
+    r.hset(key, "status", status)
+    r.hset(key, "last_update", int(time.time()))
+
+
+# ------------------------------------------------------------
+# TOTAL CHAPTERS
+# ------------------------------------------------------------
+def redis_set_chapters_total(book_id: str, total: int):
+    key = f"book:{book_id}:state"
+    r.hset(key, "chapters_total", total)
+    r.hset(key, "last_update", int(time.time()))
+
+
+# ------------------------------------------------------------
+# DOWNLOAD COUNTERS
+# ------------------------------------------------------------
+def redis_inc_download_done(book_id: str, amount: int = 1):
+    key = f"book:{book_id}:state"
+    r.hincrby(key, "chapters_download_done", amount)
+    r.hset(key, "last_update", int(time.time()))
+
+
+def redis_inc_download_skipped(book_id: str, amount: int = 1):
+    log(f"[DB-REDIS] Incrementing download skipped for {book_id} by {amount}")
+    key = f"book:{book_id}:state"
+    r.hincrby(key, "chapters_download_skipped", amount)
+    r.hset(key, "last_update", int(time.time()))
+
+
+# ------------------------------------------------------------
+# PARSE COUNTERS
+# ------------------------------------------------------------
+def redis_inc_parsed_done(book_id: str, amount: int = 1):
+    key = f"book:{book_id}:state"
+    r.hincrby(key, "chapters_parsed_done", amount)
+    r.hset(key, "last_update", int(time.time()))
+
+
+# ------------------------------------------------------------
+# AUDIO COUNTERS
+# ------------------------------------------------------------
+def redis_inc_audio_done(book_id: str, amount: int = 1):
+    log(f"[DB-REDIS] Incrementing audio done for {book_id} by {amount}")
+    key = f"book:{book_id}:state"
+    r.hincrby(key, "audio_done", amount)
+    r.hset(key, "last_update", int(time.time()))
+
+
+def redis_inc_audio_skipped(book_id: str, amount: int = 1):
+    """
+    Count skipped audio chapters (timeouts, pre-existing files, abort, etc.).
+    The matching SQLite snapshot counter is updated via the db.repository façade.
+    """
+    log(f"[DB-REDIS] Incrementing audio skipped for {book_id} by {amount}")
+    key = f"book:{book_id}:state"
+    r.hincrby(key, "audio_skipped", amount)
+    r.hset(key, "last_update", int(time.time()))
diff --git a/bookscraper/db/state_sql.py b/bookscraper/db/state_sql.py
new file mode 100644
index 0000000..ec7626a
--- /dev/null
+++ b/bookscraper/db/state_sql.py
@@ -0,0 +1,165 @@
+# ============================================================
+# File: db/state_sql.py
+# Purpose:
+#     Low-level SQLite snapshot layer for BookScraper metadata.
+#     Used ONLY through db.repository façade.
+# ============================================================ + +import sqlite3 +import os + +from logbus.publisher import log + +DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/db/books.db") + + +# ------------------------------------------------------------ +# INTERNAL HELPERS +# ------------------------------------------------------------ +def _connect(): + conn = sqlite3.connect(DB_PATH) + conn.row_factory = sqlite3.Row + return conn + + +# ------------------------------------------------------------ +# FETCH +# ------------------------------------------------------------ +def sql_fetch_book(book_id): + conn = _connect() + cur = conn.cursor() + cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,)) + row = cur.fetchone() + conn.close() + return dict(row) if row else None + + +def sql_fetch_all_books(): + conn = _connect() + cur = conn.cursor() + cur.execute("SELECT * FROM books ORDER BY rowid DESC") + rows = cur.fetchall() + conn.close() + return [dict(r) for r in rows] + + +# ------------------------------------------------------------ +# REGISTER / UPDATE +# ------------------------------------------------------------ +def sql_register_book(book_id, fields: dict): + conn = _connect() + cur = conn.cursor() + + cols = ", ".join(["book_id"] + list(fields.keys())) + placeholders = ", ".join(["?"] * (1 + len(fields))) + values = [book_id] + list(fields.values()) + + cur.execute( + f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", values + ) + conn.commit() + conn.close() + + +def sql_update_book(book_id, fields: dict): + if not fields: + return + + conn = _connect() + cur = conn.cursor() + + set_clause = ", ".join([f"{k} = ?" for k in fields]) + params = list(fields.values()) + [book_id] + + cur.execute(f"UPDATE books SET {set_clause} WHERE book_id = ?", params) + conn.commit() + conn.close() + + +# ------------------------------------------------------------ +# STATUS +# ------------------------------------------------------------ +def sql_set_status(book_id, status: str): + conn = _connect() + cur = conn.cursor() + cur.execute("UPDATE books SET status = ? WHERE book_id = ?", (status, book_id)) + conn.commit() + conn.close() + + +# ------------------------------------------------------------ +# CHAPTER TOTAL (snapshot) +# ------------------------------------------------------------ +def sql_set_chapters_total(book_id, total: int): + conn = _connect() + cur = conn.cursor() + cur.execute( + "UPDATE books SET chapters_total = ? WHERE book_id = ?", (total, book_id) + ) + conn.commit() + conn.close() + + +# ------------------------------------------------------------ +# COUNTERS (SNAPSHOT-ONLY) +# ------------------------------------------------------------ +def sql_inc_downloaded(book_id, amount=1): + conn = _connect() + cur = conn.cursor() + cur.execute( + """ + UPDATE books + SET downloaded = COALESCE(downloaded,0) + ? + WHERE book_id = ? + """, + (amount, book_id), + ) + conn.commit() + conn.close() + + +def sql_inc_parsed(book_id, amount=1): + conn = _connect() + cur = conn.cursor() + cur.execute( + """ + UPDATE books + SET parsed = COALESCE(parsed,0) + ? + WHERE book_id = ? + """, + (amount, book_id), + ) + conn.commit() + conn.close() + + +def sql_inc_audio_done(book_id, amount=1): + log(f"[DB-SQL] Incrementing audio done for {book_id} by {amount}") + conn = _connect() + cur = conn.cursor() + cur.execute( + """ + UPDATE books + SET audio_done = COALESCE(audio_done,0) + ? + WHERE book_id = ? 
+ """, + (amount, book_id), + ) + conn.commit() + conn.close() + + +def sql_inc_audio_skipped(book_id, amount=1): + log(f"[DB-SQL] Incrementing audio skipped for {book_id} by {amount}") + conn = _connect() + cur = conn.cursor() + cur.execute( + """ + UPDATE books + SET audio_skipped = COALESCE(audio_skipped,0) + ? + WHERE book_id = ? + """, + (amount, book_id), + ) + conn.commit() + conn.close() diff --git a/bookscraper/logbus/publisher.py b/bookscraper/logbus/publisher.py index d87695a..ed699e7 100644 --- a/bookscraper/logbus/publisher.py +++ b/bookscraper/logbus/publisher.py @@ -1,9 +1,31 @@ # logbus/publisher.py import logging +import os logger = logging.getLogger("logbus") +logger.setLevel(logging.WARNING) + +# ============================================================ +# FILE LOGGER — log.txt in BOOKSCRAPER_OUTPUT_DIR +# ============================================================ +try: + root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") + os.makedirs(root, exist_ok=True) + + file_path = os.path.join(root, "log.txt") + + file_handler = logging.FileHandler(file_path, mode="a", encoding="utf-8") + file_formatter = logging.Formatter("%(message)s") # exact zoals input + file_handler.setFormatter(file_formatter) + + logger.addHandler(file_handler) + +except Exception: + # Logging naar file mag nooit de app laten crashen + pass + def log(message: str): """ diff --git a/bookscraper/scraper/progress.py b/bookscraper/scraper/_dep_progress.py similarity index 100% rename from bookscraper/scraper/progress.py rename to bookscraper/scraper/_dep_progress.py diff --git a/bookscraper/scraper/abort.py b/bookscraper/scraper/abort.py index 2df7880..2c67f75 100644 --- a/bookscraper/scraper/abort.py +++ b/bookscraper/scraper/abort.py @@ -1,6 +1,8 @@ import os import redis +from scraper.logger_decorators import logcall + # GUI log (non-breaking) from scraper.ui_log import push_ui diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py index 52c5bd7..a34976d 100644 --- a/bookscraper/scraper/book_scraper.py +++ b/bookscraper/scraper/book_scraper.py @@ -1,21 +1,66 @@ # ============================================================ # File: scraper/book_scraper.py # Purpose: -# Backwards-compatible wrapper giving same API as before. -# Uses the new engine under the hood. +# Backwards-compatible wrapper giving the SAME public API +# as the old BookScraper, but internally uses ScrapeEngine. +# +# execute() → full metadata + chapterlist +# +# (* Chapter downloading komt later in ScrapeEngine, +# maar deze wrapper hoeft NIET aangepast te worden.) # ============================================================ -from scraper.engine.parser import extract_metadata_full +from scraper.logger_decorators import logcall +from scraper.services.scrape_engine import ScrapeEngine class BookScraper: - def __init__(self, site_scraper, url): + """ + Backwards-compatible BookScraper façade. + + In het oude systeem deed BookScraper ALLES: + - metadata ophalen + - cover ophalen + - hoofdstukkenlijst + - hoofdstukken downloaden + - volume folders + - skip logic + + In het nieuwe systeem is dát opgesplitst: + + ScrapeEngine → metadata / chapterlist / download engine (in ontwikkeling) + BookScraper → behoudt dezelfde API als voorheen + + Daardoor kunnen Celery-tasks en oudere modules blijven werken + zonder refactor-chaos. 
+ """ + + @logcall + def __init__(self, site_scraper, url: str): self.site = site_scraper self.url = url + @logcall def execute(self): """ - Backwards compatible full scrape: - returns {title, author, description, cover_url, chapters, book_url} + Public legacy API. + Retourneert metadata + chapters EXACT zoals de oude BookScraper + vóór downloadfase. + + Dit is belangrijk: + - INIT-flow gebruikt metadata only + - scraping tasks gebruiken chapterlist """ - return extract_metadata_full(self.url, self.site) + + data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url) + + # Legacy output structuur volledig repliceren: + return { + "title": data.get("title"), + "author": data.get("author"), + "description": data.get("description"), + "cover_url": data.get("cover_url"), + "chapters": data.get("chapters", []), + "chapters_total": data.get("chapters_total", 0), + "book_url": data.get("book_url"), + } diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py index f3ad53f..8f5c9ed 100644 --- a/bookscraper/scraper/download_controller.py +++ b/bookscraper/scraper/download_controller.py @@ -16,14 +16,6 @@ import os import requests import shutil from scraper.abort import abort_requested # DEBUG allowed -from db.repository import create_or_update_book - -# NEW: Redis State Model (C&U) -from scraper.progress import ( - init_book_state, - set_status, - set_chapter_total, -) class DownloadController: diff --git a/bookscraper/scraper/logger_decorators.py b/bookscraper/scraper/logger_decorators.py new file mode 100644 index 0000000..095abc1 --- /dev/null +++ b/bookscraper/scraper/logger_decorators.py @@ -0,0 +1,33 @@ +# ============================================================ +# File: scraper/logger_decorators.py +# Purpose: Function-call logging decorator +# ============================================================ + +from functools import wraps +from scraper.logger import log_debug + + +def logcall(func): + """ + Decorator: log function name + arguments every time it's called. + Usage: @logcall above any function. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + # Naam van de functie + name = func.__qualname__ + + # Eerste logregel vóór uitvoering + # log_debug(f"[CALL] {name} args={args} kwargs={kwargs}") + log_debug(f"[CALL] {name} args={args}") + # log_debug(f"[CALL] {name}") + + result = func(*args, **kwargs) + + # Log ná uitvoering + # log_debug(f"[RETURN] {name} → {result}") + + return result + + return wrapper diff --git a/bookscraper/scraper/scriptgen.py b/bookscraper/scraper/scriptgen.py index bd9c148..4f94d68 100644 --- a/bookscraper/scraper/scriptgen.py +++ b/bookscraper/scraper/scriptgen.py @@ -6,6 +6,8 @@ import os import stat from logbus.publisher import log +from scraper.logger_decorators import logcall + TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates") diff --git a/bookscraper/scraper/services/cover_service.py b/bookscraper/scraper/services/cover_service.py index 8392947..3df193c 100644 --- a/bookscraper/scraper/services/cover_service.py +++ b/bookscraper/scraper/services/cover_service.py @@ -5,12 +5,13 @@ import os import requests from logbus.publisher import log +from typing import Optional class CoverService: @staticmethod - def download_main_cover(cover_url: str, book_id: str) -> str | None: + def download_main_cover(cover_url: str, book_id: str) -> Optional[str]: """ Downloads cover image into: static/covers/.jpg. Returns local path or None. 
diff --git a/bookscraper/scraper/services/init_service.py b/bookscraper/scraper/services/init_service.py index 91bf5bf..601c748 100644 --- a/bookscraper/scraper/services/init_service.py +++ b/bookscraper/scraper/services/init_service.py @@ -16,10 +16,13 @@ from scraper.services.cover_service import CoverService from db.repository import register_book +from scraper.logger_decorators import logcall + class InitService: @staticmethod + @logcall def derive_book_id(url: str) -> str: """ PTWXZ URL format ends with /{id}.html. @@ -31,16 +34,20 @@ class InitService: return url.replace("/", "_") @staticmethod + @logcall def execute(url: str) -> dict: """ Main INIT-flow entry point. Returns complete metadata + registration info. """ - # 1) Determine which BookSite applies + # 1) Determine site site = SiteResolver.resolve(url) - # 2) Metadata only (no chapters) + book_id = InitService.derive_book_id(url) + + site.book_id = book_id + # 2) Metadata only meta = ScrapeEngine.fetch_metadata_only(site, url) title = meta.get("title") or "Unknown" @@ -48,27 +55,27 @@ class InitService: description = meta.get("description") cover_url = meta.get("cover_url") - # 3) Determine book_id - book_id = InitService.derive_book_id(url) + # 4) Download UI cover (NEW: capture returned local path) + cover_path = CoverService.download_main_cover(cover_url, book_id) - # 4) SQLite registration + # 5) SQLite registration INCLUDING cover_path ← ★ FIX register_book( book_id=book_id, title=title, author=author, description=description, cover_url=cover_url, + cover_path=cover_path, # ← ★ BELANGRIJK + book_url=url, ) - # 5) Download UI cover - CoverService.download_main_cover(cover_url, book_id) - - # 6) Structured output for UI + # 6) Output for UI return { "book_id": book_id, "title": title, "author": author, "description": description, "cover_url": cover_url, + "cover_path": cover_path, # ← handig voor UI "status": "registered", } diff --git a/bookscraper/scraper/services/scrape_engine.py b/bookscraper/scraper/services/scrape_engine.py index 35df5ac..041d0f3 100644 --- a/bookscraper/scraper/services/scrape_engine.py +++ b/bookscraper/scraper/services/scrape_engine.py @@ -1,33 +1,287 @@ # ============================================================ # File: scraper/services/scrape_engine.py # Purpose: -# Provide unified scraping methods for INIT-flow. -# Reuses BookScraper internally with ZERO duplication. +# Unified scraping engine for INIT-flow and Celery tasks. +# All functions are fully logged via @logcall. # ============================================================ -from scraper.book_scraper import BookScraper +import os +import time +import re +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse + +from logbus.publisher import log +from scraper.logger import log_debug +from scraper.logger_decorators import logcall +from scraper.utils.utils import load_replacements class ScrapeEngine: """ - Adapter layer around BookScraper. - Allows INIT-flow to fetch ONLY metadata (no chapters). + Central scraping engine. + Metadata + chapterlist scraping. + All methods logged with @logcall. 
""" + # ------------------------------------------------------------ + # REPLACEMENTS LOADER + # ------------------------------------------------------------ @staticmethod - def fetch_metadata_only(site, url: str) -> dict: + @logcall + def _apply_replacements(site): + fp = os.path.join(os.getcwd(), "replacements.txt") + extra = load_replacements(fp) + if not hasattr(site, "replacements"): + site.replacements = {} + site.replacements.update(extra) + return True + + # ------------------------------------------------------------ + # RATE LIMITER + # ------------------------------------------------------------ + MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1")) + + @staticmethod + @logcall + def _throttle(last_time=[0]): + now = time.time() + elapsed = now - last_time[0] + if elapsed < ScrapeEngine.MIN_DELAY: + time.sleep(ScrapeEngine.MIN_DELAY - elapsed) + last_time[0] = time.time() + return True + + # ------------------------------------------------------------ + # HTTP GET + # ------------------------------------------------------------ + @staticmethod + @logcall + def _get_doc(url: str, site): + attempt = 1 + while True: + ScrapeEngine._throttle() + log_debug(f"[SCRAPER] GET {url} (attempt {attempt})") + + try: + resp = requests.get( + url, + headers={"User-Agent": "Mozilla/5.0"}, + timeout=10, + ) + except Exception as e: + log_debug(f"Network error {e} → retry {attempt + 1}s") + time.sleep(attempt + 1) + attempt += 1 + continue + + code = resp.status_code + + if code == 200: + resp.encoding = getattr(site, "encoding", "utf-8") + return BeautifulSoup(resp.text, "lxml") + + if code == 429: + cooldown = 60 + log_debug("429 detected — cooldown 60s") + for i in range(cooldown, 0, -1): + log_debug(f" cooldown {i}s…") + time.sleep(1) + attempt += 1 + continue + + if code in (403, 500): + wait = min(5 * attempt, 30) + log_debug(f"HTTP {code} → retry in {wait}s") + time.sleep(wait) + attempt += 1 + continue + + wait = attempt + 1 + log_debug(f"Unexpected HTTP {code} → sleep {wait}s") + time.sleep(wait) + attempt += 1 + + # ------------------------------------------------------------ + # PARSER HELPERS + # ------------------------------------------------------------ + @staticmethod + @logcall + def _parse_title(soup): + h1 = soup.find("h1") + return h1.get_text(strip=True) if h1 else "UnknownTitle" + + @staticmethod + @logcall + def _parse_author(soup): + td = soup.find("td", string=lambda t: t and "作" in t) + if td and ":" in td.get_text(): + return td.get_text(strip=True).split(":")[1] + return "UnknownAuthor" + + @staticmethod + @logcall + def _parse_description(soup): + span = soup.find("span", string=lambda t: t and "内容简介" in t) + if not span: + return "" + parts = [] + for sib in span.next_siblings: + if getattr(sib, "name", None) == "span": + break + txt = ( + sib.get_text(strip=True) + if hasattr(sib, "get_text") + else str(sib).strip() + ) + if txt: + parts.append(txt) + return "\n".join(parts) + + # ------------------------------------------------------------ + # COVER PARSER + # ------------------------------------------------------------ + @staticmethod + @logcall + def _parse_cover(soup, site): """ - Execute BookScraper but return ONLY metadata. - Chapters are intentionally removed. 
+ Vind cover door book_id substring matching: + - haal book_id uit site.url + - zoek IMG-tags waarvan filename book_id bevat + - kies kortste filename als beste match """ - scraper = BookScraper(site, url) - result = scraper.execute() # returns full metadata + chapters + try: + parsed = urlparse(site.url) + m = re.search(r"/(\d+)\.html$", parsed.path) + if m: + book_id = m.group(1) + else: + book_id = parsed.path.rstrip("/").split("/")[-1] + except Exception: + return None + + imgs = soup.find_all("img", src=True) + candidates = [] + + for img in imgs: + src = img["src"].strip() + filename = os.path.basename(src) + if book_id in filename: + candidates.append((filename, src)) + + if not candidates: + return None + + candidates.sort(key=lambda t: len(t[0])) # kortste filename wint + best_src = candidates[0][1] + + return urljoin(site.root, best_src) + + # ------------------------------------------------------------ + # RESOLVE CHAPTER PAGE + # ------------------------------------------------------------ + @staticmethod + @logcall + def _resolve_chapter_page(soup, site): + node = soup.select_one( + "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table" + ) + if not node: + raise ValueError("Could not locate chapter list base node") + + href = node.select_one("a").get("href") + url = urljoin(site.root, href) + + parsed = urlparse(url) + basepath = parsed.path.rsplit("/", 1)[0] + "/" + chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}" + + return url, chapter_base + + # ------------------------------------------------------------ + # PARSE CHAPTER LINKS + # ------------------------------------------------------------ + @staticmethod + @logcall + def _parse_chapter_links(soup, chapter_base, selector): + cont = soup.select_one(selector) + if not cont: + return [] + + items = cont.select("ul li a[href]") + chapters = [] + idx = 1 + + for a in items: + href = a.get("href") + if not href.endswith(".html"): + continue + title = a.get_text(strip=True) + full = urljoin(chapter_base, href) + chapters.append({"num": idx, "title": title, "url": full}) + idx += 1 + + return chapters + + # ============================================================ + # PUBLIC APIS + # ============================================================ + + @staticmethod + @logcall + def fetch_metadata_only(site, url: str) -> dict: + ScrapeEngine._apply_replacements(site) + soup = ScrapeEngine._get_doc(url, site) + site.url = url # NODIG voor cover parsing - # Strip chapterlist — INIT-flow should not fetch them return { - "title": result.get("title"), - "author": result.get("author"), - "description": result.get("description"), - "cover_url": result.get("cover_url"), + "title": ScrapeEngine._parse_title(soup), + "author": ScrapeEngine._parse_author(soup), + "description": ScrapeEngine._parse_description(soup), + "cover_url": ScrapeEngine._parse_cover(soup, site), "book_url": url, } + + @staticmethod + @logcall + def fetch_metadata_and_chapters(site, url: str) -> dict: + ScrapeEngine._apply_replacements(site) + + soup = ScrapeEngine._get_doc(url, site) + site.url = url + + title = ScrapeEngine._parse_title(soup) + author = ScrapeEngine._parse_author(soup) + desc = ScrapeEngine._parse_description(soup) + cover = ScrapeEngine._parse_cover(soup, site) + + chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site) + chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site) + + chapters = ScrapeEngine._parse_chapter_links( + chapter_soup, chapter_base, site.chapter_list_selector + ) + + 
return { + "title": title, + "author": author, + "description": desc, + "cover_url": cover, + "chapters": chapters, + "chapters_total": len(chapters), + "book_url": url, + } + + @staticmethod + @logcall + def fetch_chapterlist(site, url: str): + ScrapeEngine._apply_replacements(site) + soup = ScrapeEngine._get_doc(url, site) + + chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site) + chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site) + + return ScrapeEngine._parse_chapter_links( + chapter_soup, chapter_base, site.chapter_list_selector + ) diff --git a/bookscraper/scraper/sites/base.py b/bookscraper/scraper/sites/base.py index b75f414..a2ec861 100644 --- a/bookscraper/scraper/sites/base.py +++ b/bookscraper/scraper/sites/base.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from bs4 import BeautifulSoup +from typing import Optional class SiteScraper(ABC): @@ -39,7 +40,7 @@ class SiteScraper(ABC): def parse_description(self, soup: BeautifulSoup) -> str: ... @abstractmethod - def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: ... + def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: ... # -------------------------- # Chapter extraction diff --git a/bookscraper/scraper/sites/piaotian.py b/bookscraper/scraper/sites/piaotian.py index 95e430e..9ea89ff 100644 --- a/bookscraper/scraper/sites/piaotian.py +++ b/bookscraper/scraper/sites/piaotian.py @@ -9,6 +9,7 @@ from scraper.sites.base import SiteScraper from bs4 import BeautifulSoup from urllib.parse import urljoin import re +from typing import Optional class PiaotianScraper(SiteScraper): @@ -53,7 +54,7 @@ class PiaotianScraper(SiteScraper): # COVER PARSING # (exactly your BookScraper._parse_cover logic) # ------------------------------------------------------------ - def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: + def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: # Extract book_id from URL m = re.search(r"/(\d+)\.html$", url) if not m: diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py index 18fcf55..82cd692 100644 --- a/bookscraper/scraper/tasks/audio_tasks.py +++ b/bookscraper/scraper/tasks/audio_tasks.py @@ -1,5 +1,8 @@ # ============================================================ # File: scraper/tasks/audio_tasks.py +# Purpose: Convert chapter text files into audio using macOS +# “say”, with Redis-based slot control. +# Updated: now uses db.repository for audio counters. 
# ============================================================ from celery_app import celery_app @@ -7,61 +10,80 @@ from logbus.publisher import log import os import subprocess import time +import socket +import os -from scraper.progress import inc_audio_done, inc_audio_skipped - -# from db.repository import inc_audio_done from scraper.abort import abort_requested +from scraper.logger_decorators import logcall from redis import Redis from urllib.parse import urlparse -# Kies lokale redis als aanwezig, anders standaard backend -redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND") +# NEW — unified repository façade +from db.repository import ( + inc_audio_done, + inc_audio_skipped, +) -parsed = urlparse(redis_url) +HOST = socket.gethostname() # ------------------------------------------------------------ -# REGULIER REDIS CLIENT (slots, file checks, dstate) +# REDIS CLIENT SETUP # ------------------------------------------------------------ +redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND") +parsed = urlparse(redis_url) + +# Slot locking Redis client redis_client = Redis( host=parsed.hostname, port=parsed.port, db=parsed.path.strip("/"), ) -# ------------------------------------------------------------ -# BACKEND CLIENT (abort flags, progress counters) - altijd DB 0 -# ------------------------------------------------------------ +# Abort + global progress flags always live in DB 0 backend_client = Redis( host=parsed.hostname, port=parsed.port, db=0, ) +# ------------------------------------------------------------ +# CONFIG +# ------------------------------------------------------------ AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300")) AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi") AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200")) -HOST_PATH = os.getenv("HOST_PATH", "/app/output") -AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1")) +HOST_PATH = os.getenv("HOST_PATH", "/app/output") CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output") +AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1")) + +# ============================================================ +# CELERY TASK +# ============================================================ @celery_app.task(bind=True, queue="audio", ignore_result=True) +@logcall def generate_audio( - self, book_id, volume_name, chapter_number, chapter_title, chapter_text + self, book_id, volume_name, chapter_number, chapter_title, chapter_path ): - log(f"[AUDIO] CH{chapter_number}: START task → raw_input={chapter_text}") + """ + chapter_path: absolute container path to chapter text file. 
+ """ + + log(f"[AUDIO]({HOST}) CH{chapter_number}: START → {chapter_title}") - # Abort early + # ------------------------------------------------------------ + # ABORT CHECK + # ------------------------------------------------------------ if abort_requested(book_id, backend_client): inc_audio_skipped(book_id) - log(f"[AUDIO] ABORT detected → skip CH{chapter_number}") + log(f"[AUDIO]({HOST}) ABORT detected → skip CH{chapter_number}") return - # ============================================================ - # ACQUIRE AUDIO SLOT - # ============================================================ + # ------------------------------------------------------------ + # ACQUIRE SLOT + # ------------------------------------------------------------ slot_key = None ttl = AUDIO_TIMEOUT + 15 @@ -72,11 +94,13 @@ def generate_audio( log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}") break + # Need to wait if slot_key is None: - log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting...") + log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting…") start_wait = time.time() while slot_key is None: + # Try all slots again for i in range(1, AUDIO_SLOTS + 1): key = f"audio_slot:{i}" if redis_client.set(key, "1", nx=True, ex=ttl): @@ -84,32 +108,32 @@ def generate_audio( log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait") break - if slot_key: - break + # If still no slot + if not slot_key: + if abort_requested(book_id, backend_client): + log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}") + inc_audio_skipped(book_id) + return - if abort_requested(book_id, backend_client): - log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}") - return + if time.time() - start_wait > ttl: + log(f"[AUDIO] CH{chapter_number}: Wait timeout → abort audio") + inc_audio_skipped(book_id) + return - if time.time() - start_wait > ttl: - log(f"[AUDIO] CH{chapter_number}: Slot wait timeout → aborting audio") - return + time.sleep(0.25) - time.sleep(0.25) - - # ============================================================ + # ------------------------------------------------------------ # PATH NORMALISATION - # ============================================================ - - container_path = chapter_text + # ------------------------------------------------------------ + container_path = chapter_path - # Fix 1 — container_path kan None zijn → abort zonder crash if not container_path: - log(f"[AUDIO] CH{chapter_number}: FATAL — no input path provided") + log(f"[AUDIO] CH{chapter_number}: ERROR — no input file path provided") redis_client.delete(slot_key) + inc_audio_skipped(book_id) return - # Fix 2 — veilige startswith + # Strip container prefix so that host path is resolvable if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX): relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/") else: @@ -118,35 +142,35 @@ def generate_audio( parts = relative_path.split("/") if len(parts) < 3: log( - f"[AUDIO] CH{chapter_number}: FATAL — cannot parse book/volume from {relative_path}" + f"[AUDIO] CH{chapter_number}: ERROR — cannot parse book/volume from {relative_path}" ) redis_client.delete(slot_key) + inc_audio_skipped(book_id) return - book_from_path = parts[0] - volume_from_path = parts[1] - + # book_from_path = parts[0] # volume_name passed explicitly anyway + # volume_from_path = parts[1] host_path = os.path.join(HOST_PATH, relative_path) - # ============================================================ - # OUTPUT PREP - # 
============================================================ - - base_dir = os.path.join(HOST_PATH, book_from_path, volume_from_path, "Audio") + # ------------------------------------------------------------ + # OUTPUT DIRECTORY + # ------------------------------------------------------------ + base_dir = os.path.join(HOST_PATH, parts[0], parts[1], "Audio") os.makedirs(base_dir, exist_ok=True) safe_num = f"{chapter_number:04d}" audio_file = os.path.join(base_dir, f"{safe_num}.m4b") + # Skip if audio already exists if os.path.exists(audio_file): - log(f"[AUDIO] Skip CH{chapter_number} → already exists") + log(f"[AUDIO] CH{chapter_number}: Already exists → skip") redis_client.delete(slot_key) + inc_audio_skipped(book_id) return - # ============================================================ - # BUILD CMD - # ============================================================ - + # ------------------------------------------------------------ + # BUILD TTS COMMAND + # ------------------------------------------------------------ cmd = ( f"say --voice={AUDIO_VOICE} " f"--input-file='{host_path}' " @@ -157,30 +181,34 @@ def generate_audio( f"--data-format=aac" ) - log(f"[AUDIO] CH{chapter_number}: CMD = {cmd}") + log(f"[AUDIO]({HOST}) CH{chapter_number} → output: {audio_file}") - # ============================================================ - # RUN TTS - # ============================================================ + # ------------------------------------------------------------ + # EXECUTE + # ------------------------------------------------------------ try: subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT) + # NEW — repository façade inc_audio_done(book_id) - log(f"[AUDIO] CH{chapter_number}: Completed") + log(f"[AUDIO]({HOST}) CH{chapter_number}: Completed") except subprocess.TimeoutExpired: - log(f"[AUDIO] CH{chapter_number}: TIMEOUT → remove incomplete file") + log(f"[AUDIO]({HOST}) CH{chapter_number}: TIMEOUT → removing file") if os.path.exists(audio_file): try: os.remove(audio_file) except Exception: pass + inc_audio_skipped(book_id) except subprocess.CalledProcessError as e: log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}") + inc_audio_skipped(book_id) except Exception as e: log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}") + inc_audio_skipped(book_id) finally: if slot_key: diff --git a/bookscraper/scraper/tasks/controller_tasks.py b/bookscraper/scraper/tasks/controller_tasks.py index 0c1419d..0992dfa 100644 --- a/bookscraper/scraper/tasks/controller_tasks.py +++ b/bookscraper/scraper/tasks/controller_tasks.py @@ -10,18 +10,20 @@ from celery_app import celery_app from logbus.publisher import log from scraper.download_controller import DownloadController -from scraper.progress import ( - set_total, -) + from urllib.parse import urlparse + +from scraper.logger_decorators import logcall import redis import os from scraper.abort import abort_requested +from db.repository import set_chapters_total print(">>> [IMPORT] controller_tasks.py loaded") @celery_app.task(bind=True, queue="controller", ignore_result=False) +@logcall def launch_downloads(self, book_id: str, scrape_result: dict): """ Launch the entire pipeline (download → parse → save), @@ -62,7 +64,8 @@ def launch_downloads(self, book_id: str, scrape_result: dict): # ------------------------------------------------------------ # INIT PROGRESS # ------------------------------------------------------------ - set_total(book_id, total) + set_chapters_total(book_id, total) + log(f"[CTRL] Progress initialized for 
{book_id}: total={total}") # ------------------------------------------------------------ diff --git a/bookscraper/scraper/tasks/download_tasks.py b/bookscraper/scraper/tasks/download_tasks.py index d3f3785..fdaaad9 100644 --- a/bookscraper/scraper/tasks/download_tasks.py +++ b/bookscraper/scraper/tasks/download_tasks.py @@ -1,26 +1,21 @@ # ============================================================ # File: scraper/tasks/download_tasks.py -# Purpose: Download chapter HTML with global concurrency, -# retry/backoff logic, 429 support, and abort-awareness. -# -# Logging: -# - timestamp + book_id in message -# - logbus.publisher → console -# - ui_log.push_ui → Redis GUI # ============================================================ from celery_app import celery_app -from scraper.utils import get_save_path +from scraper.utils.utils import get_save_path from scraper.abort import abort_requested, chapter_started, mark_chapter_started -from scraper.progress import ( - inc_completed, - inc_chapter_done, - inc_chapter_download_skipped, +# Repository façade — correct imports only +from db.repository import ( + set_status, + inc_download_done, + inc_download_skipped, ) -from db.repository import inc_downloaded, set_status + from logbus.publisher import log from scraper.ui_log import push_ui +from scraper.logger_decorators import logcall import requests import redis @@ -90,81 +85,74 @@ def release_global_slot(): # ============================================================ -# CELERY TASK — NEW SIGNATURE WITH chapter_dict + book_meta +# CELERY TASK — Unified payload v3 # ============================================================ @celery_app.task(bind=True, queue="download", ignore_result=False) -def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict): +@logcall +def download_chapter(self, payload: dict): """ - New unified chapter model: - chapter_dict = { - "num": int, - "url": str, - "title": str, - "volume_path": str - } - - book_meta is propagated through the pipeline for parse/save. 
+ Payload: + { + "book_id": str, + "chapter": { "num", "url", "title", "volume_path" }, + "book_meta": dict, + "html": None | str, + "parsed": None | dict, + "skipped": bool, + "path": None | str + } """ - chapter_num = chapter_dict.get("num") - chapter_url = chapter_dict.get("url") - chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}" - volume_path = chapter_dict.get("volume_path") + if not payload: + raise ValueError("download_chapter received empty payload") + + book_id = payload["book_id"] + chapter = payload["chapter"] + book_meta = payload.get("book_meta") or {} + + chapter_num = chapter["num"] + chapter_url = chapter["url"] + chapter_title = chapter.get("title") or f"Chapter {chapter_num}" + volume_path = chapter["volume_path"] + + # STATUS UPDATE + set_status(book_id, "downloading") - # ----------------------------------------------------------- - # ABORT BEFORE START - # ----------------------------------------------------------- + # ABORT CHECK if abort_requested(book_id) and not chapter_started(book_id, chapter_num): - msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)" - log_msg(book_id, msg) - inc_chapter_download_skipped(book_id) - return { - "book_id": book_id, - "chapter": chapter_dict, - "html": None, - "skipped": True, - "path": None, - "abort": True, - "book_meta": book_meta, - } - - # Mark chapter as started + log_msg(book_id, f"[ABORT] Skip chapter {chapter_num}") + + inc_download_skipped(book_id) + + payload["html"] = None + payload["skipped"] = True + payload["path"] = None + return payload + mark_chapter_started(book_id, chapter_num) - # ----------------------------------------------------------- - # SKIP IF FILE ALREADY EXISTS - # ----------------------------------------------------------- + # SKIP IF FILE ALREADY SAVED save_path = get_save_path(chapter_num, volume_path) - if os.path.exists(save_path): - log_msg(book_id, f"[DL] SKIP {chapter_num} ({chapter_title}) → {save_path}") - return { - "book_id": book_id, - "chapter": chapter_dict, - "html": None, - "skipped": True, - "path": save_path, - "book_meta": book_meta, - } - - # ----------------------------------------------------------- - # GLOBAL + SYNC DELAY - # ----------------------------------------------------------- + log_msg(book_id, f"[DL] SKIP {chapter_num} → {save_path}") + + inc_download_skipped(book_id) + + payload["html"] = None + payload["skipped"] = True + payload["path"] = save_path + return payload + + # GLOBAL DELAY + SLOT if GLOBAL_DELAY > 0: time.sleep(GLOBAL_DELAY) wait_for_global_delay() acquire_global_slot(MAX_CONCURRENCY) - # log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}") - # ----------------------------------------------------------- # HTTP DOWNLOAD - # ----------------------------------------------------------- try: - log_msg( - book_id, - f"[DL] Downloading {chapter_num} ({chapter_title}): {chapter_url}", - ) + log_msg(book_id, f"[DL] Downloading {chapter_num} ({chapter_title})") resp = requests.get( chapter_url, @@ -178,39 +166,27 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict): log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes") - return { - "book_id": book_id, - "chapter": chapter_dict, - "html": html, - "skipped": False, - "path": save_path, - "book_meta": book_meta, - } + inc_download_done(book_id) + + # --- attach results --- + payload["html"] = html + payload["skipped"] = False + payload["path"] = save_path + return payload except Exception as exc: attempt = self.request.retries 
delay = BASE_DELAY * (BACKOFF**attempt) - # Specific 429 handler if getattr(getattr(exc, "response", None), "status_code", None) == 429: - log_msg( - book_id, - f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s " - f"(attempt {attempt}/{MAX_RETRIES})", - ) + log_msg(book_id, f"[DL] 429 → WAIT {DELAY_429}s") time.sleep(DELAY_429) set_global_delay() raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES) - # Normal retry - log_msg( - book_id, - f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s " - f"(attempt {attempt}/{MAX_RETRIES})", - ) + log_msg(book_id, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES) finally: set_global_delay() release_global_slot() - # log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}") diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py index d11eaf2..7fabd15 100644 --- a/bookscraper/scraper/tasks/parse_tasks.py +++ b/bookscraper/scraper/tasks/parse_tasks.py @@ -1,33 +1,31 @@ -# ========================================================= +# ============================================================ # File: scraper/tasks/parse_tasks.py # Purpose: Parse downloaded HTML into clean chapter text. -# Enhanced version: Piaotia H1→content extractor + clean pipeline -# NO HARDCODED REPLACEMENTS — everything comes from replacement files -# ========================================================= +# Enhanced Piaotia extractor + selector fallback + clean pipeline. +# Compatible with payload pipeline v3. +# ============================================================ from celery_app import celery_app -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString, Comment -from scraper.utils import clean_text, load_all_replacements -from scraper.tasks.download_tasks import log_msg # unified logger +from scraper.tasks.download_tasks import log_msg +from scraper.utils.utils import clean_text, load_all_replacements +from scraper.logger_decorators import logcall +from db.repository import inc_parsed_done -from bs4 import NavigableString, Comment -print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)") +print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)") +# ============================================================ +# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original) +# ============================================================ def extract_piaotia_content(soup): - """ - Extract clean chapter content from Piaotia pages. - Start after the table following
<h1>
. - End before nav/ads/footer/copyright. - """ - h1 = soup.find("h1") if not h1: return None - # -------- Find first table after
<h1>
-------- + # Find first table after
<h1>
table = None for sib in h1.next_siblings: if getattr(sib, "name", None) == "table": @@ -39,44 +37,41 @@ def extract_piaotia_content(soup): parts = [] - # -------- Iterate after table -------- for sib in table.next_siblings: - name = getattr(sib, "name", None) text = None + if hasattr(sib, "get_text"): text = sib.get_text(strip=True) - # === STOP CONDITIONS === + # STOP CONDITIONS - # Comments like + # if isinstance(sib, Comment) and ("翻页" in sib): break - # Explicit footer blocks + # explicit footer blocks if name == "div": sid = sib.get("id", "") cls = sib.get("class", []) if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"): break - # Copyright block — strongest indicator + # copyright block if text and ("重要声明" in text or "Copyright" in text): break - # Navigation or 推荐阅读 + # navigation blocks if text and (text.startswith(("推荐阅读", "目录", "目 录"))): break - # Skip scripts, ads, centers if name in ("script", "style"): continue - # Skip JS containers like
if name == "center": continue - # === ACCUMULATE TEXT === + # ACCUMULATE if isinstance(sib, NavigableString): s = sib.strip() if s: @@ -90,36 +85,42 @@ def extract_piaotia_content(soup): return "\n".join(parts).strip() +# ============================================================ +# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT) +# ============================================================ @celery_app.task(bind=True, queue="parse", ignore_result=False) -def parse_chapter(self, download_result: dict): - """ - New signature under chapter_dict pipeline: - - receives ONLY the output dict from download_chapter - - book_meta is inside download_result["book_meta"] - - chapter_dict is inside download_result["chapter"] - """ - - book_id = download_result.get("book_id", "NOBOOK") - chapter_dict = download_result.get("chapter") or {} - book_meta = download_result.get("book_meta") or {} - chapter_title = chapter_dict.get("title") - chapter_num = chapter_dict.get("num") - chapter_url = chapter_dict.get("url") - html = download_result.get("html") - # ------------------------------------------------------------ +@logcall +def parse_chapter(self, payload: dict): + + if not payload: + return {"skipped": True, "reason": "empty_payload"} + + book_id = payload["book_id"] + chapter = payload["chapter"] + book_meta = payload.get("book_meta") or {} + + num = chapter.get("num") + title = chapter.get("title") or f"Chapter {num}" + html = payload.get("html") + # SKIPPED DOWNLOAD → SKIP PARSE - # ------------------------------------------------------------ - if download_result.get("skipped"): - log_msg(book_id, f"[PARSE] SKIP chapter {chapter_num} (download skipped)") - return download_result # already has chapter + book_meta + skipped + if payload.get("skipped"): + log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)") + return payload - log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}") + if not html: + log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP") + payload["parsed"] = None + payload["skipped"] = True + return payload + + log_msg(book_id, f"[PARSE] Parsing chapter {num}") soup = BeautifulSoup(html, "lxml") - # ------------------------------------------------------------ - # STRICT SELECTORS (direct content blocks) - # ------------------------------------------------------------ + # ============================================================ + # STRICT SELECTORS + # ============================================================ selectors = [ "#content", "div#content", @@ -141,63 +142,32 @@ def parse_chapter(self, download_result: dict): raw = None - # --- STRICT SELECTOR FAILED → Try Piaotia extractor --- + # --- STRICT SELECTOR FAILED → Piaotia extractor --- if node is None: raw = extract_piaotia_content(soup) + else: + raw = node.get_text(separator="\n") - # # ------------------------------------------------------------ - # # PIAOTIA FALLBACK: - # # Extract content between
<h1>
and the "bottomlink" block. - # # ------------------------------------------------------------ - # raw = None - # if node is None: - # h1 = soup.find("h1") - # if h1: - # content_parts = [] - # for sib in h1.next_siblings: - - # sib_class = getattr(sib, "get", lambda *_: None)("class") - # if sib_class and ( - # "bottomlink" in sib_class or sib_class == "bottomlink" - # ): - # break - - # if getattr(sib, "name", None) in ["script", "style", "center"]: - # continue - - # if hasattr(sib, "get_text"): - # content_parts.append(sib.get_text(separator="\n")) - # else: - # content_parts.append(str(sib)) - - # raw = "\n".join(content_parts) - - # ------------------------------------------------------------ # FINAL FALLBACK - # ------------------------------------------------------------ if raw is None: - if node: - raw = node.get_text(separator="\n") - else: - for tag in soup(["script", "style", "noscript"]): - tag.decompose() - raw = soup.get_text(separator="\n") + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + raw = soup.get_text(separator="\n") - # ------------------------------------------------------------ - # MULTIPASS CLEANING via replacement files ONLY - # ------------------------------------------------------------ + # ============================================================ + # MULTIPASS CLEANING via replacement files + # ============================================================ REPL = load_all_replacements() text = raw for _ in range(5): text = clean_text(text, REPL) - # ------------------------------------------------------------ - # Collapse excessive empty lines - # ------------------------------------------------------------ + # ============================================================ + # Collapse double blank lines + # ============================================================ cleaned = [] prev_blank = False - for line in text.split("\n"): stripped = line.rstrip() if stripped == "": @@ -208,28 +178,31 @@ def parse_chapter(self, download_result: dict): else: prev_blank = False cleaned.append(stripped) + text = "\n".join(cleaned) - text = chapter_title + "\n" + text - # ------------------------------------------------------------ + text = f"{title}\n{text}" + + # ============================================================ # Add header to chapter 1 - # ------------------------------------------------------------ - if chapter_num == 1: - book_url = book_meta.get("book_url") or book_meta.get("url") or "UNKNOWN" + # ============================================================ + if num == 1: + book_url = book_meta.get("book_url") or "UNKNOWN" header = ( - f"{book_meta.get('title','')}\n" + f"{book_meta.get('title', '')}\n" f"Author: {book_meta.get('author','')}\n" f"Description:\n{book_meta.get('description','')}\n" f"Book URL: {book_url}\n" + "-" * 50 + "\n\n" ) text = header + text - log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars") + log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars") + + # ============================================================ + # PAYLOAD OUTPUT (v3) + # ============================================================ + payload["parsed"] = text + payload["skipped"] = False + + inc_parsed_done(book_id) - # NEW RETURN FORMAT: chapter_dict stays intact - return { - "book_id": book_id, - "chapter": chapter_dict, - "text": text, - "length": len(text), - "book_meta": book_meta, - } + return payload diff --git a/bookscraper/scraper/tasks/pipeline.py b/bookscraper/scraper/tasks/pipeline.py 
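parse_chapter above resolves chapter content in three stages: strict CSS selectors, then the Piaotia `<h1>`/table walker, then a whole-page text dump with scripts stripped. A condensed sketch of that fallback chain (illustrative; `extract_piaotia_content` and the selector list are the ones defined above):

    from bs4 import BeautifulSoup

    def extract_raw(html: str, selectors, piaotia_extractor) -> str:
        soup = BeautifulSoup(html, "lxml")
        for sel in selectors:
            node = soup.select_one(sel)
            if node is not None:
                return node.get_text(separator="\n")       # 1) strict selector hit
        raw = piaotia_extractor(soup)                        # 2) site-specific walker
        if raw:
            return raw
        for tag in soup(["script", "style", "noscript"]):    # 3) last resort: full page
            tag.decompose()
        return soup.get_text(separator="\n")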
index 267af60..a42d827 100644 --- a/bookscraper/scraper/tasks/pipeline.py +++ b/bookscraper/scraper/tasks/pipeline.py @@ -1,16 +1,12 @@ # ========================================================= # File: scraper/tasks/pipeline.py # Purpose: -# Build Celery chains for chapter processing using chapter_dict. +# Build Celery chains for chapter processing using payload dict. # -# New Chain: -# download_chapter(book_id, chapter_dict, book_meta) -# → parse_chapter(download_result) -# → save_chapter(parsed_result) -# → update_progress(final_result, book_id) -# -# All subtasks pass through result dicts unchanged so the -# next stage receives the correct fields. +# Pipeline v3: +# download_chapter(payload) +# → parse_chapter(payload) +# → save_chapter(payload) # ========================================================= from celery import chain @@ -18,26 +14,28 @@ from celery import chain from scraper.tasks.download_tasks import download_chapter from scraper.tasks.parse_tasks import parse_chapter from scraper.tasks.save_tasks import save_chapter -from scraper.tasks.progress_tasks import update_progress +from scraper.logger_decorators import logcall -def build_chapter_pipeline( - book_id: str, - chapter_dict: dict, - book_meta: dict, -): - """ - Build a Celery chain for one chapter using chapter_dict. - download_chapter(book_id, chapter_dict, book_meta) - → parse_chapter(download_result) - → save_chapter(parsed_result) - → update_progress(result, book_id) +@logcall +def build_chapter_pipeline(book_id: str, chapter_dict: dict, book_meta: dict): """ + Payload model passed through entire pipeline. + """ + + payload = { + "book_id": book_id, + "chapter": chapter_dict, + "book_meta": book_meta, + "html": None, + "parsed": None, + "skipped": False, + "path": None, + } return chain( - download_chapter.s(book_id, chapter_dict, book_meta), + download_chapter.s(payload), parse_chapter.s(), save_chapter.s(), - update_progress.s(book_id), ) diff --git a/bookscraper/scraper/tasks/progress_tasks.py b/bookscraper/scraper/tasks/progress_tasks.py deleted file mode 100644 index 3752893..0000000 --- a/bookscraper/scraper/tasks/progress_tasks.py +++ /dev/null @@ -1,57 +0,0 @@ -# ============================================================ -# File: scraper/tasks/progress_tasks.py -# Purpose: Central progress updater for chapter pipelines. -# Updated for chapter_dict pipeline model. 
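With update_progress gone, each stage increments its own repository counter and the chain is simply download → parse → save. A controller could launch one chain per chapter roughly like this (a sketch; where `chapters` and `book_meta` come from is an assumption, presumably the scrape result handled in controller_tasks.py):

    from scraper.tasks.pipeline import build_chapter_pipeline

    def dispatch_chapters(book_id: str, chapters, book_meta: dict):
        for chapter_dict in chapters:
            # One payload dict per chapter travels through download → parse → save.
            build_chapter_pipeline(book_id, chapter_dict, book_meta).apply_async()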
-# ============================================================ - -from celery_app import celery_app -from scraper.progress import inc_completed, inc_skipped, inc_failed -from logbus.publisher import log - -print(">>> [IMPORT] progress_tasks.py loaded") - - -@celery_app.task(bind=False, name="progress.update", queue="controller") -def update_progress(result: dict, book_id: str): - """ - Central progress logic: - - result: output of save_chapter - - book_id: explicitly passed by pipeline - - IMPORTANT: - - save_chapter already updates counters for skipped & normal chapters - - progress.update MUST NOT double-increment - """ - - ch = result.get("chapter") or {} - chapter_num = ch.get("num") - - skipped = result.get("skipped", False) - failed = result.get("failed", False) - - # ------------------------------------------------------------ - # FAILED CASE - # ------------------------------------------------------------ - if failed: - inc_failed(book_id) - log(f"[PROG] FAILED chapter {chapter_num}") - return result - - # ------------------------------------------------------------ - # SKIPPED CASE - # ------------------------------------------------------------ - if skipped: - # save_chapter already did: - # inc_skipped(book_id) - log(f"[PROG] SKIPPED chapter {chapter_num}") - return result - - # ------------------------------------------------------------ - # NORMAL COMPLETION - # ------------------------------------------------------------ - # save_chapter did NOT increment completed for skipped cases - # but DID inc_completed(book_id) for normal cases. - # update_progress should NOT double increment, so only log here. - log(f"[PROG] DONE chapter {chapter_num}") - - return result diff --git a/bookscraper/scraper/tasks/save_tasks.py b/bookscraper/scraper/tasks/save_tasks.py index 0999676..774920d 100644 --- a/bookscraper/scraper/tasks/save_tasks.py +++ b/bookscraper/scraper/tasks/save_tasks.py @@ -1,139 +1,83 @@ # ============================================================ -# File: scraper/tasks/save_tasks.py -# Purpose: Save parsed chapter text to disk + trigger audio. -# Updated for chapter_dict + book_meta pipeline model. 
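The restored save task below writes the parsed text into the volume directory and then queues audio generation for the same chapter, so a finished chapter should leave behind a text file plus an `Audio/NNNN.m4b` sibling. Roughly (illustrative only — the text filename actually comes from get_save_path, so the `.txt` extension and zero-padding here are assumptions; the Audio naming matches audio_tasks.py, host/container prefixes aside):

    import os

    def expected_artifacts(volume_path: str, chapter_number: int):
        """Pair the text output of save_chapter with the audio output of generate_audio."""
        stem = f"{chapter_number:04d}"
        text_file = os.path.join(volume_path, f"{stem}.txt")            # assumed naming
        audio_file = os.path.join(volume_path, "Audio", f"{stem}.m4b")  # as in audio_tasks.py
        return text_file, audio_file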
+# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC) # ============================================================ print(">>> [IMPORT] save_tasks.py loaded") from celery import shared_task import os -from scraper.utils import get_save_path -from scraper.tasks.download_tasks import log_msg # unified logger -from scraper.progress import ( - inc_completed, - inc_chapter_done, - inc_chapter_download_skipped, -) + +from logbus.publisher import log +from scraper.logger_decorators import logcall +from scraper.utils.utils import get_save_path +from scraper.tasks.download_tasks import log_msg from scraper.tasks.audio_tasks import generate_audio +from db.repository import inc_download_done, inc_download_skipped + @shared_task(bind=True, queue="save", ignore_result=False) -def save_chapter(self, parsed: dict): - """ - New pipeline model: - parsed = { - "book_id": str, - "chapter": chapter_dict, - "text": str, - "length": int, - "book_meta": dict, - "skipped": bool, - "path": optional str (if skipped) - } - """ - - book_id = parsed.get("book_id", "NOBOOK") - chapter_dict = parsed.get("chapter") or {} - book_meta = parsed.get("book_meta") or {} - - chapter_num = chapter_dict.get("num") - chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}" - volume_path = chapter_dict.get("volume_path") - - # ------------------------------------------------------------ - # VALIDATION - # ------------------------------------------------------------ - if chapter_num is None or volume_path is None: - raise ValueError("Invalid parsed payload: chapter_dict missing fields.") - - # ------------------------------------------------------------ - # SKIPPED CASE - # ------------------------------------------------------------ - if parsed.get("skipped"): - path = parsed.get("path", None) - log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num} → {path}") - - inc_chapter_download_skipped(book_id) - - volume_name = os.path.basename(volume_path.rstrip("/")) - - # Queue audio only if a valid file exists +@logcall +def save_chapter(self, payload: dict): + + if not payload: + log("[SAVE] ERROR: payload is None") + return {"error": True} + + book_id = payload["book_id"] + chapter = payload["chapter"] + parsed = payload.get("parsed") + path = payload.get("path") + skipped = payload.get("skipped") + + num = chapter["num"] + title = chapter.get("title") or f"Chapter {num}" + volume = chapter.get("volume_path") + volume_name = os.path.basename(volume.rstrip("/")) + + # ============================================================ + # SKIPPED CASE (restore old behavior) + # ============================================================ + if skipped or not parsed: + log_msg(book_id, f"[SAVE] SKIP chapter {num}") + inc_download_skipped(book_id) + + # Restore old behavior: + # If file already exists, STILL trigger audio. 
if path and os.path.exists(path): + log_msg(book_id, f"[AUDIO] Queueing audio for SKIPPED chapter {num}") try: - generate_audio.delay( - book_id, - volume_name, - chapter_num, - chapter_title, - path, - ) - log_msg( - book_id, - f"[AUDIO] Task queued (SKIPPED) for chapter {chapter_num} in {volume_name}", - ) - except Exception as audio_exc: - log_msg( - book_id, - f"[AUDIO] ERROR queueing (SKIPPED) chapter {chapter_num}: {audio_exc}", - ) - - return { - "book_id": book_id, - "chapter": chapter_dict, - "path": path, - "skipped": True, - "book_meta": book_meta, - } - - # ------------------------------------------------------------ - # NORMAL SAVE CASE - # ------------------------------------------------------------ - try: - text = parsed.get("text", "") + generate_audio.delay(book_id, volume_name, num, title, path) + except Exception as exc: + log_msg(book_id, f"[AUDIO] ERROR queueing skipped audio: {exc}") - # Ensure volume folder exists - os.makedirs(volume_path, exist_ok=True) + return payload - # Build final chapter file path - path = get_save_path(chapter_num, volume_path) + # ============================================================ + # NORMAL SAVE CASE + # ============================================================ + try: + os.makedirs(volume, exist_ok=True) + save_path = get_save_path(num, volume) - # Write chapter text to file - with open(path, "w", encoding="utf-8") as f: - f.write(text) + with open(save_path, "w", encoding="utf-8") as f: + f.write(parsed) - log_msg(book_id, f"[SAVE] Saved chapter {chapter_num} → {path}") - inc_chapter_done(book_id) - inc_completed(book_id) + log_msg(book_id, f"[SAVE] Saved chapter {num} → {save_path}") - # Determine volume name - volume_name = os.path.basename(volume_path.rstrip("/")) + inc_download_done(book_id) - # Queue audio task + # Restore old behavior → ALWAYS queue audio try: - generate_audio.delay( - book_id, - volume_name, - chapter_num, - chapter_title, - path, - ) - log_msg( - book_id, - f"[AUDIO] Task queued for chapter {chapter_num} in {volume_name}", - ) - except Exception as audio_exc: - log_msg( - book_id, f"[AUDIO] ERROR queueing chapter {chapter_num}: {audio_exc}" - ) - - return { - "book_id": book_id, - "chapter": chapter_dict, - "path": path, - "book_meta": book_meta, - } + generate_audio.delay(book_id, volume_name, num, title, save_path) + log_msg(book_id, f"[AUDIO] Task queued for chapter {num}") + except Exception as exc: + log_msg(book_id, f"[AUDIO] ERROR queueing chapter {num}: {exc}") + + payload["path"] = save_path + payload["skipped"] = False + return payload except Exception as exc: - log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter_num}: {exc}") + log_msg(book_id, f"[SAVE] ERROR saving chapter {num}: {exc}") raise diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py index 0694089..64b26ba 100644 --- a/bookscraper/scraper/tasks/scraping.py +++ b/bookscraper/scraper/tasks/scraping.py @@ -9,6 +9,7 @@ from logbus.publisher import log import os import redis +from scraper.logger_decorators import logcall from scraper.sites import BookSite from scraper.book_scraper import BookScraper from scraper.abort import clear_abort # no circular deps diff --git a/bookscraper/templates/components/book_card.html b/bookscraper/scraper/utils/__init__.py similarity index 100% rename from bookscraper/templates/components/book_card.html rename to bookscraper/scraper/utils/__init__.py diff --git a/bookscraper/scraper/utils/state_sync.py b/bookscraper/scraper/utils/state_sync.py new file mode 
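The new state_sync module below reconciles SQLite rows with the per-book Redis hash `book:<book_id>:state`. The hash layout it expects (field names taken from the code that follows; the broker URL default is an assumption) can be exercised like this:

    import os
    import redis

    r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://localhost:6379/0"))

    # Seed a demo hash in the shape sync_books_from_redis() reads.
    r.hset("book:demo-book:state", mapping={
        "download_done": 12,
        "parsed_done": 12,
        "audio_done": 7,
        "chapters_total": 40,
        "status": "downloading",
    })

    state = {k.decode(): v.decode() for k, v in r.hgetall("book:demo-book:state").items()}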
100644 index 0000000..fe8a7a3 --- /dev/null +++ b/bookscraper/scraper/utils/state_sync.py @@ -0,0 +1,141 @@ +# ============================================================ +# File: scraper/utils/state_sync.py +# Purpose: +# State inspection + optional sync logic for book progress. +# This version provides: +# • inspect_books_state() → NO writes, just a dry-run +# • sync_books_from_redis() → NOT USED YET (kept commented) +# ============================================================ + +import os +import redis +from db.db import get_db + + +def inspect_books_state(): + """ + Reads all books from SQLite and fetches Redis progress, + but performs NO writes. Only shows: + - sqlite row + - redis state + - merged result (dry-run) + + Returns a list of inspection dicts. + """ + r = redis.Redis.from_url(os.getenv("REDIS_BROKER")) + db = get_db() + cur = db.cursor() + + cur.execute("SELECT * FROM books") + rows = cur.fetchall() + + results = [] + + for row in rows: + book_id = row["book_id"] + sqlite_row = dict(row) + + # Read redis state + redis_key = f"book:{book_id}:state" + progress = r.hgetall(redis_key) + + if progress: + decoded = {k.decode(): v.decode() for k, v in progress.items()} + else: + decoded = {} + + # Determine dry-run merged result + merged = sqlite_row.copy() + + if decoded: + merged["downloaded"] = int( + decoded.get("download_done", merged.get("downloaded", 0)) + ) + merged["parsed"] = int(decoded.get("parsed_done", merged.get("parsed", 0))) + merged["audio_done"] = int( + decoded.get("audio_done", merged.get("audio_done", 0)) + ) + merged["chapters_total"] = int( + decoded.get("chapters_total", merged.get("chapters_total", 0)) + ) + merged["status"] = decoded.get("status", merged.get("status", "unknown")) + + results.append( + { + "book_id": book_id, + "sqlite": sqlite_row, + "redis": decoded, + "would_merge_to": merged, + } + ) + + return results + + +def sync_books_from_redis(): + """ + Reads all books from SQLite, fetches Redis progress, + and updates SQLite rows accordingly. + + Returns a list of { + "book_id": ..., + "before": ..., + "redis": ..., + "after": ... + } + """ + r = redis.Redis.from_url(os.getenv("REDIS_BROKER")) + db = get_db() + cur = db.cursor() + + # Haal alle boeken op + cur.execute("SELECT * FROM books") + rows = cur.fetchall() + + results = [] + + for row in rows: + book_id = row["book_id"] + before = dict(row) + + redis_key = f"book:{book_id}:state" + progress = r.hgetall(redis_key) + + if not progress: + results.append( + {"book_id": book_id, "before": before, "redis": {}, "after": before} + ) + continue + + # Decode Redis bytes → string dictionary + decoded = {k.decode(): v.decode() for k, v in progress.items()} + + # Extract counters + downloaded = int(decoded.get("download_done", 0)) + parsed = int(decoded.get("parsed_done", 0)) + audio_done = int(decoded.get("audio_done", 0)) + chapters_total = int(decoded.get("chapters_total", 0)) + + # Redis status wins + status = decoded.get("status", before["status"]) + + # Write back to SQLite + cur.execute( + """ + UPDATE books + SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now') + WHERE book_id = ? 
+ """, + (downloaded, parsed, audio_done, chapters_total, status, book_id), + ) + db.commit() + + # Fetch updated row + cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,)) + after = dict(cur.fetchone()) + + results.append( + {"book_id": book_id, "before": before, "redis": decoded, "after": after} + ) + + return results diff --git a/bookscraper/scraper/utils.py b/bookscraper/scraper/utils/utils.py similarity index 100% rename from bookscraper/scraper/utils.py rename to bookscraper/scraper/utils/utils.py diff --git a/bookscraper/static/covers/一剑朝天.jpg b/bookscraper/static/covers/一剑朝天.jpg deleted file mode 100644 index 8b1256e..0000000 Binary files a/bookscraper/static/covers/一剑朝天.jpg and /dev/null differ diff --git a/bookscraper/static/covers/从吞噬开始.jpg b/bookscraper/static/covers/从吞噬开始.jpg deleted file mode 100644 index fb7cd72..0000000 Binary files a/bookscraper/static/covers/从吞噬开始.jpg and /dev/null differ diff --git a/bookscraper/static/covers/流氓高手.jpg b/bookscraper/static/covers/流氓高手.jpg deleted file mode 100644 index d269316..0000000 Binary files a/bookscraper/static/covers/流氓高手.jpg and /dev/null differ diff --git a/bookscraper/static/covers/流氓高手II.jpg b/bookscraper/static/covers/流氓高手II.jpg deleted file mode 100644 index 4d96826..0000000 Binary files a/bookscraper/static/covers/流氓高手II.jpg and /dev/null differ diff --git a/bookscraper/static/css/bookcard.css b/bookscraper/static/css/bookcard.css new file mode 100644 index 0000000..588716e --- /dev/null +++ b/bookscraper/static/css/bookcard.css @@ -0,0 +1,201 @@ +/* ======================================================================= + File: static/css/bookcard.css + Purpose: + All styling for registered book cards (book-card) + + status colors + start/abort buttons. + ======================================================================= */ + +/* ----------------------------------------------------------------------- + GRID WRAPPER FOR REGISTERED BOOKS + ----------------------------------------------------------------------- */ + +.registered-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(340px, 1fr)); + gap: 20px; + margin-top: 15px; +} + +/* ----------------------------------------------------------------------- + MAIN BOOK CARD + ----------------------------------------------------------------------- */ + +.book-card { + position: relative; + display: grid; + grid-template-columns: 90px auto; + gap: 15px; + + padding: 15px; + background: #fff; + border-radius: 10px; + border: 1px solid #e5e5e5; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); + + transition: border-color 0.25s ease, box-shadow 0.25s ease; +} + +/* ----------------------------------------------------------------------- + BOOK STATUS COLORS + ----------------------------------------------------------------------- */ + +.book-card.processing { + border-color: #007aff; + box-shadow: 0 0 6px rgba(0, 122, 255, 0.35); +} + +.book-card.downloading { + border-color: #ff9500; + box-shadow: 0 0 6px rgba(255, 149, 0, 0.35); +} + +.book-card.parsing { + border-color: #ffcc00; + box-shadow: 0 0 6px rgba(255, 204, 0, 0.35); +} + +.book-card.audio { + border-color: #e65100; + box-shadow: 0 0 6px rgba(230, 81, 0, 0.35); +} + +.book-card.completed { + border-color: #34c759; + box-shadow: 0 0 6px rgba(52, 199, 89, 0.35); +} + +.book-card.aborted { + border-color: #ff3b30; + box-shadow: 0 0 6px rgba(255, 59, 48, 0.35); +} + +/* ----------------------------------------------------------------------- + COVER IMAGE + 
----------------------------------------------------------------------- */ + +.book-cover { + width: 90px; +} + +.book-img { + width: 90px; + height: 130px; + object-fit: cover; + border-radius: 4px; + background: #f4f4f4; +} + +.placeholder { + display: flex; + justify-content: center; + align-items: center; + color: #777; + font-size: 12px; +} + +/* ----------------------------------------------------------------------- + META INFORMATION + ----------------------------------------------------------------------- */ + +.book-meta { + display: flex; + flex-direction: column; + justify-content: space-between; +} + +.book-title { + font-size: 16px; + font-weight: bold; + margin-bottom: 4px; +} + +.book-author { + font-size: 14px; + color: #444; + margin-bottom: 8px; +} + +.book-created { + font-size: 12px; + color: #666; + margin-bottom: 10px; +} + +/* ----------------------------------------------------------------------- + ICON BUTTONS + ----------------------------------------------------------------------- */ + +.icon-btn { + width: 34px; + height: 34px; + border: none; + border-radius: 8px; + + display: flex; + justify-content: center; + align-items: center; + + font-size: 16px; + color: #fff; + cursor: pointer; + + transition: background 0.15s ease, transform 0.1s ease; +} + +/* Start (green) */ +.icon-start { + background: #2d8a3d; +} +.icon-start:hover { + background: #226c30; + transform: scale(1.05); +} + +.icon-start:disabled { + background: #9bbb9f !important; + cursor: not-allowed; + transform: none; + opacity: 0.5; +} + +/* Abort (red) */ +.icon-abort { + background: #c62828; +} +.icon-abort:hover { + background: #a31f1f; + transform: scale(1.05); +} + +.icon-abort:disabled { + background: #d8a0a0 !important; + cursor: not-allowed; + transform: none; + opacity: 0.5; +} + +/* Hide button (gray) */ +.hide-form { + position: absolute; + top: 6px; + right: 6px; + margin: 0; +} + +.icon-hide { + background: #777; +} +.icon-hide:hover { + background: #555; + transform: scale(1.05); +} +/* ----------------------------------------------------------------------- + BOOK ACTIONS (right aligned button row) + ----------------------------------------------------------------------- */ + +.book-actions { + display: flex; + justify-content: flex-end; /* rechts uitlijnen */ + gap: 10px; /* ruimte tussen knoppen */ + margin-top: 12px; +} diff --git a/bookscraper/static/css/dashboard.css b/bookscraper/static/css/dashboard.css index c3946d1..e7aa67a 100644 --- a/bookscraper/static/css/dashboard.css +++ b/bookscraper/static/css/dashboard.css @@ -2,33 +2,28 @@ File: static/css/dashboard.css Purpose: Clean full-width vertical dashboard layout with large log viewer. 
+ Book-card CSS is now moved to bookcard.css ======================================================================= */ -/* ------------------------------ - GENERAL PAGE LAYOUT - ------------------------------ */ +/* ----------------------------------------------------------------------- + 1) GENERAL PAGE LAYOUT + ----------------------------------------------------------------------- */ -/* Dashboard content should use full width */ .dashboard-container { display: flex; flex-direction: column; width: 100%; - max-width: 1200px; /* voorkomt overflow rechts */ + max-width: 1200px; margin: 20px auto; padding: 0 20px; - gap: 18px; /* kleiner dan 30px */ + gap: 18px; } -/* ------------------------------ - SECTIONS (input, progress, logs) - ------------------------------ */ - .dashboard-section { background: #ffffff; - padding: 16px; /* kleiner */ + padding: 16px; border-radius: 6px; border: 1px solid #ddd; - margin: 0; /* weg extra witruimte */ } .page-title { @@ -36,9 +31,9 @@ margin-bottom: 15px; } -/* ------------------------------ - BOOK LIST (optional) - ------------------------------ */ +/* ----------------------------------------------------------------------- + 2) ACTIVE BOOK LIST (dashboard left panel) + ----------------------------------------------------------------------- */ .book-list { display: flex; @@ -52,7 +47,6 @@ color: #777; } -/* List item */ .book-list-item { padding: 12px 16px; background: #f7f7f7; @@ -73,7 +67,6 @@ border-color: #1e88e5; } -/* Title + metadata */ .book-title { font-size: 16px; font-weight: 600; @@ -84,13 +77,9 @@ color: #555; } -.meta-label { - font-weight: 600; -} - -/* ------------------------------ - PROGRESS BOX - ------------------------------ */ +/* ----------------------------------------------------------------------- + 3) PROGRESS BOX + ----------------------------------------------------------------------- */ .progress-box { background: #fafafa; @@ -141,14 +130,35 @@ margin-top: 4px; } -/* ------------------------------ - LOG VIEWER — LARGE FULL-WIDTH - ------------------------------ */ +.book-abort-area { + margin-top: 10px; + text-align: right; +} + +.abort-btn { + padding: 6px 12px; + border-radius: 4px; + border: 1px solid #cc0000; + background: #ff4444; + color: white; + font-size: 12px; + cursor: pointer; + transition: background 0.2s, border-color 0.2s; +} + +.abort-btn:hover { + background: #ff2222; + border-color: #aa0000; +} + +/* ----------------------------------------------------------------------- + 4) LOG VIEWER + ----------------------------------------------------------------------- */ .log-viewer { width: 100%; max-width: 100%; - overflow: hidden; /* voorkom horizontaal uitsteken */ + overflow: hidden; } .log-header { @@ -171,11 +181,11 @@ max-height: 75vh; overflow-y: auto; - overflow-x: hidden; /* voorkom dat de log naar rechts uitsteekt */ + overflow-x: hidden; - background: #000000; /* Pure terminal black */ - color: #00ff66; /* Matrix / retro green */ - border: 1px solid #0f0; /* neon green frame */ + background: #000; + color: #00ff66; + border: 1px solid #0f0; border-radius: 6px; padding: 12px; @@ -183,48 +193,39 @@ font-size: 13px; line-height: 1.35; - white-space: pre-wrap; /* wraps text */ - word-break: break-word; /* lange links breken */ + white-space: pre-wrap; + word-break: break-word; } -/* Basestijl voor alle logregels */ + .log-line { white-space: pre-wrap; padding: 2px 0; - font-family: "SF Mono", "Consolas", "Courier New", monospace; } - -/* Subklassen per logtype */ .log-line.default { - color: #00ff66; /* 
groen */ + color: #00ff66; } - .log-line.dl { - color: #00ccff; /* cyan */ + color: #00ccff; } - .log-line.parse { - color: #ffaa00; /* oranje */ + color: #ffaa00; } - .log-line.save { - color: #ffdd33; /* geel */ + color: #ffdd33; } - .log-line.audio { - color: #ff66ff; /* paars */ + color: #ff66ff; } - .log-line.ctrl { - color: #66aaff; /* lichtblauw */ + color: #66aaff; } - .log-line.error { - color: #ff3333; /* rood */ + color: #ff3333; } -/* ------------------------------ - PLACEHOLDER - ------------------------------ */ +/* ----------------------------------------------------------------------- + 5) PLACEHOLDER / FOOTER + ----------------------------------------------------------------------- */ .dashboard-placeholder { font-size: 15px; @@ -232,6 +233,7 @@ text-align: center; color: #777; } + .footer { text-align: center; padding: 12px; @@ -240,23 +242,56 @@ font-size: 12px; border-top: 1px solid #ddd; } -.book-abort-area { - margin-top: 10px; - text-align: right; +/* ----------------------------- + DROPDOWN NAVIGATION +------------------------------ */ + +/* Container for dropdown */ +.nav-dropdown { + position: relative; } -.abort-btn { - padding: 6px 12px; - border-radius: 4px; - border: 1px solid #cc0000; - background: #ff4444; - color: white; - font-size: 12px; +/* The clickable label ("Tools ▾") */ +.nav-dropdown > .nav-item { cursor: pointer; - transition: background 0.2s, border-color 0.2s; } -.abort-btn:hover { - background: #ff2222; - border-color: #aa0000; +/* Hide dropdown by default */ +.dropdown-menu { + display: none; + position: absolute; + top: 100%; + right: 0; + background: #fff; /* zelfde achtergrond als navbar */ + border: 1px solid #ddd; + padding: 8px 0; + margin: 0; + list-style: none; /* verwijder bolletjes */ + border-radius: 4px; + min-width: 160px; + z-index: 1000; +} + +/* Show dropdown when hovering over parent */ +.nav-dropdown:hover .dropdown-menu { + display: block; +} + +/* Menu item styling */ +.dropdown-menu li { + padding: 0; + margin: 0; +} + +.dropdown-menu li a { + display: block; + padding: 8px 16px; + white-space: nowrap; + color: #333; + text-decoration: none; +} + +/* Hover state */ +.dropdown-menu li a:hover { + background: #f0f0f0; } diff --git a/bookscraper/static/js/dashboard.js b/bookscraper/static/js/dashboard.js index 1f78b29..35340c0 100644 --- a/bookscraper/static/js/dashboard.js +++ b/bookscraper/static/js/dashboard.js @@ -2,15 +2,15 @@ File: static/js/dashboard.js Purpose: Dashboard interactions: - - select book - - refresh logs - - refresh progress + - Select active book + - Live logs & progress + - Bookcard AJAX start/abort NOTE: - $ / $$ / autoScroll komen uit helpers.js + updateLogs() is provided by log_view.js ======================================================================= */ /* --------------------------------------------------------- - Simple fetch wrapper + Utility: Safe fetch wrapper --------------------------------------------------------- */ async function apiGet(url) { try { @@ -32,79 +32,61 @@ let REFRESH_INTERVAL = null; console.log(">>> dashboard.js LOADED"); /* --------------------------------------------------------- - DOM Ready → setup + DOM READY --------------------------------------------------------- */ document.addEventListener("DOMContentLoaded", () => { console.log(">>> dashboard.js DOMContentLoaded"); - // ===================================================== - // GLOBAL FALLBACK POLLING — ALWAYS FETCH LOGS - // Runs when no books exist or no selection has been made - // 
===================================================== - console.log(">>> dashboard.js: enabling global fallback polling"); + // Fallback: fetch global logs if no active book setInterval(() => { - // if no active book → fetch global logs - if (!ACTIVE_BOOK) { - refreshBook(null); // triggers /logs - } + if (!ACTIVE_BOOK) refreshBook(null); }, 2000); + // Sidebar items const items = $$(".book-list-item"); - console.log(">>> dashboard.js found book-list items:", items.length); - - // Geen boeken → geen polling starten - // if (!items || items.length === 0) { - // console.log(">>> dashboard.js: geen boeken aanwezig, polling uit."); - // return; - // } - - // Book selection listener items.forEach((item) => { item.addEventListener("click", () => { - console.log(">>> dashboard.js: user clicked book:", item.dataset.bookId); selectBook(item.dataset.bookId); }); }); - // Auto-select first book + // Auto-select if (!ACTIVE_BOOK && items[0]) { - console.log( - ">>> dashboard.js: auto-select first book:", - items[0].dataset.bookId - ); selectBook(items[0].dataset.bookId); } + + // Initial binding of book-card buttons + bindBookCardButtons(); + + // Refresh sidebar every 2 seconds + setInterval(refreshActiveBooks, 2800); }); /* --------------------------------------------------------- - Select a book (updates UI + starts polling) + Select a book --------------------------------------------------------- */ function selectBook(bookId) { - console.log(">>> selectBook(", bookId, ")"); - ACTIVE_BOOK = bookId; + console.log(">>> Selecting book", bookId); - // Highlight + // Highlight sidebar $$(".book-list-item").forEach((el) => { el.classList.toggle("active", el.dataset.bookId === bookId); }); - // Reset previous polling - if (REFRESH_INTERVAL) { - console.log(">>> dashboard.js: clearing previous polling interval"); - clearInterval(REFRESH_INTERVAL); - } + // Reset polling + if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL); - // Start new polling - console.log(">>> dashboard.js: starting polling for bookId =", bookId); REFRESH_INTERVAL = setInterval(() => { refreshBook(ACTIVE_BOOK); }, 2000); - // Immediate refresh refreshBook(ACTIVE_BOOK); } -setInterval(refreshActiveBooks, 2000); + +/* --------------------------------------------------------- + Refresh sidebar list + --------------------------------------------------------- */ async function refreshActiveBooks() { const books = await apiGet("/api/books"); if (!books) return; @@ -112,8 +94,8 @@ async function refreshActiveBooks() { const container = $("#book-list"); if (!container) return; - // Herbouw de lijst container.innerHTML = ""; + books.forEach((b) => { const div = document.createElement("div"); div.className = "book-list-item"; @@ -126,75 +108,152 @@ async function refreshActiveBooks() { ${b.download_done}/${b.download_total} downloaded, ${b.audio_done}/${b.audio_total} audio - - `; - // Event listener opnieuw koppelen div.addEventListener("click", () => selectBook(b.book_id)); - container.appendChild(div); }); - // Als ACTIVE_BOOK nog niet bekend → auto-selecteer eerste boek if (!ACTIVE_BOOK && books.length > 0) { selectBook(books[0].book_id); } } /* --------------------------------------------------------- - Fetch logs + progress from API + Fetch logs + progress --------------------------------------------------------- */ async function refreshBook(bookId) { - console.log(">>> refreshBook(", bookId, ")"); - - // 1) Als er GEEN bookId is → haal alleen globale logs op if (!bookId) { - console.log(">>> refreshBook: no active book → fetch 
/logs"); - const data = await apiGet("/logs"); - if (data && data.logs) updateLogs(data.logs); - - return; // klaar + if (data) updateLogs(data); + return; } - // 2) Als er WEL een boek is → haal book status + logs op const state = await apiGet(`/api/book/${bookId}/status`); const logs = await apiGet(`/api/book/${bookId}/logs`); - console.log(">>> refreshBook state =", state); - console.log(">>> refreshBook logs =", logs); - - if (state) updateProgressBars(state); + if (state) { + updateProgressBars(state); + refreshBookCards(); + } if (logs) updateLogs(logs); } /* --------------------------------------------------------- - Update LOG VIEW panel + BOOKCARD BUTTON BINDING — idempotent --------------------------------------------------------- */ -function updateLogs(logList) { - const output = $("#log-output"); - if (!output) { - console.warn(">>> updateLogs: no #log-output element found"); - return; - } +function bindBookCardButtons() { + console.log(">>> bindBookCardButtons() scanning…"); + + // START BUTTONS + document.querySelectorAll(".book-card .icon-start").forEach((btn) => { + if (btn.dataset.bound === "1") return; // prevent double-binding + btn.dataset.bound = "1"; + + btn.addEventListener("click", (ev) => { + ev.preventDefault(); + if (btn.disabled) return; + + const bookId = btn.closest(".book-card").dataset.bookId; + console.log(">>> START clicked:", bookId); + + startBook(bookId); + }); + }); + + // ABORT BUTTONS + document.querySelectorAll(".book-card .icon-abort").forEach((btn) => { + if (btn.dataset.bound === "1") return; + btn.dataset.bound = "1"; - output.innerHTML = ""; + btn.addEventListener("click", (ev) => { + ev.preventDefault(); + if (btn.disabled) return; - logList.forEach((line) => logAppend(line)); + const bookId = btn.closest(".book-card").dataset.bookId; + console.log(">>> ABORT clicked:", bookId); + + abortBookAjax(bookId); + }); + }); +} - autoScroll(output); +/* --------------------------------------------------------- + AJAX START + --------------------------------------------------------- */ +function startBook(bookId) { + console.log(">>> startBook():", bookId); + + fetch("/start", { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: `book_id=${bookId}`, + }) + .then(async (r) => { + console.log(">>> /start status:", r.status); + + let data = null; + try { + data = await r.json(); + } catch (e) {} + + console.log(">>> /start response:", data); + refreshBookCards(); + refreshBook(bookId); + }) + .catch((err) => console.error("Start failed:", err)); } -function abortBook(book_id) { - if (!confirm(`Abort tasks for book ${book_id}?`)) return; +/* --------------------------------------------------------- + AJAX ABORT + --------------------------------------------------------- */ +function abortBookAjax(bookId) { + if (!confirm(`Abort tasks for book ${bookId}?`)) return; + + console.log(">>> abortBookAjax():", bookId); + + fetch(`/abort/${bookId}`, { method: "POST" }) + .then(async (r) => { + let data = null; + try { + data = await r.json(); + } catch (e) {} + console.log(">>> /abort response:", data); - fetch(`/abort/${book_id}`, { method: "POST" }) - .then((r) => r.json()) - .then((data) => { - console.log("Abort:", data); + refreshBookCards(); + refreshBook(bookId); }) - .catch((err) => { - console.error("Abort failed:", err); - }); + .catch((err) => console.error("Abort failed:", err)); +} + +/* --------------------------------------------------------- + Refresh all book-cards (status, classes, buttons) + 
--------------------------------------------------------- */ +async function refreshBookCards() { + const books = await apiGet("/api/books"); + if (!books) return; + + document.querySelectorAll(".book-card").forEach((card) => { + const id = card.dataset.bookId; + const info = books.find((b) => b.book_id === id); + if (!info) return; + + // Status CSS + card.className = `book-card ${info.status}`; + + // Button states + const startBtn = card.querySelector(".icon-start"); + const abortBtn = card.querySelector(".icon-abort"); + + if (startBtn) startBtn.disabled = info.status !== "registered"; + if (abortBtn) + abortBtn.disabled = ![ + "processing", + "downloading", + "parsing", + "audio", + ].includes(info.status); + }); + + bindBookCardButtons(); // rebind new DOM } diff --git a/bookscraper/static/js/log_view.js b/bookscraper/static/js/log_view.js index ba4cff7..2191704 100644 --- a/bookscraper/static/js/log_view.js +++ b/bookscraper/static/js/log_view.js @@ -1,38 +1,36 @@ /* ======================================================================= File: static/js/log_view.js Purpose: - Log viewer functionality: - - filtering - - clearing - - auto-scroll - - delta polling (efficient) - - rolling limit (prevent GUI freeze) + High-performance rolling log viewer + - efficient delta polling + - append-only mode (no DOM reset) + - rolling limit (prevents memory freeze) + - supports both global logs and per-book logs ======================================================================= */ console.log(">>> log_view.js LOADING…"); /* --------------------------------------------------------- - Log filtering + Global log viewer state --------------------------------------------------------- */ let LOG_FILTER = "ALL"; -let LAST_LOG_INDEX = -1; // For delta polling -const MAX_LOG_LINES = 1000; // Rolling cap to prevent freezing +let LAST_LOG_INDEX = -1; // delta offset +const MAX_LOG_LINES = 600; // safe rolling window +/* --------------------------------------------------------- + Apply filter on existing log lines + --------------------------------------------------------- */ function applyLogFilter() { - console.log(">>> log_view.js applyLogFilter(), filter =", LOG_FILTER); - const lines = $$(".log-line"); - console.log(">>> log_view.js number of log-line elements:", lines.length); - lines.forEach((line) => { const text = line.innerText; - line.style.display = - LOG_FILTER === "ALL" || text.includes(LOG_FILTER) ? "block" : "none"; + const show = LOG_FILTER === "ALL" || (text && text.includes(LOG_FILTER)); + line.style.display = show ? 
"block" : "none"; }); } /* --------------------------------------------------------- - UI bindings + DOM Ready — bind clear/filter --------------------------------------------------------- */ document.addEventListener("DOMContentLoaded", () => { console.log(">>> log_view.js DOMContentLoaded"); @@ -41,84 +39,107 @@ document.addEventListener("DOMContentLoaded", () => { const clearBtn = $("#log-clear"); const output = $("#log-output"); - if (!filterSel) { - console.log(">>> log_view.js: No log viewer found on this page."); + if (!output) { + console.log( + ">>> log_view.js: No #log-output on this page → viewer disabled" + ); return; } console.log(">>> log_view.js: log viewer detected."); - // Filter dropdown - // filterSel.addEventListener("change", () => { - // LOG_FILTER = filterSel.value; - // console.log(">>> log_view.js filter changed to:", LOG_FILTER); - // applyLogFilter(); - // }); + // Filter dropdown (currently disabled in your UI) + // if (filterSel) { + // filterSel.addEventListener("change", () => { + // LOG_FILTER = filterSel.value; + // applyLogFilter(); + // }); + // } - // Clear log window if (clearBtn) { clearBtn.addEventListener("click", () => { - console.log(">>> log_view.js log-clear clicked → clearing output"); - if (output) { - output.innerHTML = ""; - LAST_LOG_INDEX = -1; // reset delta polling - } + console.log(">>> log_view.js: Clear log viewer"); + output.innerHTML = ""; + LAST_LOG_INDEX = -1; // reset delta polling }); } }); /* --------------------------------------------------------- - Append + Rolling buffer + Append ONE line (smart class assignment) --------------------------------------------------------- */ -function logAppend(lineText) { +function rollingAppend(lineText) { const output = $("#log-output"); if (!output) return; const div = document.createElement("div"); div.classList.add("log-line"); - // ----------------------------------------------------- - // Assign subtype classes - // ----------------------------------------------------- - if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]")) { + // Type detection + if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]")) div.classList.add("dl"); - } else if (lineText.includes("[PARSE]")) { - div.classList.add("parse"); - } else if (lineText.includes("[SAVE]")) { - div.classList.add("save"); - } else if (lineText.includes("[AUDIO]")) { - div.classList.add("audio"); - } else if (lineText.includes("[CTRL]")) { - div.classList.add("ctrl"); - } else if (lineText.includes("[ERROR]")) { - div.classList.add("error"); - } else { - div.classList.add("default"); - } + else if (lineText.includes("[PARSE]")) div.classList.add("parse"); + else if (lineText.includes("[SAVE]")) div.classList.add("save"); + else if (lineText.includes("[AUDIO]")) div.classList.add("audio"); + else if (lineText.includes("[CTRL]")) div.classList.add("ctrl"); + else if (lineText.includes("[ERROR]")) div.classList.add("error"); + else div.classList.add("default"); + + div.textContent = lineText; - div.innerText = lineText; output.appendChild(div); - // Rolling buffer - while (output.children.length > MAX_LOG_LINES) { + // Rolling limit + while (output.childNodes.length > MAX_LOG_LINES) { output.removeChild(output.firstChild); } +} + +/* --------------------------------------------------------- + Primary API entry: updateLogs() + Used by dashboard.js AND delta polling + Accepts: + { logs: [...], last_index: N } + OR legacy: + { lines: [...], total: N } + --------------------------------------------------------- */ 
+function updateLogs(packet) {
+  const output = $("#log-output");
+  if (!output) return;
+
+  if (!packet) return;
+
+  // Normalize the log array (new and legacy packet shapes)
+  let lines = packet.logs || packet.lines || [];
+  if (!Array.isArray(lines)) return;
+
+  // Append only new lines
+  lines.forEach((line) => rollingAppend(line));
+
+  // Update delta index
+  if (packet.last_index !== undefined) {
+    LAST_LOG_INDEX = packet.last_index;
+  } else if (packet.total !== undefined) {
+    LAST_LOG_INDEX = packet.total - 1;
+  }
 
   applyLogFilter();
   autoScroll(output);
 }
 
 /* ---------------------------------------------------------
-   Delta-based log polling
+   Delta polling: ONLY the global log viewer uses this.
+   The dashboard overrides logs per book.
    --------------------------------------------------------- */
 function pollLogs() {
   fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
     .then((r) => r.json())
     .then((data) => {
       const lines = data.lines || [];
+      if (lines.length > 0) {
-        lines.forEach((line) => logAppend(line));
-        LAST_LOG_INDEX = data.total - 1;
+        lines.forEach((line) => rollingAppend(line));
+        LAST_LOG_INDEX = data.last; // <-- use the server-supplied delta index
+      }
     })
     .catch((err) => {
@@ -126,7 +147,6 @@ function pollLogs() {
     });
 }
 
-// Poll every 800 ms
-setInterval(pollLogs, 1800);
+setInterval(pollLogs, 2800);
 
 console.log(">>> log_view.js LOADED");
diff --git a/bookscraper/templates/components/bookcard.html b/bookscraper/templates/components/bookcard.html
new file mode 100644
index 0000000..9a88862
--- /dev/null
+++ b/bookscraper/templates/components/bookcard.html
@@ -0,0 +1,82 @@
+{# ============================================================
+   File: templates/components/bookcard.html
+   Purpose:
+     A single book card with:
+       - status styles
+       - cover
+       - metadata
+       - hide button
+       - start (play)
+       - abort (stop)
+   Requires:
+     variable "b" in context
+   ============================================================ #}
+
+<div class="book-card {{ b.status }}" data-book-id="{{ b.book_id }}">
+
+  {# Cover #}
+  {% if b.cover_path %}
+    <img src="{{ b.cover_path }}" alt="cover" />
+  {% else %}
+    <div class="cover-placeholder">?</div>
+  {% endif %}
+
+  {# Metadata #}
+  <div class="book-title">{{ b.title }}</div>
+  <div class="book-author">{{ b.author }}</div>
+  <div class="book-registered">Geregistreerd: {{ b.created_at }}</div>
+
+  {# Actions #}
+  <div class="book-actions">
+
+    {# Start (play) #}
+    <form method="post" action="/start">
+      <input type="hidden" name="book_id" value="{{ b.book_id }}" />
+      <button type="submit" class="icon-start">▶</button>
+    </form>
+
+    {# Abort (stop) #}
+    <form method="post" action="/abort/{{ b.book_id }}">
+      <button type="submit" class="icon-abort">■</button>
+    </form>
+
+    {# Hide (soft delete, DB row stays intact) #}
+    <form method="post" action="/hide/{{ b.book_id }}">
+      <button type="submit">✕</button>
+    </form>
+
+  </div>
+
+</div>
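The card above and refreshBookCards() in dashboard.js only rely on /api/books returning one object per active book with at least book_id and status. list_active_books() itself is not part of this diff; a minimal sketch of what it could look like, assuming active ids are kept in a Redis set and per-book fields in a Redis hash (key names here are illustrative, the real layout in scraper/state.py may differ):

```python
# Sketch only: not the repository implementation. The key names
# ("books:active", "book:<id>") are assumptions; only the returned shape
# (book_id + status) is what the card template and dashboard JS rely on.
import redis

r = redis.Redis(decode_responses=True)


def list_active_books():
    books = []
    for book_id in r.smembers("books:active"):   # hypothetical key
        data = r.hgetall(f"book:{book_id}")       # hypothetical key
        books.append({
            "book_id": book_id,
            "status": data.get("status", "registered"),
            "title": data.get("title", book_id),
        })
    return books
```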
diff --git a/bookscraper/templates/components/nav.html b/bookscraper/templates/components/nav.html
index 1653bca..10e6485 100644
--- a/bookscraper/templates/components/nav.html
+++ b/bookscraper/templates/components/nav.html
@@ -1,16 +1,16 @@
diff --git a/bookscraper/templates/components/registered_books.html b/bookscraper/templates/components/registered_books.html
new file mode 100644
index 0000000..09dafe2
--- /dev/null
+++ b/bookscraper/templates/components/registered_books.html
@@ -0,0 +1,21 @@
+{# ============================================================
+   File: templates/components/registered_books.html
+   Purpose:
+     Render a grid of registered books.
+     Each card is rendered via bookcard.html.
+   ============================================================ #}
+
+<section class="registered-books">
+  <h2>Geregistreerde boeken</h2>
+
+  {% if registered and registered|length > 0 %}
+
+    <div class="book-grid">
+      {% for b in registered %} {% include "components/bookcard.html" %} {% endfor %}
+    </div>
+
+  {% else %}
+
+    <p>Geen geregistreerde boeken.</p>
+
+  {% endif %}
+</section>
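The grid only needs each registered row to expose the fields the card template reads: book_id, status, title, author, cover_path and created_at. A minimal sketch of the query behind get_registered_books(), assuming a SQLite table named "books" with exactly those column names (the real schema lives in db/repository.py and may differ):

```python
# Sketch only: table and column names are assumptions derived from the
# fields the bookcard template reads, not taken from db/repository.py.
import sqlite3


def get_registered_books(db_path="books.db"):
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    try:
        rows = con.execute(
            "SELECT book_id, status, title, author, cover_path, created_at "
            "FROM books ORDER BY created_at DESC"
        ).fetchall()
        return [dict(row) for row in rows]
    finally:
        con.close()
```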
diff --git a/bookscraper/templates/components/url_input.html b/bookscraper/templates/components/url_input.html index 634e2f2..d69db52 100644 --- a/bookscraper/templates/components/url_input.html +++ b/bookscraper/templates/components/url_input.html @@ -5,7 +5,7 @@ Used on landing pages or detail pages. ======================================================================= --> -
+ + {% include "components/registered_books.html" %} + +
diff --git a/bookscraper/templates/debug/inspect_state.html b/bookscraper/templates/debug/inspect_state.html new file mode 100644 index 0000000..25a757b --- /dev/null +++ b/bookscraper/templates/debug/inspect_state.html @@ -0,0 +1,88 @@ +{% extends "layout.html" %} {% block content %} + +

+<h1>State Inspection (SQL vs Redis)</h1>
+
+{% macro cmp(sqlval, redisval) %}
+  {% if (sqlval|string) == (redisval|string) %}
+    <span class="match">{{ sqlval }}</span>
+    <span class="match">{{ redisval }}</span>
+  {% else %}
+    <span class="mismatch">{{ sqlval }}</span>
+    <span class="mismatch">{{ redisval }}</span>
+  {% endif %}
+{% endmacro %}
+
+{% for entry in results %}
+<div class="state-entry">
+  <h2>📘 {{ entry.book_id }}</h2>
+
+  {% set sql = entry.sqlite %}
+  {% set redis = entry.redis %}
+  {% set merged = entry.would_merge_to %}
+
+  <table>
+    <tr>
+      <th>Field</th>
+      <th>SQLite</th>
+      <th>Redis</th>
+      <th>Merged Result</th>
+    </tr>
+    {% for field in [
+      "status", "chapters_total", "downloaded",
+      "chapters_download_done", "chapters_download_skipped", "parsed",
+      "chapters_parsed_done", "audio_done", "audio_skipped", "last_update"
+    ] %}
+    <tr>
+      <td>{{ field }}</td>
+      <td>{{ sql.get(field, '') }}</td>
+      <td>{{ redis.get(field, '') }}</td>
+      <td>{{ merged.get(field, '') }}</td>
+    </tr>
+    {% endfor %}
+  </table>
+</div>
+{% endfor %} {% endblock %} diff --git a/bookscraper/templates/debug/queues.html b/bookscraper/templates/debug/queues.html new file mode 100644 index 0000000..bb2dec9 --- /dev/null +++ b/bookscraper/templates/debug/queues.html @@ -0,0 +1,91 @@ +{% extends "layout.html" %} {% block content %} +

+<h1>Celery Queue Debug</h1>
+
+<section class="workers">
+  <h2>Workers</h2>
+
+  <h3>Active Tasks</h3>
+  <pre>{{ workers_active | tojson(indent=2) }}</pre>
+
+  <h3>Reserved</h3>
+  <pre>{{ workers_reserved | tojson(indent=2) }}</pre>
+
+  <h3>Scheduled</h3>
+  <pre>{{ workers_scheduled | tojson(indent=2) }}</pre>
+</section>
+
+<section class="queues">
+  <h2>Queues</h2>
+
+  {% for q in queues %}
+  <div class="queue">
+    <h3>{{ q.name }} ({{ q.length }} items)</h3>
+
+    <table>
+      <tr>
+        <td>Redis Key</td>
+        <td>{{ q.redis_key }}</td>
+      </tr>
+      <tr>
+        <td>Length</td>
+        <td>{{ q.length }}</td>
+      </tr>
+      <tr>
+        <td>Items (first 30)</td>
+        <td>
+          {% if q["items"] %}
+            {% for item in q["items"] %}
+              <div>• {{ item | e }}</div>
+            {% endfor %}
+          {% else %}
+            No items
+          {% endif %}
+        </td>
+      </tr>
+    </table>
+  </div>
+  {% endfor %}
+</section>
+ + + +{% endblock %} diff --git a/bookscraper/templates/layout.html b/bookscraper/templates/layout.html index af2c248..14d6788 100644 --- a/bookscraper/templates/layout.html +++ b/bookscraper/templates/layout.html @@ -14,6 +14,12 @@ + + + diff --git a/bookscraper/worker/downloader.py b/bookscraper/worker/downloader.py index 6ad8378..e0216cb 100644 --- a/bookscraper/worker/downloader.py +++ b/bookscraper/worker/downloader.py @@ -5,7 +5,7 @@ import requests from io import BytesIO from bs4 import BeautifulSoup from scraper.logger import log_debug -from scraper.utils import clean_text +from scraper.utils.utils import clean_text from urllib.parse import urljoin @@ -103,8 +103,11 @@ class ChapterDownloader: collecting = True continue - text = sib.get_text("\n", strip=True) if hasattr( - sib, "get_text") else str(sib).strip() + text = ( + sib.get_text("\n", strip=True) + if hasattr(sib, "get_text") + else str(sib).strip() + ) if text: parts.append(text) @@ -121,6 +124,7 @@ class ChapterDownloader: vdir = f"{output_base}/v{volume}" import os + os.makedirs(vdir, exist_ok=True) fname = f"{number:05d}_{title}.txt"
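The downloader changes above only touch formatting and an import path, but they sit on two small conventions that are easy to miss in diff form: siblings that are plain strings may not expose get_text(), and chapters are written to output_base/v&lt;volume&gt;/&lt;number&gt;_&lt;title&gt;.txt with a zero-padded number. A self-contained sketch of both, reusing the names from the diff (everything else is illustrative, not the ChapterDownloader itself):

```python
# Illustrative sketch of the two patterns touched in the downloader diff;
# it mirrors the names used there but is not the repository code.
import os

from bs4 import BeautifulSoup


def extract_text(sib):
    # Some siblings (e.g. bare strings between tags) may not expose
    # get_text(), hence the hasattr() fallback used in ChapterDownloader.
    return (
        sib.get_text("\n", strip=True)
        if hasattr(sib, "get_text")
        else str(sib).strip()
    )


def save_chapter(output_base, volume, number, title, text):
    vdir = f"{output_base}/v{volume}"
    os.makedirs(vdir, exist_ok=True)        # same call the diff adds
    fname = f"{number:05d}_{title}.txt"     # zero-padded for stable sorting
    path = os.path.join(vdir, fname)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(text)
    return path


if __name__ == "__main__":
    soup = BeautifulSoup("<p>Hello</p> plain tail", "html.parser")
    parts = [extract_text(s) for s in soup.children]
    save_chapter("output_demo", 1, 3, "demo_chapter", "\n".join(p for p in parts if p))
```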