From 3a62dfae799964a2cc801a2eaea0af51fd4b924f Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Tue, 9 Dec 2025 09:44:06 +0100 Subject: [PATCH] Refactor: unified book_idx state model + controller rewrite + Redis/SQLite sync overhaul MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removed old scraping→controller chain (no more async .get) - New DownloadController pipeline structure - Unified Redis Book State Model (book:{idx}:state) - Updated all Celery tasks for unified IDs - Removed old scraper/db.py - Updated templates and dashboard components - Added debug Inspect State system with bookcard preview - Updated JS dashboard pipeline refresh - Updated init_service + scrape_engine - Improved abort logic --- bookscraper/app.py | 161 +++++++------- bookscraper/db/db.py | 50 +++-- bookscraper/db/repository.py | 174 +++++++-------- bookscraper/db/state_redis.py | 88 ++++++-- bookscraper/db/state_sql.py | 73 +++--- bookscraper/scraper/abort.py | 67 +++--- bookscraper/scraper/book_scraper.py | 39 ++-- bookscraper/scraper/db.py | 0 bookscraper/scraper/download_controller.py | 207 ++++++------------ bookscraper/scraper/replacements/junk.txt | 2 + bookscraper/scraper/services/init_service.py | 46 ++-- bookscraper/scraper/services/scrape_engine.py | 37 ++-- bookscraper/scraper/tasks/controller_tasks.py | 168 +++++++++----- bookscraper/scraper/tasks/download_tasks.py | 75 ++++--- bookscraper/scraper/tasks/parse_tasks.py | 73 +++--- bookscraper/scraper/tasks/pipeline.py | 15 +- bookscraper/scraper/tasks/save_tasks.py | 35 +-- bookscraper/scraper/tasks/scraping.py | 96 ++++---- bookscraper/scraper/utils/state_sync.py | 175 +++++++++------ bookscraper/static/js/dashboard.js | 102 +++++---- bookscraper/static/js/log_view.js | 52 ++--- bookscraper/static/js/progress.js | 44 ++-- .../templates/components/book_list_item.html | 12 +- .../templates/components/bookcard.html | 10 +- .../templates/components/progress_box.html | 7 +- .../templates/dashboard/book_detail.html | 13 +- .../templates/debug/inspect_state.html | 97 ++++---- bookscraper/templates/result.html | 123 ++--------- 28 files changed, 1037 insertions(+), 1004 deletions(-) delete mode 100644 bookscraper/scraper/db.py diff --git a/bookscraper/app.py b/bookscraper/app.py index ce713d6..07f8679 100644 --- a/bookscraper/app.py +++ b/bookscraper/app.py @@ -28,9 +28,10 @@ from db.repository import ( get_progress, ) +from logbus.publisher import log from scraper.logger import log_debug from scraper.abort import set_abort -from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta +from scraper.ui_log import get_ui_logs, reset_ui_logs from scraper.state import state as r from scraper.logger_decorators import logcall from scraper.utils.state_sync import sync_books_from_redis @@ -42,7 +43,6 @@ init_db() app = Flask(__name__) - # ===================================================== # STATIC FILE SERVING # ===================================================== @@ -70,25 +70,25 @@ def index(): @logcall def dashboard(): logs_list = get_ui_logs() or [] - - # Filter hidden books ONLY for GUI + registered_books = get_registered_books() + log(f"[WEB] Registered books: {registered_books}") reg = [b for b in get_registered_books() if b.get("status") != "hidden"] return render_template( "dashboard/dashboard.html", - books=list_active_books(), # Redis - registered=reg, # SQLite (filtered) + books=list_active_books(), + registered=reg, logs=logs_list, ) -@app.route("/book/") +@app.route("/book/") @logcall -def 
book_detail(book_id):
-    title = r.get(f"book:{book_id}:title") or book_id
+def book_detail(book_idx):
+    title = r.get(f"book:{book_idx}:title") or book_idx
     return render_template(
         "dashboard/book_detail.html",
-        book_id=book_id,
+        book_id=book_idx,
         title=title,
         logs=get_ui_logs(),
     )
@@ -102,13 +102,6 @@ def book_detail(book_id):
 @app.route("/init", methods=["POST"])
 @logcall
 def init_book():
-    """
-    INIT-flow:
-    - user enters URL
-    - metadata fetch
-    - insert into SQLite as 'registered'
-    - return dashboard
-    """
     url = request.form.get("url", "").strip()
 
     if not url:
@@ -146,47 +139,64 @@ def init_book():
     )
 
 
-@app.route("/hide/<book_id>", methods=["POST"])
+@app.route("/hide/<book_idx>", methods=["POST"])
 @logcall
-def hide_registered_book(book_id):
-    """
-    Soft-delete/hide voor GUI.
-    De DB blijft intact.
-    """
-    # try:
-    #     hide_book(book_id)
-    #     return redirect("/dashboard")
-    #     # return jsonify({"status": "ok", "hidden": book_id})
-    # except Exception as e:
-    #     return jsonify({"status": "error", "message": str(e)}), 500
+def hide_registered_book(book_idx):
+    # intentionally left disabled
+    pass
 
 
 @app.route("/start", methods=["POST"])
 @logcall
 def start_scraping():
-    """
-    Start FULL scraping vanuit een geregistreerd INIT-record.
-    """
-    book_id = request.form.get("book_id")
-    if not book_id:
-        return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400
-
-    book = fetch_book(book_id)
+    # 1) Form field: book_idx
+    book_idx = request.form.get("book_idx")
+    log(f"[WEB][START] Received start request for book_idx={book_idx}")
+    if not book_idx:
+        msg = "book_idx missing from form"
+        log(f"[WEB][START] ERROR: {msg}")
+        return jsonify({"status": "error", "message": msg}), 400
+
+    # 2) Fetch the book from SQLite
+    try:
+        book = fetch_book(book_idx)
+        log(f"[WEB][START] Fetched book from DB: {book}")
+    except Exception as e:
+        log(f"[WEB][START] DB ERROR: {e}")
+        return jsonify({"status": "error", "message": "DB error"}), 500
+
     if not book:
-        return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404
+        msg = f"Book '{book_idx}' not found in DB"
+        log(f"[WEB][START] ERROR: {msg}")
+        return jsonify({"status": "error", "message": msg}), 404
 
+    # 3) The book must have a URL
     url = book.get("book_url")
     if not url:
-        return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500
+        msg = f"Book '{book_idx}' has no book_url in DB"
+        log(f"[WEB][START] ERROR: {msg}")
+        return jsonify({"status": "error", "message": msg}), 500
 
+    # 4) Reset UI logs
     reset_ui_logs()
-    log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}")
 
-    async_result = celery_app.send_task(
-        "scraper.tasks.scraping.start_scrape_book",
-        args=[url],
-        queue="scraping",
-    )
+    # 5) Logging
+    log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
+    log_debug(f"[WEB][START] DEBUG: book data = {book}")
+
+    # 6) Start the Celery controller task
+    try:
+        async_result = celery_app.send_task(
+            "scraper.tasks.controller_tasks.start_full_scrape",
+            args=[book_idx],
+            queue="controller",
+        )
+    except Exception as e:
+        log(f"[WEB][START] Celery ERROR: {e}")
+        return jsonify({"status": "error", "message": f"Celery error: {e}"}), 500
+
+    # 7) Task dispatched successfully
+    log(f"[WEB][START] Task dispatched: {async_result.id}")
 
     reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
 
@@ -199,12 +209,12 @@ def start_scraping():
     )
 
 
-@app.route("/abort/<book_id>", methods=["POST"])
+@app.route("/abort/<book_idx>", methods=["POST"])
 @logcall
-def abort_download(book_id):
-    log_debug(f"[WEB] Abort requested for book: {book_id}")
-    set_abort(book_id)
-    return jsonify({"status": "ok", "aborted": book_id})
+def abort_download(book_idx):
+    log_debug(f"[WEB] Abort requested for book: {book_idx}")
+    set_abort(book_idx)
+    return jsonify({"status": "ok", "aborted": book_idx})
 
 
 # =====================================================
@@ -218,23 +228,23 @@ def api_books():
     return jsonify(list_active_books())
 
 
-@app.route("/api/book/<book_id>/status")
+@app.route("/api/book/<book_idx>/status")
 @logcall
-def api_book_status(book_id):
-    return jsonify(getStatus(book_id))
+def api_book_status(book_idx):
+    return jsonify(getStatus(book_idx))
 
 
-@app.route("/api/book/<book_id>/logs")
+@app.route("/api/book/<book_idx>/logs")
 @logcall
-def api_book_logs(book_id):
-    logs = r.lrange(f"logs:{book_id}", 0, -1) or []
+def api_book_logs(book_idx):
+    logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
     return jsonify(logs)
 
 
-@app.route("/progress/<book_id>")
+@app.route("/progress/<book_idx>")
 @logcall
-def progress(book_id):
-    return jsonify(get_progress(book_id))
+def progress(book_idx):
+    return jsonify(get_progress(book_idx))
 
 
 @app.route("/celery-result/")
@@ -258,16 +268,13 @@ def clear_logs():
 @app.route("/logs", methods=["GET"])
 @logcall
 def logs():
-    # LAST_LOG_INDEX vanuit de client (default = -1 bij eerste call)
     try:
         last_index = int(request.args.get("last_index", -1))
     except:
         last_index = -1
 
-    # Haal volledige huidige loglijst op
     all_logs = get_ui_logs() or []
 
-    # Delta: alle regels met index > last_index
     new_lines = []
     new_last = last_index
 
@@ -282,6 +289,8 @@ def logs():
 # =====================================================
 # SECTION 4 — DEBUG ROUTES
 # =====================================================
+
+
 @app.route("/debug/sync_state", methods=["GET"])
 def debug_sync_state():
     results = sync_books_from_redis()
@@ -293,13 +302,6 @@ from scraper.utils.state_sync import inspect_books_state
 
 
 @app.route("/debug/inspect_state", methods=["GET"])
 def debug_inspect_state():
-    """
-    Shows:
-    - raw SQLite values,
-    - raw Redis values,
-    - what the merged result WOULD be.
-    No writes happen.
- """ results = inspect_books_state() return render_template("debug/inspect_state.html", results=results) @@ -339,10 +341,10 @@ def api_db_books(): # ============================================= # DEBUG QUEUE VIEW (HTML) # ============================================= + from flask import render_template from urllib.parse import urlparse import redis -import os from celery_app import celery_app @@ -354,11 +356,10 @@ def debug_queues(): workers_scheduled = insp.scheduled() or {} workers_reserved = insp.reserved() or {} - # ---- Redis connection ---- redis_url = os.getenv("REDIS_BROKER") parsed = urlparse(redis_url) - r = redis.Redis( + r2 = redis.Redis( host=parsed.hostname, port=parsed.port, db=int(parsed.path.strip("/") or 0), @@ -375,8 +376,8 @@ def debug_queues(): { "name": q, "redis_key": key, - "length": r.llen(key), - "items": r.lrange(key, 0, 30), # first 30 entries + "length": r2.llen(key), + "items": r2.lrange(key, 0, 30), } ) except Exception as e: @@ -404,17 +405,17 @@ def debug_queues(): @logcall -def getStatus(book_id): - state = r.hgetall(f"book:{book_id}:state") +def getStatus(book_idx): + state = r.hgetall(f"book:{book_idx}:state") status = state.get("status") or "unknown" dl_done = int(state.get("chapters_download_done", 0)) dl_skipped = int(state.get("chapters_download_skipped", 0)) dl_total = int(state.get("chapters_total", 0)) au_done = int(state.get("audio_done") or 0) - title = state.get("title") or book_id + title = state.get("title") or book_idx return { - "book_id": book_id, + "book_id": book_idx, "title": title, "status": status, "download_done": dl_done, @@ -431,8 +432,8 @@ def list_active_books(): for key in r.scan_iter(match="book:*:state", count=1000): first = key.find(":") second = key.find(":", first + 1) - book_id = key[first + 1 : second] - books.append(getStatus(book_id)) + book_idx = key[first + 1 : second] + books.append(getStatus(book_idx)) return books diff --git a/bookscraper/db/db.py b/bookscraper/db/db.py index f9d4363..163f16f 100644 --- a/bookscraper/db/db.py +++ b/bookscraper/db/db.py @@ -1,12 +1,11 @@ # ============================================================ -# File: db/db.py +# File: db/db.py (UPDATED for book_idx-only architecture) # Purpose: # Raw SQLite engine for BookScraper. -# Provides ONLY low-level DB primitives. 
-# - Connection management (existing DELETE journal mode) +# - Connection management # - init_db() schema creation + safe schema upgrade -# - upsert_book() atomic write -# - raw fetch helpers (private) +# - upsert_book() atomic write (now uses book_idx) +# - raw fetch helpers # ============================================================ import os @@ -52,12 +51,12 @@ def init_db(): conn = get_db() # -------------------------------------------------------- - # BASE SCHEMA (unchanged) + # BASE SCHEMA — book_idx is now PRIMARY KEY # -------------------------------------------------------- conn.execute( """ CREATE TABLE IF NOT EXISTS books ( - book_id TEXT PRIMARY KEY, + book_idx INTEGER PRIMARY KEY, title TEXT, author TEXT, description TEXT, @@ -81,7 +80,7 @@ def init_db(): conn.commit() # -------------------------------------------------------- - # SCHEMA UPGRADE UTIL + # SCHEMA UPGRADE UTILITY # -------------------------------------------------------- def add_column(name, type_): try: @@ -92,16 +91,16 @@ def init_db(): cols = conn.execute("PRAGMA table_info(books);").fetchall() colnames = [c[1] for c in cols] - # Existing: ensure new metadata fields exist + # -------------------------------------------------------- + # UPGRADE NEW FIELDS — future-proof, matched with Redis state model + # -------------------------------------------------------- + + # (book_idx already exists as PRIMARY KEY — no need to add) + add_column("description", "TEXT") add_column("cover_path", "TEXT") add_column("book_url", "TEXT") - # -------------------------------------------------------- - # NEW FIELDS — MATCH REDIS STATE MODEL (future-proof) - # These do NOT change logic, but enable repository snapshot sync. - # -------------------------------------------------------- - # Download counters add_column("chapters_download_done", "INTEGER DEFAULT 0") add_column("chapters_download_skipped", "INTEGER DEFAULT 0") @@ -116,13 +115,18 @@ def init_db(): # ------------------------------------------------------------ -# WRITE OPERATIONS +# WRITE OPERATIONS (book_idx-based UPSERT) # ------------------------------------------------------------ -def upsert_book(book_id, **fields): +def upsert_book(book_idx, **fields): + """ + UPSERT by book_idx. + Replaces old upsert that used book_id. 
+ """ + conn = get_db() - keys = ["book_id"] + list(fields.keys()) - values = [book_id] + list(fields.values()) + keys = ["book_idx"] + list(fields.keys()) + values = [book_idx] + list(fields.values()) placeholders = ",".join(["?"] * len(values)) updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()]) @@ -130,7 +134,7 @@ def upsert_book(book_id, **fields): sql = f""" INSERT INTO books ({','.join(keys)}) VALUES ({placeholders}) - ON CONFLICT(book_id) + ON CONFLICT(book_idx) DO UPDATE SET {updates}, last_update = CURRENT_TIMESTAMP; """ @@ -140,11 +144,13 @@ def upsert_book(book_id, **fields): # ------------------------------------------------------------ -# RAW READ OPERATIONS (PRIVATE) +# RAW READ OPERATIONS # ------------------------------------------------------------ -def _raw_get_book(book_id): +def _raw_get_book(book_idx): conn = get_db() - row = conn.execute("SELECT * FROM books WHERE book_id = ?;", (book_id,)).fetchone() + row = conn.execute( + "SELECT * FROM books WHERE book_idx = ?;", (book_idx,) + ).fetchone() return dict(row) if row else None diff --git a/bookscraper/db/repository.py b/bookscraper/db/repository.py index faefc94..bebc970 100644 --- a/bookscraper/db/repository.py +++ b/bookscraper/db/repository.py @@ -8,6 +8,9 @@ # - Route counters → Redis (live) + SQLite (snapshot) # - Provide a clean API for tasks and Flask UI # ============================================================ +# ============================================================ +# File: db/repository.py (UPDATED for book_idx-only architecture) +# ============================================================ from scraper.logger_decorators import logcall from logbus.publisher import log @@ -53,43 +56,44 @@ _r = redis.Redis.from_url(REDIS_URL, decode_responses=True) # ============================================================ -# INTERNAL — legacy progress helpers +# INTERNAL — LEGACY PROGRESS HELPERS (kept for UI) +# Keys remain: progress:{book_idx}:* # ============================================================ -def _legacy_set_total(book_id, total): - _r.set(f"progress:{book_id}:total", total) +def _legacy_set_total(book_idx, total): + _r.set(f"progress:{book_idx}:total", total) -def _legacy_inc_completed(book_id): - _r.incr(f"progress:{book_id}:completed") +def _legacy_inc_completed(book_idx): + _r.incr(f"progress:{book_idx}:completed") -def _legacy_inc_skipped(book_id): - _r.incr(f"progress:{book_id}:skipped") +def _legacy_inc_skipped(book_idx): + _r.incr(f"progress:{book_idx}:skipped") -def _legacy_inc_failed(book_id): - _r.incr(f"progress:{book_id}:failed") +def _legacy_inc_failed(book_idx): + _r.incr(f"progress:{book_idx}:failed") -def _legacy_add_failed_chapter(book_id, chapter, reason): +def _legacy_add_failed_chapter(book_idx, chapter, reason): entry = f"Chapter {chapter}: {reason}" - _r.rpush(f"progress:{book_id}:failed_list", entry) + _r.rpush(f"progress:{book_idx}:failed_list", entry) -def _legacy_get_failed_list(book_id): - return _r.lrange(f"progress:{book_id}:failed_list", 0, -1) +def _legacy_get_failed_list(book_idx): + return _r.lrange(f"progress:{book_idx}:failed_list", 0, -1) -def _legacy_get_progress(book_id): - total = int(_r.get(f"progress:{book_id}:total") or 0) - completed = int(_r.get(f"progress:{book_id}:completed") or 0) - skipped = int(_r.get(f"progress:{book_id}:skipped") or 0) - failed = int(_r.get(f"progress:{book_id}:failed") or 0) - abort = _r.exists(f"abort:{book_id}") == 1 - failed_list = _legacy_get_failed_list(book_id) +def _legacy_get_progress(book_idx): + 
total = int(_r.get(f"progress:{book_idx}:total") or 0) + completed = int(_r.get(f"progress:{book_idx}:completed") or 0) + skipped = int(_r.get(f"progress:{book_idx}:skipped") or 0) + failed = int(_r.get(f"progress:{book_idx}:failed") or 0) + abort = _r.exists(f"abort:{book_idx}") == 1 + failed_list = _legacy_get_failed_list(book_idx) return { - "book_id": book_id, + "book_idx": book_idx, "total": total, "completed": completed, "skipped": skipped, @@ -100,29 +104,29 @@ def _legacy_get_progress(book_id): # ============================================================ -# PUBLIC — UI-ready legacy progress access +# PUBLIC — PROGRESS API # ============================================================ @logcall -def get_progress(book_id): - return _legacy_get_progress(book_id) +def get_progress(book_idx): + return _legacy_get_progress(book_idx) @logcall -def add_failed_chapter(book_id, chapter, reason): - _legacy_add_failed_chapter(book_id, chapter, reason) +def add_failed_chapter(book_idx, chapter, reason): + _legacy_add_failed_chapter(book_idx, chapter, reason) @logcall -def get_failed_list(book_id): - return _legacy_get_failed_list(book_id) +def get_failed_list(book_idx): + return _legacy_get_failed_list(book_idx) # ============================================================ # FETCH OPERATIONS (SQLite snapshot) # ============================================================ @logcall -def fetch_book(book_id): - return sql_fetch_book(book_id) +def fetch_book(book_idx): + return sql_fetch_book(book_idx) @logcall @@ -135,7 +139,7 @@ def fetch_all_books(): # ============================================================ @logcall def register_book( - book_id, + book_idx, title, author=None, description=None, @@ -145,6 +149,7 @@ def register_book( ): fields = { + "book_idx": book_idx, "title": title, "author": author, "description": description, @@ -154,19 +159,24 @@ def register_book( "chapters_total": 0, "status": "registered", } - log(f"[DB] Registering new book={book_id} title='{title}'") - sql_register_book(book_id, fields) + + log(f"[DB] Registering new book_idx={book_idx} title='{title}'") + sql_register_book(book_idx, fields) +# ============================================================ +# SCRAPE-FLOW UPDATE +# ============================================================ @logcall def update_book_after_full_scrape( - book_id, + book_idx, title=None, author=None, description=None, cover_url=None, chapters_total=None, ): + fields = {} if title is not None: @@ -182,8 +192,8 @@ def update_book_after_full_scrape( fields["status"] = "active" - log(f"[DB] update full scrape metadata book={book_id}") - sql_update_book(book_id, fields) + log(f"[DB] update metadata for book_idx={book_idx}") + sql_update_book(book_idx, fields) # ============================================================ @@ -206,98 +216,82 @@ def get_active_books(): # STATUS MANAGEMENT # ============================================================ @logcall -def set_status(book_id, status): - log(f"[DB] Setting status for {book_id} to '{status}'") - redis_set_status(book_id, status) - sql_set_status(book_id, status) +def set_status(book_idx, status): + log(f"[DB] Setting status for {book_idx} to '{status}'") + redis_set_status(book_idx, status) + sql_set_status(book_idx, status) # ============================================================ # CHAPTER TOTALS # ============================================================ @logcall -def set_chapters_total(book_id, total): - log(f"[DB] Setting chapter total for {book_id} to {total}") - 
redis_set_chapters_total(book_id, total) - sql_set_chapters_total(book_id, total) - _legacy_set_total(book_id, total) # integrate legacy progress +def set_chapters_total(book_idx, total): + log(f"[DB] Setting chapter total for {book_idx} to {total}") + redis_set_chapters_total(book_idx, total) + sql_set_chapters_total(book_idx, total) + _legacy_set_total(book_idx, total) # ============================================================ # COUNTERS — DOWNLOAD # ============================================================ @logcall -def inc_download_done(book_id, amount=1): - log(f"[DB] Incrementing download done for {book_id} by {amount}") - redis_inc_download_done(book_id, amount) - sql_inc_downloaded(book_id, amount) - _legacy_inc_completed(book_id) +def inc_download_done(book_idx, amount=1): + log(f"[DB] Incrementing download done for {book_idx} by {amount}") + redis_inc_download_done(book_idx, amount) + sql_inc_downloaded(book_idx, amount) + _legacy_inc_completed(book_idx) @logcall -def inc_download_skipped(book_id, amount=1): - log(f"[DB] Incrementing download skipped for {book_id} by {amount}") - redis_inc_download_skipped(book_id, amount) - _legacy_inc_skipped(book_id) +def inc_download_skipped(book_idx, amount=1): + log(f"[DB] Incrementing download skipped for {book_idx} by {amount}") + redis_inc_download_skipped(book_idx, amount) + _legacy_inc_skipped(book_idx) # ============================================================ # COUNTERS — PARSE # ============================================================ @logcall -def inc_parsed_done(book_id, amount=1): - log(f"[DB] Incrementing parsed done for {book_id} by {amount}") - redis_inc_parsed_done(book_id, amount) - sql_inc_parsed(book_id, amount) +def inc_parsed_done(book_idx, amount=1): + log(f"[DB] Incrementing parsed done for {book_idx} by {amount}") + redis_inc_parsed_done(book_idx, amount) + sql_inc_parsed(book_idx, amount) # ============================================================ # COUNTERS — AUDIO # ============================================================ -# ============================================================ -# COUNTERS — AUDIO SKIPPED -# ============================================================ @logcall -def inc_audio_skipped(book_id, amount=1): - log(f"[DB] Incrementing audio skipped for {book_id} by {amount}") - # Redis live counter (maak deze functie in state_redis wanneer nodig) - sql_inc_audio_skipped(book_id, amount) - redis_inc_audio_skipped(book_id, amount) - # Geen SQLite kolom? Dan overslaan. +def inc_audio_skipped(book_idx, amount=1): + log(f"[DB] Incrementing audio skipped for {book_idx} by {amount}") + sql_inc_audio_skipped(book_idx, amount) + redis_inc_audio_skipped(book_idx, amount) @logcall -def inc_audio_done(book_id, amount=1): - log(f"[DB] Incrementing audio done for {book_id} by {amount}") - redis_inc_audio_done(book_id, amount) - sql_inc_audio_done(book_id, amount) +def inc_audio_done(book_idx, amount=1): + log(f"[DB] Incrementing audio done for {book_idx} by {amount}") + redis_inc_audio_done(book_idx, amount) + sql_inc_audio_done(book_idx, amount) # ============================================================ -# BACKWARDS COMPATIBILITY SHIMS (old task API) +# BACKWARDS COMPATIBILITY SHIMS +# These map the old API (book_id) to the new book_idx-only system # ============================================================ - - @logcall -def inc_downloaded(book_id, amount=1): - """ - Old name used by older tasks. - Redirects to new unified counter. 
- """ - return inc_download_done(book_id, amount) +def inc_downloaded(book_idx, amount=1): + return inc_download_done(book_idx, amount) @logcall -def inc_parsed(book_id, amount=1): - """ - Old name used by older tasks. - """ - return inc_parsed_done(book_id, amount) +def inc_parsed(book_idx, amount=1): + return inc_parsed_done(book_idx, amount) @logcall -def inc_audio_done_legacy(book_id, amount=1): - """ - Old audio name used by older tasks. - """ - return inc_audio_done(book_id, amount) +def inc_audio_done_legacy(book_idx, amount=1): + return inc_audio_done(book_idx, amount) diff --git a/bookscraper/db/state_redis.py b/bookscraper/db/state_redis.py index 232d7af..2251f81 100644 --- a/bookscraper/db/state_redis.py +++ b/bookscraper/db/state_redis.py @@ -1,5 +1,5 @@ # ============================================================ -# File: db/state_redis.py +# File: db/state_redis.py (UPDATED for book_idx-only architecture) # Purpose: # Low-level Redis counters/state for BookScraper. # Used ONLY by db.repository façade. @@ -15,11 +15,18 @@ REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") r = redis.Redis.from_url(REDIS_URL, decode_responses=True) +# ------------------------------------------------------------ +# INTERNAL KEY BUILDER +# ------------------------------------------------------------ +def _key(book_idx: str) -> str: + return f"book:{book_idx}:state" + + # ------------------------------------------------------------ # STATUS # ------------------------------------------------------------ -def redis_set_status(book_id: str, status: str): - key = f"book:{book_id}:state" +def redis_set_status(book_idx: str, status: str): + key = _key(book_idx) r.hset(key, "status", status) r.hset(key, "last_update", int(time.time())) @@ -27,8 +34,8 @@ def redis_set_status(book_id: str, status: str): # ------------------------------------------------------------ # TOTAL CHAPTERS # ------------------------------------------------------------ -def redis_set_chapters_total(book_id: str, total: int): - key = f"book:{book_id}:state" +def redis_set_chapters_total(book_idx: str, total: int): + key = _key(book_idx) r.hset(key, "chapters_total", total) r.hset(key, "last_update", int(time.time())) @@ -36,15 +43,15 @@ def redis_set_chapters_total(book_id: str, total: int): # ------------------------------------------------------------ # DOWNLOAD COUNTERS # ------------------------------------------------------------ -def redis_inc_download_done(book_id: str, amount: int = 1): - key = f"book:{book_id}:state" +def redis_inc_download_done(book_idx: str, amount: int = 1): + key = _key(book_idx) r.hincrby(key, "chapters_download_done", amount) r.hset(key, "last_update", int(time.time())) -def redis_inc_download_skipped(book_id: str, amount: int = 1): - log(f"[DB-REDIS] Incrementing download skipped for {book_id} by {amount}") - key = f"book:{book_id}:state" +def redis_inc_download_skipped(book_idx: str, amount: int = 1): + log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}") + key = _key(book_idx) r.hincrby(key, "chapters_download_skipped", amount) r.hset(key, "last_update", int(time.time())) @@ -52,8 +59,8 @@ def redis_inc_download_skipped(book_id: str, amount: int = 1): # ------------------------------------------------------------ # PARSE COUNTERS # ------------------------------------------------------------ -def redis_inc_parsed_done(book_id: str, amount: int = 1): - key = f"book:{book_id}:state" +def redis_inc_parsed_done(book_idx: str, amount: int = 1): + key = _key(book_idx) 
r.hincrby(key, "chapters_parsed_done", amount) r.hset(key, "last_update", int(time.time())) @@ -61,19 +68,64 @@ def redis_inc_parsed_done(book_id: str, amount: int = 1): # ------------------------------------------------------------ # AUDIO COUNTERS # ------------------------------------------------------------ -def redis_inc_audio_done(book_id: str, amount: int = 1): - log(f"[DB-REDIS] Incrementing audio done for {book_id} by {amount}") - key = f"book:{book_id}:state" +def redis_inc_audio_done(book_idx: str, amount: int = 1): + log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}") + key = _key(book_idx) r.hincrby(key, "audio_done", amount) r.hset(key, "last_update", int(time.time())) -def redis_inc_audio_skipped(book_id: str, amount: int = 1): - log(f"[DB-REDIS] Incrementing audio skipped for {book_id} by {amount}") +def redis_inc_audio_skipped(book_idx: str, amount: int = 1): + log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}") """ New: Count skipped audio chapters (timeouts, pre-existing files, abort, etc.) SQL does NOT track this; Redis-only metric. """ - key = f"book:{book_id}:state" + key = _key(book_idx) r.hincrby(key, "audio_skipped", amount) r.hset(key, "last_update", int(time.time())) + + +# ------------------------------------------------------------ +# INITIALISE BOOK STATE +# ------------------------------------------------------------ +def init_book_state(book_id: str, title: str, url: str, chapters_total: int): + """ + Initialiseert de complete Redis state voor een nieuw boek. + LET OP: + - Als een key al bestaat → NIET resetten (progress behouden). + - Alleen missende velden worden toegevoegd. + """ + + key = f"book:{book_id}:state" + + # Bestaat al? Dan vullen we alleen missende velden aan. + exists = r.exists(key) + + pipeline = r.pipeline() + + # Basis metadata + pipeline.hsetnx(key, "book_id", book_id) + pipeline.hsetnx(key, "title", title or "") + pipeline.hsetnx(key, "url", url or "") + + # State + pipeline.hsetnx(key, "status", "registered") + + # Counters + pipeline.hsetnx(key, "chapters_total", chapters_total) + pipeline.hsetnx(key, "chapters_download_done", 0) + pipeline.hsetnx(key, "chapters_download_skipped", 0) + pipeline.hsetnx(key, "chapters_parsed_done", 0) + pipeline.hsetnx(key, "audio_done", 0) + pipeline.hsetnx(key, "audio_skipped", 0) + + # Timestamp + pipeline.hset(key, "last_update", int(time.time())) + + pipeline.execute() + + if exists: + log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_id}") + else: + log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_id}") diff --git a/bookscraper/db/state_sql.py b/bookscraper/db/state_sql.py index ec7626a..b5ecb79 100644 --- a/bookscraper/db/state_sql.py +++ b/bookscraper/db/state_sql.py @@ -1,5 +1,5 @@ # ============================================================ -# File: db/state_sql.py +# File: db/state_sql.py (UPDATED for book_idx-only architecture) # Purpose: # Low-level SQLite snapshot layer for BookScraper metadata. # Used ONLY through db.repository façade. 
@@ -10,7 +10,8 @@ import os from logbus.publisher import log -DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/db/books.db") +# Must match db/db.py +DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db") # ------------------------------------------------------------ @@ -25,10 +26,10 @@ def _connect(): # ------------------------------------------------------------ # FETCH # ------------------------------------------------------------ -def sql_fetch_book(book_id): +def sql_fetch_book(book_idx): conn = _connect() cur = conn.cursor() - cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,)) + cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,)) row = cur.fetchone() conn.close() return dict(row) if row else None @@ -37,7 +38,7 @@ def sql_fetch_book(book_id): def sql_fetch_all_books(): conn = _connect() cur = conn.cursor() - cur.execute("SELECT * FROM books ORDER BY rowid DESC") + cur.execute("SELECT * FROM books ORDER BY created_at DESC") rows = cur.fetchall() conn.close() return [dict(r) for r in rows] @@ -46,22 +47,27 @@ def sql_fetch_all_books(): # ------------------------------------------------------------ # REGISTER / UPDATE # ------------------------------------------------------------ -def sql_register_book(book_id, fields: dict): +def sql_register_book(book_idx, fields: dict): + """ + Insert or replace entire book record. + book_idx is the PRIMARY KEY. + """ conn = _connect() cur = conn.cursor() - cols = ", ".join(["book_id"] + list(fields.keys())) + cols = ", ".join(["book_idx"] + list(fields.keys())) placeholders = ", ".join(["?"] * (1 + len(fields))) - values = [book_id] + list(fields.values()) + values = [book_idx] + list(fields.values()) cur.execute( - f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", values + f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", + values, ) conn.commit() conn.close() -def sql_update_book(book_id, fields: dict): +def sql_update_book(book_idx, fields: dict): if not fields: return @@ -69,9 +75,12 @@ def sql_update_book(book_id, fields: dict): cur = conn.cursor() set_clause = ", ".join([f"{k} = ?" for k in fields]) - params = list(fields.values()) + [book_id] + params = list(fields.values()) + [book_idx] - cur.execute(f"UPDATE books SET {set_clause} WHERE book_id = ?", params) + cur.execute( + f"UPDATE books SET {set_clause} WHERE book_idx = ?", + params, + ) conn.commit() conn.close() @@ -79,10 +88,13 @@ def sql_update_book(book_id, fields: dict): # ------------------------------------------------------------ # STATUS # ------------------------------------------------------------ -def sql_set_status(book_id, status: str): +def sql_set_status(book_idx, status: str): conn = _connect() cur = conn.cursor() - cur.execute("UPDATE books SET status = ? WHERE book_id = ?", (status, book_id)) + cur.execute( + "UPDATE books SET status = ? WHERE book_idx = ?", + (status, book_idx), + ) conn.commit() conn.close() @@ -90,11 +102,12 @@ def sql_set_status(book_id, status: str): # ------------------------------------------------------------ # CHAPTER TOTAL (snapshot) # ------------------------------------------------------------ -def sql_set_chapters_total(book_id, total: int): +def sql_set_chapters_total(book_idx, total: int): conn = _connect() cur = conn.cursor() cur.execute( - "UPDATE books SET chapters_total = ? WHERE book_id = ?", (total, book_id) + "UPDATE books SET chapters_total = ? 
WHERE book_idx = ?", + (total, book_idx), ) conn.commit() conn.close() @@ -103,63 +116,63 @@ def sql_set_chapters_total(book_id, total: int): # ------------------------------------------------------------ # COUNTERS (SNAPSHOT-ONLY) # ------------------------------------------------------------ -def sql_inc_downloaded(book_id, amount=1): +def sql_inc_downloaded(book_idx, amount=1): conn = _connect() cur = conn.cursor() cur.execute( """ UPDATE books SET downloaded = COALESCE(downloaded,0) + ? - WHERE book_id = ? + WHERE book_idx = ? """, - (amount, book_id), + (amount, book_idx), ) conn.commit() conn.close() -def sql_inc_parsed(book_id, amount=1): +def sql_inc_parsed(book_idx, amount=1): conn = _connect() cur = conn.cursor() cur.execute( """ UPDATE books SET parsed = COALESCE(parsed,0) + ? - WHERE book_id = ? + WHERE book_idx = ? """, - (amount, book_id), + (amount, book_idx), ) conn.commit() conn.close() -def sql_inc_audio_done(book_id, amount=1): - log(f"[DB-SQL] Incrementing audio done for {book_id} by {amount}") +def sql_inc_audio_done(book_idx, amount=1): + log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}") conn = _connect() cur = conn.cursor() cur.execute( """ UPDATE books SET audio_done = COALESCE(audio_done,0) + ? - WHERE book_id = ? + WHERE book_idx = ? """, - (amount, book_id), + (amount, book_idx), ) conn.commit() conn.close() -def sql_inc_audio_skipped(book_id, amount=1): - log(f"[DB-SQL] Incrementing audio skipped for {book_id} by {amount}") +def sql_inc_audio_skipped(book_idx, amount=1): + log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}") conn = _connect() cur = conn.cursor() cur.execute( """ UPDATE books SET audio_skipped = COALESCE(audio_skipped,0) + ? - WHERE book_id = ? + WHERE book_idx = ? """, - (amount, book_id), + (amount, book_idx), ) conn.commit() conn.close() diff --git a/bookscraper/scraper/abort.py b/bookscraper/scraper/abort.py index 2c67f75..9753e53 100644 --- a/bookscraper/scraper/abort.py +++ b/bookscraper/scraper/abort.py @@ -2,8 +2,6 @@ import os import redis from scraper.logger_decorators import logcall - -# GUI log (non-breaking) from scraper.ui_log import push_ui # --------------------------------------------------------- @@ -15,55 +13,58 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True) # Debug mode (optional) ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1" -# Internal flag to avoid spamming the same message +# Avoid duplicate spam _seen_debug_keys = set() # ========================================================= -# ABORT FLAG +# INTERNAL DEBUGGING # ========================================================= - - def _debug(msg: str): - """Print + GUI log (non-breaking, minimal noise).""" print(msg) push_ui(msg) -def set_abort(book_id: str): - """Enable abort mode for this book.""" - key = f"abort:{book_id}" +# ========================================================= +# ABORT FLAG — unified book_idx +# ========================================================= + + +def set_abort(book_idx: str): + """Enable abort mode for book_idx.""" + key = f"abort:{book_idx}" r.set(key, "1") if ABORT_DEBUG: _debug(f"[ABORT] SET {key}") -def clear_abort(book_id: str): +def clear_abort(book_idx: str): """Clear abort flag.""" - key = f"abort:{book_id}" + key = f"abort:{book_idx}" r.delete(key) if ABORT_DEBUG: _debug(f"[ABORT] CLEAR {key}") -def abort_requested(book_id: str, redis_client=None) -> bool: +def abort_requested(book_idx: str, redis_client=None) -> bool: """ - Return True if abort flag is set. 
+ Check whether abort flag is active for book_idx. redis_client: - Docker workers → None → use default Redis (r) - - Local macOS audio → passes Redis(host=127.0.0.1) + - Local macOS audio worker → passes Redis(host=127.0.0.1) """ client = redis_client or r - key = f"abort:{book_id}" + key = f"abort:{book_idx}" try: exists = client.exists(key) if ABORT_DEBUG: - # Log once per key + + # Log only once per book if key not in _seen_debug_keys: try: conn = client.connection_pool.connection_kwargs @@ -71,54 +72,54 @@ def abort_requested(book_id: str, redis_client=None) -> bool: port = conn.get("port") db = conn.get("db") _debug( - f"[ABORT_DEBUG] first check book_id={book_id} " + f"[ABORT_DEBUG] first check book_idx={book_idx} " f"redis={host}:{port} db={db}" ) except Exception: - _debug(f"[ABORT_DEBUG] first check book_id={book_id}") + _debug(f"[ABORT_DEBUG] first check book_idx={book_idx}") _seen_debug_keys.add(key) - # Only log abort ACTIVE + # Log ACTIVE state if exists == 1: - _debug(f"[ABORT] ACTIVE for {book_id}") + _debug(f"[ABORT] ACTIVE for {book_idx}") return exists == 1 except Exception as e: if ABORT_DEBUG: _debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}") - return False # ========================================================= -# PER-CHAPTER STATE +# PER-CHAPTER STATE — unified book_idx # ========================================================= -def mark_chapter_started(book_id: str, chapter_num: int): - key = f"started:{book_id}:{chapter_num}" +def mark_chapter_started(book_idx: str, chapter_num: int): + key = f"started:{book_idx}:{chapter_num}" r.set(key, "1") -def chapter_started(book_id: str, chapter_num: int) -> bool: - key = f"started:{book_id}:{chapter_num}" +def chapter_started(book_idx: str, chapter_num: int) -> bool: + key = f"started:{book_idx}:{chapter_num}" return r.exists(key) == 1 # ========================================================= -# UTILITY: RESET FOR A BOOK +# RESET STATE FOR BOOK_IDX # ========================================================= -def reset_book_state(book_id: str): +def reset_book_state(book_idx: str): """ - Remove abort flag and all chapter-start markers. + Remove abort flag and all per-chapter started markers. """ - key = f"abort:{book_id}" - r.delete(key) + # abort flag + r.delete(f"abort:{book_idx}") - pattern = f"started:{book_id}:*" + # chapter markers + pattern = f"started:{book_idx}:*" for k in r.scan_iter(pattern): r.delete(k) diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py index a34976d..ab5d577 100644 --- a/bookscraper/scraper/book_scraper.py +++ b/bookscraper/scraper/book_scraper.py @@ -4,10 +4,9 @@ # Backwards-compatible wrapper giving the SAME public API # as the old BookScraper, but internally uses ScrapeEngine. # -# execute() → full metadata + chapterlist +# execute() → full metadata + chapterlist (NO book_idx creation) # -# (* Chapter downloading komt later in ScrapeEngine, -# maar deze wrapper hoeft NIET aangepast te worden.) +# ID management is now handled exclusively by InitService. # ============================================================ from scraper.logger_decorators import logcall @@ -18,21 +17,15 @@ class BookScraper: """ Backwards-compatible BookScraper façade. 
- In het oude systeem deed BookScraper ALLES: - - metadata ophalen - - cover ophalen - - hoofdstukkenlijst - - hoofdstukken downloaden - - volume folders - - skip logic + Old responsibilities (metadata, chapters, covers, downloads) + are now split: - In het nieuwe systeem is dát opgesplitst: + ScrapeEngine → metadata + chapterlist + Download tasks → handle download/parse/save + InitService → determines book_idx (single source of truth) - ScrapeEngine → metadata / chapterlist / download engine (in ontwikkeling) - BookScraper → behoudt dezelfde API als voorheen - - Daardoor kunnen Celery-tasks en oudere modules blijven werken - zonder refactor-chaos. + This wrapper intentionally does NOT generate a book_idx or book_id. + It only returns metadata/chapters in legacy-compatible dict format. """ @logcall @@ -43,18 +36,14 @@ class BookScraper: @logcall def execute(self): """ - Public legacy API. - Retourneert metadata + chapters EXACT zoals de oude BookScraper - vóór downloadfase. - - Dit is belangrijk: - - INIT-flow gebruikt metadata only - - scraping tasks gebruiken chapterlist + Legacy public API: + Return metadata + chapter list EXACTLY as before, + but without generating any book_id. """ data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url) - # Legacy output structuur volledig repliceren: + # Legacy structure preserved, unchanged: return { "title": data.get("title"), "author": data.get("author"), @@ -62,5 +51,5 @@ class BookScraper: "cover_url": data.get("cover_url"), "chapters": data.get("chapters", []), "chapters_total": data.get("chapters_total", 0), - "book_url": data.get("book_url"), + "book_url": data.get("book_url"), # used later by parse/save tasks } diff --git a/bookscraper/scraper/db.py b/bookscraper/scraper/db.py deleted file mode 100644 index e69de29..0000000 diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py index 8f5c9ed..90254c7 100644 --- a/bookscraper/scraper/download_controller.py +++ b/bookscraper/scraper/download_controller.py @@ -1,55 +1,54 @@ # ========================================================= # File: scraper/download_controller.py # Purpose: -# Build Celery pipelines for all chapters -# and pass book_id for abort/progress/log functionality. -# + Download and replicate cover image to all volume folders -# + Generate scripts (allinone.txt, makebook, say) -# + Initialize Redis Book State Model (status + counters) +# Build Celery pipelines for all chapters using book_idx +# Handles: +# • volume assignment +# • cover download + replication +# • script generation +# • Redis Book State Model init +# • abort tracking # ========================================================= from celery import group from scraper.tasks.pipeline import build_chapter_pipeline -from scraper.scriptgen import generate_all_scripts + +# ❗ IMPORTANT: +# generate_all_scripts MUST NOT import DownloadController, otherwise circular import. +# We keep the import, but scriptgen must be clean. 
+from scraper import scriptgen + from logbus.publisher import log import os import requests import shutil -from scraper.abort import abort_requested # DEBUG allowed + +from scraper.abort import abort_requested +from db.state_redis import init_book_state +from db.repository import set_status, set_chapters_total class DownloadController: """ - Coordinates all chapter pipelines (download → parse → save), - including: - - volume splitting - - consistent meta propagation - - book_id-based abort + progress tracking - - cover download + volume replication - - script generation (allinone.txt, makebook, say) - - Redis book state initialisation and status updates + Coordinates all chapter pipelines (download → parse → save). """ - def __init__(self, book_id: str, scrape_result: dict): - self.book_id = book_id + def __init__(self, book_idx: str, scrape_result: dict): + self.book_idx = str(book_idx) self.scrape_result = scrape_result - # Core metadata + # Metadata self.title = scrape_result.get("title", "UnknownBook") self.chapters = scrape_result.get("chapters", []) or [] self.cover_url = scrape_result.get("cover_url") - # Output base dir + # Output folder root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") - - # Volume size self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200")) - - # Base folder for the whole book self.book_base = os.path.join(root, self.title) os.makedirs(self.book_base, exist_ok=True) - # Meta passed to parse/save stage + # Meta passed downstream self.meta = { "title": self.title, "author": scrape_result.get("author"), @@ -57,200 +56,120 @@ class DownloadController: "book_url": scrape_result.get("book_url"), } - # ------------------------------------------------- - # DEBUG — bevestig dat controller correct book_id ziet - # ------------------------------------------------- - log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'") + log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}") - try: - abort_state = abort_requested(book_id) - log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}") - except Exception as e: - log(f"[CTRL_DEBUG] abort_requested ERROR: {e}") - - # ------------------------------------------------- - # NEW: Initialize Redis Book State Model - # ------------------------------------------------- + # Init Redis Book State Model try: init_book_state( - book_id=self.book_id, + book_id=self.book_idx, title=self.title, - url=self.scrape_result.get("book_url"), + url=self.meta["book_url"], chapters_total=len(self.chapters), ) - log(f"[CTRL_STATE] init_book_state() completed for {self.title}") except Exception as e: log(f"[CTRL_STATE] init_book_state FAILED: {e}") - # --------------------------------------------------------- - # Cover Download # --------------------------------------------------------- def download_cover(self): - """Download one cover image into the root of the book folder.""" if not self.cover_url: - log(f"[CTRL] No cover URL found for '{self.title}'") - return + return log(f"[CTRL] No cover URL for '{self.title}'") cover_path = os.path.join(self.book_base, "cover.jpg") headers = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) " - "Gecko/20100101 Firefox/118.0" - ), - "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/", + "User-Agent": "Mozilla/5.0", + "Referer": self.scrape_result.get("book_url") or "", } try: log(f"[CTRL] Downloading cover: {self.cover_url}") - resp = requests.get(self.cover_url, timeout=10, headers=headers) resp.raise_for_status() with 
open(cover_path, "wb") as f: f.write(resp.content) - log(f"[CTRL] Cover saved to: {cover_path}") - + log(f"[CTRL] Cover saved: {cover_path}") except Exception as e: - log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})") + log(f"[CTRL] Cover download failed: {e}") - # --------------------------------------------------------- - # Cover Replication to Volumes # --------------------------------------------------------- def replicate_cover_to_volumes(self): - """Copy cover.jpg into each existing Volume_xxx directory.""" src = os.path.join(self.book_base, "cover.jpg") if not os.path.exists(src): - log("[CTRL] No cover.jpg found, replication skipped") return - try: - - for entry in os.listdir(self.book_base): - if entry.lower().startswith("volume_"): - vol_dir = os.path.join(self.book_base, entry) - dst = os.path.join(vol_dir, "cover.jpg") - + for entry in os.listdir(self.book_base): + if entry.lower().startswith("volume_"): + dst = os.path.join(self.book_base, entry, "cover.jpg") + try: shutil.copyfile(src, dst) - log(f"[CTRL] Cover replicated into: {dst}") - - except Exception as e: - log(f"[CTRL] Cover replication failed: {e}") + log(f"[CTRL] Cover replicated → {dst}") + except Exception as e: + log(f"[CTRL] Cover replication failed: {e}") + # --------------------------------------------------------- def store_cover_in_static(self): - """ - Copy the main cover.jpg from book_base into static/covers/.jpg. - This allows the Flask web UI to serve the cover directly. - """ - src = os.path.join(self.book_base, "cover.jpg") if not os.path.exists(src): - log("[CTRL] No cover.jpg found, cannot store in static/covers") return - # static/covers/.jpg - static_dir = os.path.join("static", "covers") - os.makedirs(static_dir, exist_ok=True) - - dst = os.path.join(static_dir, f"{self.book_id}.jpg") + os.makedirs("static/covers", exist_ok=True) + dst = os.path.join("static/covers", f"{self.book_idx}.jpg") try: shutil.copyfile(src, dst) log(f"[CTRL] Cover stored for UI: {dst}") except Exception as e: - log(f"[CTRL] Failed to store cover in static: {e}") + log(f"[CTRL] Failed storing cover: {e}") - # --------------------------------------------------------- - # Volume isolation # --------------------------------------------------------- def get_volume_path(self, chapter_num: int) -> str: - """Returns the correct volume directory for a chapter.""" vol_index = (chapter_num - 1) // self.max_vol + 1 vol_name = f"Volume_{vol_index:03d}" vol_path = os.path.join(self.book_base, vol_name) os.makedirs(vol_path, exist_ok=True) return vol_path - # --------------------------------------------------------- - # Pipeline launcher # --------------------------------------------------------- def start(self): total = len(self.chapters) + log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)") - log( - f"[CTRL] Initialising pipeline for '{self.title}' " - f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})" - ) - log(f"[CTRL] Output root: {self.book_base}") - - # ------------------------------------- - # NEW: Redis state update - # ------------------------------------- + # Update Redis/SQLite state try: - set_status(self.book_id, "downloading") - set_chapter_total(self.book_id, total) - log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}") + set_status(self.book_idx, "downloading") + set_chapters_total(self.book_idx, total) except Exception as e: - log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}") + log(f"[CTRL_STATE] Unable to set state: {e}") - # 
------------------------------------- - # 1) Download cover - # ------------------------------------- + # Download cover self.download_cover() + # Build pipeline tasks tasks = [] - for ch in self.chapters: - - # Build chapter_dict (NEW) - chapter_num = ch["num"] - chapter_url = ch["url"] - chapter_title = ch.get("title") - - volume_path = self.get_volume_path(chapter_num) - - chapter_dict = { - "num": chapter_num, - "url": chapter_url, - "title": chapter_title, - "volume_path": volume_path, + num = ch["num"] + chapter_info = { + "num": num, + "url": ch["url"], + "title": ch.get("title"), + "volume_path": self.get_volume_path(num), } - - # Dispatch pipeline with chapter_dict - tasks.append( - build_chapter_pipeline( - self.book_id, - chapter_dict, - self.meta, - ) - ) + tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta)) async_result = group(tasks).apply_async() - log( - f"[CTRL] Pipelines dispatched for '{self.title}' " - f"(book_id={self.book_id}, group_id={async_result.id})" - ) - - # Debug abort state - try: - abort_state = abort_requested(self.book_id) - log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}") - except Exception as e: - log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}") - - # ------------------------------------------------------- + # Replicate cover + place in static self.replicate_cover_to_volumes() self.store_cover_in_static() - # ------------------------------------------------------- + + # Generate scripts (LATE IMPORT to avoid circular) try: - generate_all_scripts( - self.book_base, - self.title, - self.meta.get("author"), + scriptgen.generate_all_scripts( + self.book_base, self.title, self.meta["author"] ) - log(f"[CTRL] Scripts generated for '{self.title}'") + log("[CTRL] Scripts generated") except Exception as e: log(f"[CTRL] Script generation failed: {e}") diff --git a/bookscraper/scraper/replacements/junk.txt b/bookscraper/scraper/replacements/junk.txt index ae77872..dbed404 100644 --- a/bookscraper/scraper/replacements/junk.txt +++ b/bookscraper/scraper/replacements/junk.txt @@ -54,6 +54,8 @@ Copyright= 章节出错= 点此举报= 举报原因= +求收藏= +推荐票= www.piaotia.com= www.piaotian.com= www.= diff --git a/bookscraper/scraper/services/init_service.py b/bookscraper/scraper/services/init_service.py index 601c748..b39be07 100644 --- a/bookscraper/scraper/services/init_service.py +++ b/bookscraper/scraper/services/init_service.py @@ -4,7 +4,7 @@ # Orchestrate INIT-flow: # - resolve site # - fetch minimal metadata -# - derive book_id +# - derive book_idx # - register in SQLite # - store main cover # ============================================================ @@ -21,33 +21,47 @@ from scraper.logger_decorators import logcall class InitService: + # ------------------------------------------------------------ + # BOOK IDX DERIVATION + # ------------------------------------------------------------ @staticmethod @logcall def derive_book_id(url: str) -> str: """ PTWXZ URL format ends with /{id}.html. If no match → fallback to sanitized URL. + + Returns: + book_idx (string) """ m = re.search(r"/(\d+)\.html$", url) if m: return m.group(1) - return url.replace("/", "_") + # Fallback — ensures deterministic ID for unknown formats + return url.replace("/", "_").replace(":", "_") + + # ------------------------------------------------------------ + # MAIN INIT FLOW + # ------------------------------------------------------------ @staticmethod @logcall def execute(url: str) -> dict: """ - Main INIT-flow entry point. 
- Returns complete metadata + registration info. + INIT entry point. + Returns complete metadata + registration result. """ - # 1) Determine site + # 1) Resolve site handler site = SiteResolver.resolve(url) - book_id = InitService.derive_book_id(url) + # 2) Create unified book_idx + book_idx = InitService.derive_book_id(url) + + # Some site objects historically expect .book_id — we support it but DO NOT rely on it. + site.book_id = book_idx - site.book_id = book_id - # 2) Metadata only + # 3) Fetch initial metadata (title/author/description/cover) meta = ScrapeEngine.fetch_metadata_only(site, url) title = meta.get("title") or "Unknown" @@ -55,27 +69,27 @@ class InitService: description = meta.get("description") cover_url = meta.get("cover_url") - # 4) Download UI cover (NEW: capture returned local path) - cover_path = CoverService.download_main_cover(cover_url, book_id) + # 4) Download & store main cover for UI + cover_path = CoverService.download_main_cover(cover_url, book_idx) - # 5) SQLite registration INCLUDING cover_path ← ★ FIX + # 5) Register in SQLite (book_idx is the SOLE primary ID) register_book( - book_id=book_id, + book_idx=book_idx, title=title, author=author, description=description, cover_url=cover_url, - cover_path=cover_path, # ← ★ BELANGRIJK + cover_path=cover_path, book_url=url, ) - # 6) Output for UI + # 6) Return metadata for UI / API return { - "book_id": book_id, + "book_idx": book_idx, "title": title, "author": author, "description": description, "cover_url": cover_url, - "cover_path": cover_path, # ← handig voor UI + "cover_path": cover_path, "status": "registered", } diff --git a/bookscraper/scraper/services/scrape_engine.py b/bookscraper/scraper/services/scrape_engine.py index 041d0f3..9e00ac9 100644 --- a/bookscraper/scraper/services/scrape_engine.py +++ b/bookscraper/scraper/services/scrape_engine.py @@ -1,8 +1,8 @@ # ============================================================ -# File: scraper/services/scrape_engine.py +# File: scraper/services/scrape_engine.py (C&U — no circular import) # Purpose: # Unified scraping engine for INIT-flow and Celery tasks. -# All functions are fully logged via @logcall. +# ScrapeEngine does NOT determine book_idx itself. # ============================================================ import os @@ -23,6 +23,10 @@ class ScrapeEngine: Central scraping engine. Metadata + chapterlist scraping. All methods logged with @logcall. + + IMPORTANT: + - ScrapeEngine NEVER decides book_idx. + - No dependency on InitService (prevents circular import). """ # ------------------------------------------------------------ @@ -140,26 +144,23 @@ class ScrapeEngine: return "\n".join(parts) # ------------------------------------------------------------ - # COVER PARSER + # COVER PARSER (NO InitService dependency) # ------------------------------------------------------------ @staticmethod @logcall def _parse_cover(soup, site): """ - Vind cover door book_id substring matching: - - haal book_id uit site.url - - zoek IMG-tags waarvan filename book_id bevat - - kies kortste filename als beste match + Extract book index from URL heuristically instead of InitService + (prevents circular import). 
""" + + # Typical Chinese novel sites embed numeric ID in URL path try: parsed = urlparse(site.url) - m = re.search(r"/(\d+)\.html$", parsed.path) - if m: - book_id = m.group(1) - else: - book_id = parsed.path.rstrip("/").split("/")[-1] + digits = re.findall(r"\d+", parsed.path) + book_idx = digits[-1] if digits else None except Exception: - return None + book_idx = None imgs = soup.find_all("img", src=True) candidates = [] @@ -167,16 +168,14 @@ class ScrapeEngine: for img in imgs: src = img["src"].strip() filename = os.path.basename(src) - if book_id in filename: + if book_idx and book_idx in filename: candidates.append((filename, src)) if not candidates: return None - candidates.sort(key=lambda t: len(t[0])) # kortste filename wint - best_src = candidates[0][1] - - return urljoin(site.root, best_src) + candidates.sort(key=lambda t: len(t[0])) # smallest filename + return urljoin(site.root, candidates[0][1]) # ------------------------------------------------------------ # RESOLVE CHAPTER PAGE @@ -233,7 +232,7 @@ class ScrapeEngine: def fetch_metadata_only(site, url: str) -> dict: ScrapeEngine._apply_replacements(site) soup = ScrapeEngine._get_doc(url, site) - site.url = url # NODIG voor cover parsing + site.url = url # needed for cover parsing return { "title": ScrapeEngine._parse_title(soup), diff --git a/bookscraper/scraper/tasks/controller_tasks.py b/bookscraper/scraper/tasks/controller_tasks.py index 0992dfa..e8d0e95 100644 --- a/bookscraper/scraper/tasks/controller_tasks.py +++ b/bookscraper/scraper/tasks/controller_tasks.py @@ -1,109 +1,167 @@ # ============================================================ # File: scraper/tasks/controller_tasks.py # Purpose: -# Start the download → parse → save pipeline for a scraped book, -# including progress/abort tracking via book_id. -# ONLY THE CONTROLLER UPDATES PROGRESS (initial total). +# FULL scrape entrypoint + launching download/parse/save pipelines. +# NO result.get() anywhere. Scraping is done inline. # ============================================================ from celery_app import celery_app from logbus.publisher import log -from scraper.download_controller import DownloadController - +import os +import time +import redis from urllib.parse import urlparse from scraper.logger_decorators import logcall -import redis -import os from scraper.abort import abort_requested -from db.repository import set_chapters_total + +from scraper.services.scrape_engine import ScrapeEngine +from scraper.services.site_resolver import SiteResolver + +from db.repository import fetch_book, set_chapters_total +from scraper.download_controller import DownloadController + print(">>> [IMPORT] controller_tasks.py loaded") -@celery_app.task(bind=True, queue="controller", ignore_result=False) +# ============================================================= +# 1) PUBLIC ENTRYPOINT — CALLED FROM /start +# ============================================================= +@celery_app.task( + bind=True, + queue="controller", + ignore_result=False, + name="scraper.tasks.controller_tasks.start_full_scrape", +) @logcall -def launch_downloads(self, book_id: str, scrape_result: dict): +def start_full_scrape(self, book_idx: str): """ - Launch the entire pipeline (download → parse → save), - AND initialize progress counters. + FULL SCRAPE ENTRYPOINT. + Scraping is done inline → no Celery .get() needed. 
+ """ + + log(f"[CTRL] start_full_scrape(book_idx={book_idx})") + + # Abort before doing anything + if abort_requested(book_idx): + log(f"[CTRL] PRE-ABORT flag detected for {book_idx}") + return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"} + + # -------------------------------------------------------- + # 1) Load book metadata from SQLite + # -------------------------------------------------------- + book = fetch_book(book_idx) + if not book: + msg = f"[CTRL] Book '{book_idx}' not found in DB" + log(msg) + raise ValueError(msg) + + url = book.get("book_url") + if not url: + msg = f"[CTRL] No book_url stored for {book_idx}" + log(msg) + raise ValueError(msg) + + # -------------------------------------------------------- + # 2) INLINE SCRAPE (fast, no Celery wait) + # -------------------------------------------------------- + site = SiteResolver.resolve(url) + + try: + scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url) + log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}") + except Exception as e: + log(f"[CTRL] ERROR during scrape of {book_idx}: {e}") + raise - Chapter-level progress is updated INSIDE the download/parse/save tasks. - This task MUST NOT call .get() on async subtasks (Celery restriction). + # -------------------------------------------------------- + # 3) Continue → dispatch pipelines + # -------------------------------------------------------- + return launch_downloads(book_idx, scrape_result) + + +# ============================================================= +# 2) PIPELINE DISPATCH (NOT a Celery task) +# ============================================================= +@logcall +def launch_downloads(book_idx: str, scrape_result: dict): + """ + Launches the entire processing pipeline: + - initialize Redis UI state + - initialize SQLite totals + - dispatch per-chapter pipelines via DownloadController """ title = scrape_result.get("title", "UnknownBook") chapters = scrape_result.get("chapters", []) or [] total = len(chapters) + # ------------------------------------------------------------ - # INIT BOOK STATE MODEL (required for Active Books dashboard) + # INIT REDIS STATE # ------------------------------------------------------------ - broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0") parsed = urlparse(broker_url) - state = redis.Redis( + r = redis.Redis( host=parsed.hostname, port=parsed.port, db=int(parsed.path.strip("/")), decode_responses=True, ) - # Book metadata - state.set(f"book:{book_id}:title", title) - state.set(f"book:{book_id}:status", "starting") + base = f"book:{book_idx}:state" - # Download counters - state.set(f"book:{book_id}:download:total", total) - state.set(f"book:{book_id}:download:done", 0) - - # Audio counters (start at zero) - state.set(f"book:{book_id}:audio:done", 0) + r.hset(base, "title", title) + r.hset(base, "status", "starting") + r.hset(base, "chapters_total", total) + r.hset(base, "chapters_download_done", 0) + r.hset(base, "chapters_download_skipped", 0) + r.hset(base, "chapters_parsed_done", 0) + r.hset(base, "audio_done", 0) + r.hset(base, "audio_skipped", 0) + r.hset(base, "last_update", int(time.time())) # ------------------------------------------------------------ - # INIT PROGRESS + # INIT SQLITE SNAPSHOT # ------------------------------------------------------------ - set_chapters_total(book_id, total) + try: + set_chapters_total(book_idx, total) + except Exception as e: + log(f"[CTRL] ERROR updating SQLite totals: {e}") + raise - log(f"[CTRL] Progress initialized for 
{book_id}: total={total}") + log(f"[CTRL] Initialized totals for {book_idx}: {total}") # ------------------------------------------------------------ - # BUILD CONTROLLER + # ABORT CHECK BEFORE LAUNCHING JOBS # ------------------------------------------------------------ - ctl = DownloadController(book_id, scrape_result) + if abort_requested(book_idx): + log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}") + r.hset(base, "status", "aborted") + return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"} # ------------------------------------------------------------ - # START PIPELINES (ASYNC) - # Returns a celery group AsyncResult. We DO NOT iterate or get(). - # Progress & failures are handled by the worker subtasks. + # BUILD + DISPATCH PER-CHAPTER PIPELINES # ------------------------------------------------------------ - try: - group_result = ctl.start() - - log( - f"[CTRL] Pipelines dispatched for '{title}' " - f"(book_id={book_id}, group_id={group_result.id})" - ) - - # Abort flag set BEFORE tasks start? - if abort_requested(book_id): - log(f"[CTRL] ABORT requested before tasks start") - return {"book_id": book_id, "aborted": True} + controller = DownloadController(book_idx, scrape_result) - except Exception as exc: - log(f"[CTRL] ERROR while dispatching pipelines: {exc}") + try: + group_result = controller.start() + gid = getattr(group_result, "id", None) + log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})") + except Exception as e: + log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}") raise - # ------------------------------------------------------------ - # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS - # (Download/parse/save tasks update progress themselves) - # ------------------------------------------------------------ - log(f"[CTRL] Controller finished dispatch for book_id={book_id}") + # Update UI state to "downloading" + r.hset(base, "status", "downloading") + r.hset(base, "last_update", int(time.time())) return { - "book_id": book_id, + "book_idx": book_idx, "total": total, "started": True, - "group_id": group_result.id, + "group_id": gid, } diff --git a/bookscraper/scraper/tasks/download_tasks.py b/bookscraper/scraper/tasks/download_tasks.py index fdaaad9..9d14848 100644 --- a/bookscraper/scraper/tasks/download_tasks.py +++ b/bookscraper/scraper/tasks/download_tasks.py @@ -1,12 +1,15 @@ # ============================================================ # File: scraper/tasks/download_tasks.py +# Purpose: +# Download chapter HTML into payload["html"]. +# Updated for book_idx unified ID model. 
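+#     Failed requests retry with exponential backoff
+#     (BASE_DELAY * BACKOFF ** attempt); an HTTP 429 response waits
+#     DELAY_429 seconds and raises the shared global delay before retrying.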
# ============================================================ from celery_app import celery_app from scraper.utils.utils import get_save_path from scraper.abort import abort_requested, chapter_started, mark_chapter_started -# Repository façade — correct imports only +# Unified repository façade from db.repository import ( set_status, inc_download_done, @@ -30,9 +33,9 @@ print(">>> [IMPORT] download_tasks.py loaded") # ----------------------------------------------------------- # TIMESTAMPED LOG WRAPPER # ----------------------------------------------------------- -def log_msg(book_id: str, message: str): +def log_msg(book_idx: str, message: str): ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - full = f"{ts} [{book_id}] {message}" + full = f"{ts} [{book_idx}] {message}" log(full) push_ui(full) @@ -85,19 +88,27 @@ def release_global_slot(): # ============================================================ -# CELERY TASK — Unified payload v3 +# CELERY TASK — Payload v3 (book_idx model) # ============================================================ @celery_app.task(bind=True, queue="download", ignore_result=False) @logcall def download_chapter(self, payload: dict): """ - Payload: + Payload format: + { - "book_id": str, - "chapter": { "num", "url", "title", "volume_path" }, + "book_idx": str, + "chapter": { + "num": int, + "title": str, + "url": str, + "volume_path": str + }, "book_meta": dict, + + # fields filled during pipeline: "html": None | str, - "parsed": None | dict, + "parsed": None | str, "skipped": bool, "path": None | str } @@ -106,7 +117,7 @@ def download_chapter(self, payload: dict): if not payload: raise ValueError("download_chapter received empty payload") - book_id = payload["book_id"] + book_idx = payload["book_idx"] chapter = payload["chapter"] book_meta = payload.get("book_meta") or {} @@ -115,44 +126,55 @@ def download_chapter(self, payload: dict): chapter_title = chapter.get("title") or f"Chapter {chapter_num}" volume_path = chapter["volume_path"] - # STATUS UPDATE - set_status(book_id, "downloading") + # ----------------------------------------------------------- + # STATUS UPDATE (book is now in 'downloading') + # ----------------------------------------------------------- + set_status(book_idx, "downloading") - # ABORT CHECK - if abort_requested(book_id) and not chapter_started(book_id, chapter_num): - log_msg(book_id, f"[ABORT] Skip chapter {chapter_num}") + # ----------------------------------------------------------- + # ABORT CHECK (skip if not yet started) + # ----------------------------------------------------------- + if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num): + log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}") - inc_download_skipped(book_id) + inc_download_skipped(book_idx) payload["html"] = None payload["skipped"] = True payload["path"] = None return payload - mark_chapter_started(book_id, chapter_num) + mark_chapter_started(book_idx, chapter_num) - # SKIP IF FILE ALREADY SAVED + # ----------------------------------------------------------- + # SKIP IF FILE ALREADY EXISTS + # ----------------------------------------------------------- save_path = get_save_path(chapter_num, volume_path) + if os.path.exists(save_path): - log_msg(book_id, f"[DL] SKIP {chapter_num} → {save_path}") + log_msg(book_idx, f"[DL] SKIP {chapter_num} → {save_path}") - inc_download_skipped(book_id) + inc_download_skipped(book_idx) payload["html"] = None payload["skipped"] = True payload["path"] = save_path return payload - # GLOBAL DELAY + SLOT + 
# ----------------------------------------------------------- + # GLOBAL DELAY + CONCURRENCY + # ----------------------------------------------------------- if GLOBAL_DELAY > 0: time.sleep(GLOBAL_DELAY) wait_for_global_delay() acquire_global_slot(MAX_CONCURRENCY) + # ----------------------------------------------------------- # HTTP DOWNLOAD + # ----------------------------------------------------------- try: - log_msg(book_id, f"[DL] Downloading {chapter_num} ({chapter_title})") + log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})") resp = requests.get( chapter_url, @@ -164,11 +186,10 @@ def download_chapter(self, payload: dict): resp.encoding = resp.apparent_encoding or "gb2312" html = resp.text - log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes") + log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes") - inc_download_done(book_id) + inc_download_done(book_idx) - # --- attach results --- payload["html"] = html payload["skipped"] = False payload["path"] = save_path @@ -178,13 +199,15 @@ def download_chapter(self, payload: dict): attempt = self.request.retries delay = BASE_DELAY * (BACKOFF**attempt) + # Handle 429 if getattr(getattr(exc, "response", None), "status_code", None) == 429: - log_msg(book_id, f"[DL] 429 → WAIT {DELAY_429}s") + log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s") time.sleep(DELAY_429) set_global_delay() raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES) - log_msg(book_id, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") + # General retry with backoff + log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES) finally: diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py index 7fabd15..754f995 100644 --- a/bookscraper/scraper/tasks/parse_tasks.py +++ b/bookscraper/scraper/tasks/parse_tasks.py @@ -2,7 +2,7 @@ # File: scraper/tasks/parse_tasks.py # Purpose: Parse downloaded HTML into clean chapter text. # Enhanced Piaotia extractor + selector fallback + clean pipeline. -# Compatible with payload pipeline v3. +# Compatible with payload pipeline v3 + book_idx refactor. 
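+#     Extraction order: strict CSS selectors, then the Piaotia sibling-walker
+#     fallback; output goes through multipass replacement cleaning and
+#     blank-line collapsing before the title/header is prepended.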
# ============================================================ from celery_app import celery_app @@ -14,11 +14,11 @@ from scraper.logger_decorators import logcall from db.repository import inc_parsed_done -print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)") +print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)") # ============================================================ -# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original) +# PIAOTIA ADVANCED CONTENT EXTRACTOR # ============================================================ def extract_piaotia_content(soup): h1 = soup.find("h1") @@ -44,39 +44,32 @@ def extract_piaotia_content(soup): if hasattr(sib, "get_text"): text = sib.get_text(strip=True) - # STOP CONDITIONS - - # + # Stop conditions if isinstance(sib, Comment) and ("翻页" in sib): break - # explicit footer blocks if name == "div": sid = sib.get("id", "") cls = sib.get("class", []) if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"): break - # copyright block if text and ("重要声明" in text or "Copyright" in text): break - # navigation blocks if text and (text.startswith(("推荐阅读", "目录", "目 录"))): break if name in ("script", "style"): continue - if name == "center": continue - # ACCUMULATE + # Accumulate if isinstance(sib, NavigableString): s = sib.strip() if s: parts.append(s) - elif hasattr(sib, "get_text"): t = sib.get_text(separator="\n").strip() if t: @@ -86,7 +79,7 @@ def extract_piaotia_content(soup): # ============================================================ -# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT) +# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx) # ============================================================ @celery_app.task(bind=True, queue="parse", ignore_result=False) @logcall @@ -95,7 +88,8 @@ def parse_chapter(self, payload: dict): if not payload: return {"skipped": True, "reason": "empty_payload"} - book_id = payload["book_id"] + # NEW MODEL + book_idx = payload["book_idx"] chapter = payload["chapter"] book_meta = payload.get("book_meta") or {} @@ -103,24 +97,26 @@ def parse_chapter(self, payload: dict): title = chapter.get("title") or f"Chapter {num}" html = payload.get("html") - # SKIPPED DOWNLOAD → SKIP PARSE + # ------------------------------------------------------------ + # DOWNLOAD SKIPPED → PARSE SKIP + # ------------------------------------------------------------ if payload.get("skipped"): - log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)") + log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)") return payload if not html: - log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP") + log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP") payload["parsed"] = None payload["skipped"] = True return payload - log_msg(book_id, f"[PARSE] Parsing chapter {num}") + log_msg(book_idx, f"[PARSE] Parsing chapter {num}") soup = BeautifulSoup(html, "lxml") - # ============================================================ + # ------------------------------------------------------------ # STRICT SELECTORS - # ============================================================ + # ------------------------------------------------------------ selectors = [ "#content", "div#content", @@ -142,7 +138,7 @@ def parse_chapter(self, payload: dict): raw = None - # --- STRICT SELECTOR FAILED → Piaotia extractor --- + # strict selectors failed → piaotia extractor if node is None: raw = extract_piaotia_content(soup) else: @@ -154,55 +150,56 @@ def parse_chapter(self, payload: dict): 
tag.decompose() raw = soup.get_text(separator="\n") - # ============================================================ - # MULTIPASS CLEANING via replacement files - # ============================================================ + # ------------------------------------------------------------ + # MULTIPASS CLEANING VIA replacement-block files + # ------------------------------------------------------------ REPL = load_all_replacements() text = raw for _ in range(5): text = clean_text(text, REPL) - # ============================================================ + # ------------------------------------------------------------ # Collapse double blank lines - # ============================================================ + # ------------------------------------------------------------ cleaned = [] prev_blank = False + for line in text.split("\n"): - stripped = line.rstrip() - if stripped == "": + s = line.rstrip() + if s == "": if prev_blank: continue prev_blank = True cleaned.append("") else: prev_blank = False - cleaned.append(stripped) + cleaned.append(s) text = "\n".join(cleaned) text = f"{title}\n{text}" - # ============================================================ - # Add header to chapter 1 - # ============================================================ + # ------------------------------------------------------------ + # Header on chapter 1 + # ------------------------------------------------------------ if num == 1: book_url = book_meta.get("book_url") or "UNKNOWN" header = ( - f"{book_meta.get('title', '')}\n" + f"{book_meta.get('title','')}\n" f"Author: {book_meta.get('author','')}\n" f"Description:\n{book_meta.get('description','')}\n" f"Book URL: {book_url}\n" + "-" * 50 + "\n\n" ) text = header + text - log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars") + log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars") - # ============================================================ - # PAYLOAD OUTPUT (v3) - # ============================================================ + # ------------------------------------------------------------ + # OUTPUT PAYLOAD + # ------------------------------------------------------------ payload["parsed"] = text payload["skipped"] = False - inc_parsed_done(book_id) + inc_parsed_done(book_idx) return payload diff --git a/bookscraper/scraper/tasks/pipeline.py b/bookscraper/scraper/tasks/pipeline.py index a42d827..f7d582d 100644 --- a/bookscraper/scraper/tasks/pipeline.py +++ b/bookscraper/scraper/tasks/pipeline.py @@ -7,6 +7,10 @@ # download_chapter(payload) # → parse_chapter(payload) # → save_chapter(payload) +# +# NOTE: +# - book_idx is the single authoritative key for all tasks +# - payload travels unchanged through the entire pipeline # ========================================================= from celery import chain @@ -19,18 +23,23 @@ from scraper.logger_decorators import logcall @logcall -def build_chapter_pipeline(book_id: str, chapter_dict: dict, book_meta: dict): +def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict): """ - Payload model passed through entire pipeline. + Create a payload object passed through the pipeline. + Consistent with the chapter_dict-based task signature. 
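+
+    Illustrative call (book_idx and values are placeholders; chapter_dict keys
+    match what DownloadController.start() builds):
+        build_chapter_pipeline("123456",
+                               {"num": 1, "url": "...", "title": "...",
+                                "volume_path": "..."},
+                               book_meta)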
""" payload = { - "book_id": book_id, + "book_idx": book_idx, "chapter": chapter_dict, "book_meta": book_meta, + # Will be filled by download_chapter "html": None, + # Will be filled by parse_chapter "parsed": None, + # Set by download or parse on skip/404/etc "skipped": False, + # Final path written by save_chapter "path": None, } diff --git a/bookscraper/scraper/tasks/save_tasks.py b/bookscraper/scraper/tasks/save_tasks.py index 774920d..a89723f 100644 --- a/bookscraper/scraper/tasks/save_tasks.py +++ b/bookscraper/scraper/tasks/save_tasks.py @@ -1,5 +1,5 @@ # ============================================================ -# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC) +# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx) # ============================================================ print(">>> [IMPORT] save_tasks.py loaded") @@ -24,7 +24,9 @@ def save_chapter(self, payload: dict): log("[SAVE] ERROR: payload is None") return {"error": True} - book_id = payload["book_id"] + # NEW unified ID + book_idx = payload["book_idx"] + chapter = payload["chapter"] parsed = payload.get("parsed") path = payload.get("path") @@ -36,20 +38,19 @@ def save_chapter(self, payload: dict): volume_name = os.path.basename(volume.rstrip("/")) # ============================================================ - # SKIPPED CASE (restore old behavior) + # SKIPPED CASE (old behavior restored) # ============================================================ if skipped or not parsed: - log_msg(book_id, f"[SAVE] SKIP chapter {num}") - inc_download_skipped(book_id) + log_msg(book_idx, f"[SAVE] SKIP chapter {num}") + inc_download_skipped(book_idx) - # Restore old behavior: - # If file already exists, STILL trigger audio. + # OLD behavior: even skipped chapters still queue audio if path and os.path.exists(path): - log_msg(book_id, f"[AUDIO] Queueing audio for SKIPPED chapter {num}") + log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}") try: - generate_audio.delay(book_id, volume_name, num, title, path) + generate_audio.delay(book_idx, volume_name, num, title, path) except Exception as exc: - log_msg(book_id, f"[AUDIO] ERROR queueing skipped audio: {exc}") + log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}") return payload @@ -63,21 +64,21 @@ def save_chapter(self, payload: dict): with open(save_path, "w", encoding="utf-8") as f: f.write(parsed) - log_msg(book_id, f"[SAVE] Saved chapter {num} → {save_path}") + log_msg(book_idx, f"[SAVE] Saved chapter {num} → {save_path}") - inc_download_done(book_id) + inc_download_done(book_idx) - # Restore old behavior → ALWAYS queue audio + # OLD behavior: ALWAYS queue audio try: - generate_audio.delay(book_id, volume_name, num, title, save_path) - log_msg(book_id, f"[AUDIO] Task queued for chapter {num}") + generate_audio.delay(book_idx, volume_name, num, title, save_path) + log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}") except Exception as exc: - log_msg(book_id, f"[AUDIO] ERROR queueing chapter {num}: {exc}") + log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}") payload["path"] = save_path payload["skipped"] = False return payload except Exception as exc: - log_msg(book_id, f"[SAVE] ERROR saving chapter {num}: {exc}") + log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}") raise diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py index 64b26ba..c462600 100644 --- a/bookscraper/scraper/tasks/scraping.py +++ b/bookscraper/scraper/tasks/scraping.py @@ -1,7 +1,9 
@@ # ============================================================ # File: scraper/tasks/scraping.py -# Purpose: Scrape metadata + chapter list and initialise -# Redis progress tracking + launch download controller +# Purpose: +# Scrape ONLY metadata + chapter list. +# Does NOT launch download controller anymore. +# Controller decides when pipelines start. # ============================================================ from celery_app import celery_app @@ -12,86 +14,88 @@ import redis from scraper.logger_decorators import logcall from scraper.sites import BookSite from scraper.book_scraper import BookScraper -from scraper.abort import clear_abort # no circular deps -from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT +from scraper.abort import clear_abort +from scraper.ui_log import reset_ui_logs + +from scraper.services.init_service import InitService print(">>> [IMPORT] scraping.py loaded") -# Redis connection (same as Celery broker) +# Redis connection (same DB as Celery broker) REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") r = redis.Redis.from_url(REDIS_URL, decode_responses=True) -@celery_app.task(bind=True, queue="scraping", ignore_result=False) +@celery_app.task( + bind=True, + queue="scraping", + ignore_result=False, + name="scraper.tasks.scraping.start_scrape_book", +) +@logcall def start_scrape_book(self, url: str): - """Scrapes metadata + chapters and prepares download tracking.""" + """ + Scrapes metadata + chapters. + DOES NOT START download / pipeline controller. + The controller_tasks.start_full_scrape() task will call this one. + """ # ------------------------------------------------------------ - # NEW: clear UI log buffer at start of new run + # CLEAR UI LOG BUFFER # ------------------------------------------------------------ reset_ui_logs() - log(f"[SCRAPING] Start scraping for: {url}") # ------------------------------------------------------------ - # Book scrape + # SCRAPE (old engine) # ------------------------------------------------------------ site = BookSite() scraper = BookScraper(site, url) - result = scraper.execute() # returns dict with metadata + chapters + result = scraper.execute() # → { title, author, chapters, cover_url, ... 
} chapters = result.get("chapters", []) full_count = len(chapters) # ------------------------------------------------------------ - # DRY RUN + # Compute unified book_idx + # ------------------------------------------------------------ + book_idx = InitService.derive_book_id(url) + result["book_idx"] = book_idx + + log(f"[SCRAPING] Assigned book_idx = {book_idx}") + + # ------------------------------------------------------------ + # DRY RUN TEST LIMIT # ------------------------------------------------------------ DRY_RUN = os.getenv("DRY_RUN", "0") == "1" TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5")) if DRY_RUN: - log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}") - chapters = chapters[:TEST_LIMIT] - result["chapters"] = chapters - - log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters") + log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}") + result["chapters"] = chapters[:TEST_LIMIT] # ------------------------------------------------------------ - # BOOK RUN ID (using title as ID) + # LOG RESULTS # ------------------------------------------------------------ - title = result.get("title") or "UnknownBook" - book_id = title # user requirement - - result["book_id"] = book_id - - log(f"[SCRAPING] Assigned book_id = '{book_id}'") + log( + f"[SCRAPING] Completed scrape: " + f"{len(result['chapters'])}/{full_count} chapters" + ) # ------------------------------------------------------------ - # RESET ABORT + INITIALISE PROGRESS + # RESET ABORT + INITIALIZE LEGACY PROGRESS # ------------------------------------------------------------ - clear_abort(book_id) + clear_abort(book_idx) - r.set(f"progress:{book_id}:total", len(chapters)) - r.set(f"progress:{book_id}:done", 0) - r.delete(f"logs:{book_id}") # clear old logs if any + r.set(f"progress:{book_idx}:total", len(result["chapters"])) + r.set(f"progress:{book_idx}:done", 0) - r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}") - r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters") + r.delete(f"logs:{book_idx}") + r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}") + r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters") # ------------------------------------------------------------ - # DISPATCH DOWNLOAD CONTROLLER + # IMPORTANT: DO NOT DISPATCH any pipelines here + # Controller will receive scrape_result and continue. # ------------------------------------------------------------ - celery_app.send_task( - "scraper.tasks.controller_tasks.launch_downloads", - args=[book_id, result], - queue="controller", - ) - - log(f"[SCRAPING] Dispatched download controller for '{book_id}'") - - return { - "book_id": book_id, - "title": result.get("title"), - "author": result.get("author"), - "chapters": len(chapters), - } + return result diff --git a/bookscraper/scraper/utils/state_sync.py b/bookscraper/scraper/utils/state_sync.py index fe8a7a3..7b5b20b 100644 --- a/bookscraper/scraper/utils/state_sync.py +++ b/bookscraper/scraper/utils/state_sync.py @@ -1,10 +1,8 @@ # ============================================================ # File: scraper/utils/state_sync.py # Purpose: -# State inspection + optional sync logic for book progress. -# This version provides: -# • inspect_books_state() → NO writes, just a dry-run -# • sync_books_from_redis() → NOT USED YET (kept commented) +# State inspection + optional sync logic for unified book_idx model. +# Generates full book-card compatible dicts for debug UI. 
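+#     inspect_books_state() performs a read-only dry-run merge (SQLite + Redis);
+#     sync_books_from_redis() writes merged counters and status back to SQLite.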
# ============================================================ import os @@ -12,17 +10,53 @@ import redis from db.db import get_db -def inspect_books_state(): +def _build_card(sqlite_row, redis_state, merged): + """ + Creates a dict that matches the fields required by components/bookcard.html: + b.book_idx + b.title + b.author + b.cover_path + b.status + b.created_at + b.download_done + b.download_total + b.audio_done + b.audio_total """ - Reads all books from SQLite and fetches Redis progress, - but performs NO writes. Only shows: - - sqlite row - - redis state - - merged result (dry-run) - Returns a list of inspection dicts. + return { + "book_idx": sqlite_row.get("book_idx"), + "title": sqlite_row.get("title") or "Unknown", + "author": sqlite_row.get("author"), + "cover_path": sqlite_row.get("cover_path"), + # Use merged status (Redis > SQLite) + "status": merged.get("status") or sqlite_row.get("status") or "unknown", + # Meta + "created_at": sqlite_row.get("created_at"), + # Download counters + "download_done": merged.get("downloaded", 0), + "download_total": merged.get("chapters_total", 0), + # Audio counters + "audio_done": merged.get("audio_done", 0), + "audio_total": merged.get("chapters_total", 0), + } + + +# ============================================================ +# INSPECT ONLY — NO WRITES +# ============================================================ +def inspect_books_state(): + """ + Reads all books from SQLite and fetches Redis progress. + Builds: + • entry.sqlite + • entry.redis + • entry.would_merge_to + • entry.card (book-card compatible) """ - r = redis.Redis.from_url(os.getenv("REDIS_BROKER")) + + r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True) db = get_db() cur = db.cursor() @@ -32,110 +66,125 @@ def inspect_books_state(): results = [] for row in rows: - book_id = row["book_id"] sqlite_row = dict(row) + book_idx = sqlite_row["book_idx"] - # Read redis state - redis_key = f"book:{book_id}:state" - progress = r.hgetall(redis_key) - - if progress: - decoded = {k.decode(): v.decode() for k, v in progress.items()} - else: - decoded = {} + redis_key = f"book:{book_idx}:state" + redis_state = r.hgetall(redis_key) or {} - # Determine dry-run merged result + # ================================ + # DRY-RUN MERGE LOGIC + # ================================ merged = sqlite_row.copy() - if decoded: + if redis_state: + merged["downloaded"] = int( - decoded.get("download_done", merged.get("downloaded", 0)) + redis_state.get("chapters_download_done", merged.get("downloaded", 0)) + ) + + merged["parsed"] = int( + redis_state.get("chapters_parsed_done", merged.get("parsed", 0)) ) - merged["parsed"] = int(decoded.get("parsed_done", merged.get("parsed", 0))) + merged["audio_done"] = int( - decoded.get("audio_done", merged.get("audio_done", 0)) + redis_state.get("audio_done", merged.get("audio_done", 0)) ) + merged["chapters_total"] = int( - decoded.get("chapters_total", merged.get("chapters_total", 0)) + redis_state.get("chapters_total", merged.get("chapters_total", 0)) + ) + + merged["status"] = redis_state.get( + "status", merged.get("status", "unknown") ) - merged["status"] = decoded.get("status", merged.get("status", "unknown")) + # ================================ + # Build book-card data + # ================================ + card = _build_card(sqlite_row, redis_state, merged) + + # ================================ + # Append final result entry + # ================================ results.append( { - "book_id": book_id, + "book_idx": book_idx, + 
"title": sqlite_row.get("title"), "sqlite": sqlite_row, - "redis": decoded, + "redis": redis_state, "would_merge_to": merged, + "card": card, } ) return results +# ============================================================ +# SYNC REDIS → SQLITE (writes) +# ============================================================ def sync_books_from_redis(): """ - Reads all books from SQLite, fetches Redis progress, - and updates SQLite rows accordingly. - - Returns a list of { - "book_id": ..., - "before": ..., - "redis": ..., - "after": ... - } + Writes Redis progress values back into SQLite. + Uses unified book_idx as identifier. """ - r = redis.Redis.from_url(os.getenv("REDIS_BROKER")) + + r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True) db = get_db() cur = db.cursor() - # Haal alle boeken op cur.execute("SELECT * FROM books") rows = cur.fetchall() results = [] for row in rows: - book_id = row["book_id"] before = dict(row) + book_idx = before["book_idx"] - redis_key = f"book:{book_id}:state" - progress = r.hgetall(redis_key) + redis_key = f"book:{book_idx}:state" + redis_state = r.hgetall(redis_key) - if not progress: + if not redis_state: results.append( - {"book_id": book_id, "before": before, "redis": {}, "after": before} + { + "book_idx": book_idx, + "before": before, + "redis": {}, + "after": before, + } ) continue - # Decode Redis bytes → string dictionary - decoded = {k.decode(): v.decode() for k, v in progress.items()} - - # Extract counters - downloaded = int(decoded.get("download_done", 0)) - parsed = int(decoded.get("parsed_done", 0)) - audio_done = int(decoded.get("audio_done", 0)) - chapters_total = int(decoded.get("chapters_total", 0)) + # Extract progress from Redis + downloaded = int(redis_state.get("chapters_download_done", 0)) + parsed = int(redis_state.get("chapters_parsed_done", 0)) + audio_done = int(redis_state.get("audio_done", 0)) + total = int(redis_state.get("chapters_total", 0)) + status = redis_state.get("status", before.get("status")) - # Redis status wins - status = decoded.get("status", before["status"]) - - # Write back to SQLite + # Update SQLite cur.execute( """ UPDATE books SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now') - WHERE book_id = ? + WHERE book_idx = ? 
""", - (downloaded, parsed, audio_done, chapters_total, status, book_id), + (downloaded, parsed, audio_done, total, status, book_idx), ) db.commit() - # Fetch updated row - cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,)) + cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,)) after = dict(cur.fetchone()) results.append( - {"book_id": book_id, "before": before, "redis": decoded, "after": after} + { + "book_idx": book_idx, + "before": before, + "redis": redis_state, + "after": after, + } ) return results diff --git a/bookscraper/static/js/dashboard.js b/bookscraper/static/js/dashboard.js index 35340c0..2945d06 100644 --- a/bookscraper/static/js/dashboard.js +++ b/bookscraper/static/js/dashboard.js @@ -2,7 +2,7 @@ File: static/js/dashboard.js Purpose: Dashboard interactions: - - Select active book + - Select active book_idx - Live logs & progress - Bookcard AJAX start/abort NOTE: @@ -26,7 +26,7 @@ async function apiGet(url) { /* --------------------------------------------------------- Dashboard state --------------------------------------------------------- */ -let ACTIVE_BOOK = null; +let ACTIVE_BOOK_IDX = null; let REFRESH_INTERVAL = null; console.log(">>> dashboard.js LOADED"); @@ -37,51 +37,51 @@ console.log(">>> dashboard.js LOADED"); document.addEventListener("DOMContentLoaded", () => { console.log(">>> dashboard.js DOMContentLoaded"); - // Fallback: fetch global logs if no active book + // Fallback: global logs when no active book_idx setInterval(() => { - if (!ACTIVE_BOOK) refreshBook(null); + if (!ACTIVE_BOOK_IDX) refreshBook(null); }, 2000); // Sidebar items const items = $$(".book-list-item"); items.forEach((item) => { item.addEventListener("click", () => { - selectBook(item.dataset.bookId); + selectBook(item.dataset.bookIdx); }); }); - // Auto-select - if (!ACTIVE_BOOK && items[0]) { - selectBook(items[0].dataset.bookId); + // Auto-select first book + if (!ACTIVE_BOOK_IDX && items[0]) { + selectBook(items[0].dataset.bookIdx); } - // Initial binding of book-card buttons + // Bind start/abort buttons inside cards bindBookCardButtons(); - // Refresh sidebar every 2 seconds + // Refresh sidebar every few seconds setInterval(refreshActiveBooks, 2800); }); /* --------------------------------------------------------- - Select a book + Select a book_idx --------------------------------------------------------- */ -function selectBook(bookId) { - ACTIVE_BOOK = bookId; - console.log(">>> Selecting book", bookId); +function selectBook(bookIdx) { + ACTIVE_BOOK_IDX = bookIdx; + console.log(">>> Selecting book_idx", bookIdx); // Highlight sidebar $$(".book-list-item").forEach((el) => { - el.classList.toggle("active", el.dataset.bookId === bookId); + el.classList.toggle("active", el.dataset.bookIdx === bookIdx); }); // Reset polling if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL); REFRESH_INTERVAL = setInterval(() => { - refreshBook(ACTIVE_BOOK); + refreshBook(ACTIVE_BOOK_IDX); }, 2000); - refreshBook(ACTIVE_BOOK); + refreshBook(ACTIVE_BOOK_IDX); } /* --------------------------------------------------------- @@ -99,7 +99,7 @@ async function refreshActiveBooks() { books.forEach((b) => { const div = document.createElement("div"); div.className = "book-list-item"; - div.dataset.bookId = b.book_id; + div.dataset.bookIdx = b.book_idx; div.innerHTML = `
${b.title}
@@ -110,27 +110,27 @@ async function refreshActiveBooks() { `; - div.addEventListener("click", () => selectBook(b.book_id)); + div.addEventListener("click", () => selectBook(b.book_idx)); container.appendChild(div); }); - if (!ACTIVE_BOOK && books.length > 0) { - selectBook(books[0].book_id); + if (!ACTIVE_BOOK_IDX && books.length > 0) { + selectBook(books[0].book_idx); } } /* --------------------------------------------------------- Fetch logs + progress --------------------------------------------------------- */ -async function refreshBook(bookId) { - if (!bookId) { +async function refreshBook(bookIdx) { + if (!bookIdx) { const data = await apiGet("/logs"); if (data) updateLogs(data); return; } - const state = await apiGet(`/api/book/${bookId}/status`); - const logs = await apiGet(`/api/book/${bookId}/logs`); + const state = await apiGet(`/api/book/${bookIdx}/status`); + const logs = await apiGet(`/api/book/${bookIdx}/logs`); if (state) { updateProgressBars(state); @@ -140,24 +140,30 @@ async function refreshBook(bookId) { } /* --------------------------------------------------------- - BOOKCARD BUTTON BINDING — idempotent + BOOKCARD BUTTON BINDING (idempotent) --------------------------------------------------------- */ function bindBookCardButtons() { console.log(">>> bindBookCardButtons() scanning…"); // START BUTTONS document.querySelectorAll(".book-card .icon-start").forEach((btn) => { - if (btn.dataset.bound === "1") return; // prevent double-binding + if (btn.dataset.bound === "1") return; btn.dataset.bound = "1"; btn.addEventListener("click", (ev) => { ev.preventDefault(); if (btn.disabled) return; - const bookId = btn.closest(".book-card").dataset.bookId; - console.log(">>> START clicked:", bookId); + const card = btn.closest(".book-card"); + const bookIdx = card?.dataset.bookIdx; + + console.log(">>> START clicked:", bookIdx); + if (!bookIdx) { + console.error(">>> ERROR: bookIdx missing on .book-card dataset"); + return; + } - startBook(bookId); + startBook(bookIdx); }); }); @@ -170,10 +176,16 @@ function bindBookCardButtons() { ev.preventDefault(); if (btn.disabled) return; - const bookId = btn.closest(".book-card").dataset.bookId; - console.log(">>> ABORT clicked:", bookId); + const card = btn.closest(".book-card"); + const bookIdx = card?.dataset.bookIdx; - abortBookAjax(bookId); + console.log(">>> ABORT clicked:", bookIdx); + if (!bookIdx) { + console.error(">>> ERROR: bookIdx missing on .book-card dataset"); + return; + } + + abortBookAjax(bookIdx); }); }); } @@ -181,13 +193,13 @@ function bindBookCardButtons() { /* --------------------------------------------------------- AJAX START --------------------------------------------------------- */ -function startBook(bookId) { - console.log(">>> startBook():", bookId); +function startBook(bookIdx) { + console.log(">>> startBook():", bookIdx); fetch("/start", { method: "POST", headers: { "Content-Type": "application/x-www-form-urlencoded" }, - body: `book_id=${bookId}`, + body: `book_idx=${bookIdx}`, // backend expects field name book_idx }) .then(async (r) => { console.log(">>> /start status:", r.status); @@ -199,7 +211,7 @@ function startBook(bookId) { console.log(">>> /start response:", data); refreshBookCards(); - refreshBook(bookId); + refreshBook(bookIdx); }) .catch((err) => console.error("Start failed:", err)); } @@ -207,12 +219,12 @@ function startBook(bookId) { /* --------------------------------------------------------- AJAX ABORT --------------------------------------------------------- */ -function 
abortBookAjax(bookId) { - if (!confirm(`Abort tasks for book ${bookId}?`)) return; +function abortBookAjax(bookIdx) { + if (!confirm(`Abort tasks for book ${bookIdx}?`)) return; - console.log(">>> abortBookAjax():", bookId); + console.log(">>> abortBookAjax():", bookIdx); - fetch(`/abort/${bookId}`, { method: "POST" }) + fetch(`/abort/${bookIdx}`, { method: "POST" }) .then(async (r) => { let data = null; try { @@ -221,7 +233,7 @@ function abortBookAjax(bookId) { console.log(">>> /abort response:", data); refreshBookCards(); - refreshBook(bookId); + refreshBook(bookIdx); }) .catch((err) => console.error("Abort failed:", err)); } @@ -234,8 +246,8 @@ async function refreshBookCards() { if (!books) return; document.querySelectorAll(".book-card").forEach((card) => { - const id = card.dataset.bookId; - const info = books.find((b) => b.book_id === id); + const idx = card.dataset.bookIdx; + const info = books.find((b) => b.book_idx === idx); if (!info) return; // Status CSS @@ -255,5 +267,5 @@ async function refreshBookCards() { ].includes(info.status); }); - bindBookCardButtons(); // rebind new DOM + bindBookCardButtons(); } diff --git a/bookscraper/static/js/log_view.js b/bookscraper/static/js/log_view.js index 2191704..70de68f 100644 --- a/bookscraper/static/js/log_view.js +++ b/bookscraper/static/js/log_view.js @@ -15,7 +15,7 @@ console.log(">>> log_view.js LOADING…"); --------------------------------------------------------- */ let LOG_FILTER = "ALL"; let LAST_LOG_INDEX = -1; // delta offset -const MAX_LOG_LINES = 600; // safe rolling window +const MAX_LOG_LINES = 600; /* --------------------------------------------------------- Apply filter on existing log lines @@ -35,38 +35,25 @@ function applyLogFilter() { document.addEventListener("DOMContentLoaded", () => { console.log(">>> log_view.js DOMContentLoaded"); - const filterSel = $("#log-filter"); const clearBtn = $("#log-clear"); const output = $("#log-output"); if (!output) { - console.log( - ">>> log_view.js: No #log-output on this page → viewer disabled" - ); + console.log(">>> log_view.js: No #log-output → viewer disabled"); return; } - console.log(">>> log_view.js: log viewer detected."); - - // Filter dropdown (currently disabled in your UI) - // if (filterSel) { - // filterSel.addEventListener("change", () => { - // LOG_FILTER = filterSel.value; - // applyLogFilter(); - // }); - // } - if (clearBtn) { clearBtn.addEventListener("click", () => { console.log(">>> log_view.js: Clear log viewer"); output.innerHTML = ""; - LAST_LOG_INDEX = -1; // reset delta polling + LAST_LOG_INDEX = -1; }); } }); /* --------------------------------------------------------- - Append ONE line (smart class assignment) + Append ONE line --------------------------------------------------------- */ function rollingAppend(lineText) { const output = $("#log-output"); @@ -86,7 +73,6 @@ function rollingAppend(lineText) { else div.classList.add("default"); div.textContent = lineText; - output.appendChild(div); // Rolling limit @@ -96,31 +82,24 @@ function rollingAppend(lineText) { } /* --------------------------------------------------------- - Primary API entry: updateLogs() - Used by dashboard.js AND delta polling + Primary entry: updateLogs() Accepts: - { logs: [...], last_index: N } + { logs:[...], last:N } OR legacy: - { lines: [...], total: N } + { lines:[...], last:N } --------------------------------------------------------- */ function updateLogs(packet) { const output = $("#log-output"); - if (!output) return; - - if (!packet) return; + if (!output || 
!packet) return; - // Normalized log arrays let lines = packet.logs || packet.lines || []; if (!Array.isArray(lines)) return; - // Append only new lines lines.forEach((line) => rollingAppend(line)); - // Update delta index - if (packet.last_index !== undefined) { - LAST_LOG_INDEX = packet.last_index; - } else if (packet.total !== undefined) { - LAST_LOG_INDEX = packet.total - 1; + // Correct unified delta index handling + if (packet.last !== undefined) { + LAST_LOG_INDEX = packet.last; } applyLogFilter(); @@ -128,18 +107,17 @@ function updateLogs(packet) { } /* --------------------------------------------------------- - Delta polling: ONLY global logs use this - Dashboard overrides logs per book. + Delta polling — global logs ONLY + (dashboard.js overrides logs per-book) --------------------------------------------------------- */ function pollLogs() { fetch(`/logs?last_index=${LAST_LOG_INDEX}`) .then((r) => r.json()) .then((data) => { const lines = data.lines || []; - if (lines.length > 0) { - lines.forEach((line) => logAppend(line)); - LAST_LOG_INDEX = data.last; // <-- DE JUISTE INDEX! + lines.forEach((line) => rollingAppend(line)); + LAST_LOG_INDEX = data.last; } }) .catch((err) => { diff --git a/bookscraper/static/js/progress.js b/bookscraper/static/js/progress.js index a462eea..bcd834d 100644 --- a/bookscraper/static/js/progress.js +++ b/bookscraper/static/js/progress.js @@ -2,7 +2,7 @@ File: static/js/progress.js Purpose: Update progress bars dynamically for the current book. - Expects data from API endpoints via dashboard.js or start.js. + Only updates the main progress box (book_detail page). ======================================================================= */ console.log(">>> progress.js LOADED"); @@ -15,19 +15,20 @@ function updateProgressBars(data) { return; } - // Data format expected: - // { - // download_done, - // download_total, - // audio_done, - // audio_total - // } - - const barDL = $(".progress-bar-fill"); - const barAU = $(".progress-bar-fill.audio-fill"); + // We always update inside the main progress box: + const container = document.querySelector("#progressSection"); + if (!container) { + console.warn(">>> progress.js: #progressSection NOT FOUND"); + return; + } - console.log(">>> progress.js barDL =", barDL); - console.log(">>> progress.js barAU =", barAU); + // Select bars ONLY inside the correct section + const barDL = container.querySelector( + ".progress-bar:not(.audio) .progress-bar-fill" + ); + const barAU = container.querySelector( + ".progress-bar.audio .progress-bar-fill" + ); const pctDL = data.download_total > 0 @@ -39,23 +40,22 @@ function updateProgressBars(data) { if (barDL) { barDL.style.width = pctDL.toFixed(1) + "%"; - console.log(">>> progress.js updated DL bar to", pctDL.toFixed(1) + "%"); + console.log(">>> progress.js DL bar =", pctDL.toFixed(1) + "%"); } else { - console.warn(">>> progress.js: barDL NOT FOUND"); + console.warn(">>> progress.js: barDL NOT FOUND INSIDE #progressSection"); } if (barAU) { barAU.style.width = pctAU.toFixed(1) + "%"; - console.log(">>> progress.js updated AU bar to", pctAU.toFixed(1) + "%"); + console.log(">>> progress.js AU bar =", pctAU.toFixed(1) + "%"); } else { - console.warn(">>> progress.js: barAU NOT FOUND"); + console.warn(">>> progress.js: barAU NOT FOUND INSIDE #progressSection"); } - // Update textual stats - const stats = $$(".progress-stats span"); - console.log(">>> progress.js stats elements found:", stats.length); + // Textual stats — only update inside progress box + const stats = 
container.querySelectorAll(".progress-stats span"); - // Expected structure: [DL "x/y", DL "pct", AU "x/y", AU "pct"] + // Expected: [DL x/y, DL %, AU x/y, AU %] if (stats.length >= 4) { stats[0].innerText = `${data.download_done} / ${data.download_total}`; stats[1].innerText = pctDL.toFixed(1) + "%"; @@ -65,7 +65,7 @@ function updateProgressBars(data) { console.log(">>> progress.js stats updated"); } else { console.warn( - ">>> progress.js: not enough stats spans, found", + ">>> progress.js: not enough stats spans in the container, found", stats.length ); } diff --git a/bookscraper/templates/components/book_list_item.html b/bookscraper/templates/components/book_list_item.html index c234718..38d17cf 100644 --- a/bookscraper/templates/components/book_list_item.html +++ b/bookscraper/templates/components/book_list_item.html @@ -3,17 +3,17 @@ Purpose: Dashboard weergave van één boek in de lijst. Variabelen komen binnen via: - - → Dus alle velden moeten via "book." aangesproken worden. + book. + → Boek gebruikt nu uitsluitend book_idx als primaire sleutel ======================================================================= --> -
+<div class="book-list-item" data-book-idx="{{ book.book_idx }}">
{{ book.title }}
- ID: {{ book.book_id }} {% if + IDX: {{ book.book_idx }} {% if book.last_update %} Updated: {{ book.last_update }} {% endif @@ -56,8 +56,10 @@ {{ pct_au }}%
+ +
-
diff --git a/bookscraper/templates/components/bookcard.html b/bookscraper/templates/components/bookcard.html index 9a88862..4815b5a 100644 --- a/bookscraper/templates/components/bookcard.html +++ b/bookscraper/templates/components/bookcard.html @@ -12,13 +12,13 @@ variable "b" in context ============================================================ #} -
+<div class="book-card" data-book-idx="{{ b.book_idx }}">
- +
@@ -57,5 +57,6 @@ {{ pct2 }}%
+
diff --git a/bookscraper/templates/dashboard/book_detail.html b/bookscraper/templates/dashboard/book_detail.html index 54001d5..75a1c99 100644 --- a/bookscraper/templates/dashboard/book_detail.html +++ b/bookscraper/templates/dashboard/book_detail.html @@ -1,7 +1,7 @@ @@ -15,7 +15,9 @@
- {% include "components/progress_box.html" %} + {% include "components/progress_box.html" with book_idx=book_idx, + title=title, download_total=download_total, download_done=download_done, + audio_total=audio_total, audio_done=audio_done %}
@@ -27,13 +29,10 @@ - - - + - {% endblock %} diff --git a/bookscraper/templates/debug/inspect_state.html b/bookscraper/templates/debug/inspect_state.html index 25a757b..752266b 100644 --- a/bookscraper/templates/debug/inspect_state.html +++ b/bookscraper/templates/debug/inspect_state.html @@ -1,26 +1,27 @@ -{% extends "layout.html" %} {% block content %} +{# ============================================================ File: +templates/debug/inspect_state.html Purpose: Inspect SQLite vs Redis state per +book_idx. Left side: full book-card UI (same component as dashboard) Right side: +SQL / Redis / merged comparison table. +============================================================ #} {% extends +"layout.html" %} {% block content %}

State Inspection (SQL vs Redis)

@@ -56,33 +60,36 @@ {{ sqlval }} {{ redisval }} {% endif %} {% endmacro %} {% for entry in results %} -
-
📘 {{ entry.book_id }}
- - {% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged = - entry.would_merge_to %} - - - - - - - - - - {% for field in [ "status", "chapters_total", "downloaded", - "chapters_download_done", "chapters_download_skipped", "parsed", - "chapters_parsed_done", "audio_done", "audio_skipped", "last_update" ] %} - - - - - - - - - - {% endfor %} -
FieldSQLiteRedisMerged Result
{{ field }}{{ sql.get(field, '') }}{{ redis.get(field, '') }}{{ merged.get(field, '') }}
+
+ +
+ {% with b = entry.card %} {% include "components/bookcard.html" %} {% + endwith %} +
+ + +
+ + + + + + + + + {% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged = + entry.would_merge_to %} {% for field in [ "status", "chapters_total", + "downloaded", "chapters_download_done", "chapters_download_skipped", + "parsed", "chapters_parsed_done", "audio_done", "audio_skipped", + "last_update" ] %} + + + + + + + {% endfor %} +
FieldSQLiteRedisMerged Result
{{ field }}{{ sql.get(field, '') }}{{ redis.get(field, '') }}{{ merged.get(field, '') }}
+
{% endfor %} {% endblock %} diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html index 57aabf9..16ef2d5 100644 --- a/bookscraper/templates/result.html +++ b/bookscraper/templates/result.html @@ -32,7 +32,6 @@ font-size: 13px; } - /* NEW: Clear button */ #clearLogBtn { margin-bottom: 10px; padding: 8px 16px; @@ -70,7 +69,7 @@ ← Terug -

Scrape Resultaat--

+

Scrape Resultaat

{% if error %}
Fout: {{ error }}
- {% endif %} {% if message %} + {% endif %} + + {% if message %}
{{ message }}
{% endif %} @@ -113,127 +114,29 @@ class="box hidden" style="background: #ffefef; border-left: 5px solid #cc0000" > - Failed chapters: + Mislukte hoofdstukken:
    Live log:
    -
    - - + // ----------------------------------------------------- + // Vraag Celery-result op, wacht tot de scraper een book_idx teruggeeft + // ----------------------------------------------------- + function pollForBookIdx() { + fetch(`/celery-result/${scrapingTask