Refactor: unified book_idx state model + controller rewrite + Redis/SQLite sync overhaul

- Removed the old scraping→controller chain (no more async .get)
- Added the new DownloadController pipeline structure
- Introduced the unified Redis Book State Model (book:{idx}:state)
- Updated all Celery tasks to use the unified book_idx IDs
- Removed the old scraper/db.py
- Updated templates and dashboard components
- Added a debug Inspect State view with book-card preview
- Updated the JS dashboard pipeline refresh
- Updated init_service + scrape_engine
- Improved abort logic
Branch: feature/bookstate-progress-fix
Author: peter.fong (1 week ago)
Parent: feb8ca60d7
Commit: 3a62dfae79
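
For orientation before the diff: a minimal sketch of reading the unified Redis Book State Model introduced by this commit, assuming the field names that appear in the diff below and a standard redis-py client. The helper name is illustrative and not part of the commit.

# ------------------------------------------------------------
# Sketch (not part of the commit): reading book:{idx}:state
# ------------------------------------------------------------
import redis

r = redis.Redis.from_url("redis://redis:6379/0", decode_responses=True)

def read_book_state(book_idx):
    """Return the unified per-book state hash as plain Python types."""
    state = r.hgetall(f"book:{book_idx}:state")
    return {
        "status": state.get("status", "unknown"),
        "chapters_total": int(state.get("chapters_total", 0) or 0),
        "chapters_download_done": int(state.get("chapters_download_done", 0) or 0),
        "chapters_download_skipped": int(state.get("chapters_download_skipped", 0) or 0),
        "audio_done": int(state.get("audio_done", 0) or 0),
    }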

@ -28,9 +28,10 @@ from db.repository import (
get_progress, get_progress,
) )
from logbus.publisher import log
from scraper.logger import log_debug from scraper.logger import log_debug
from scraper.abort import set_abort from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta from scraper.ui_log import get_ui_logs, reset_ui_logs
from scraper.state import state as r from scraper.state import state as r
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis from scraper.utils.state_sync import sync_books_from_redis
@ -42,7 +43,6 @@ init_db()
app = Flask(__name__) app = Flask(__name__)
# ===================================================== # =====================================================
# STATIC FILE SERVING # STATIC FILE SERVING
# ===================================================== # =====================================================
@ -70,25 +70,25 @@ def index():
@logcall @logcall
def dashboard(): def dashboard():
logs_list = get_ui_logs() or [] logs_list = get_ui_logs() or []
registered_books = get_registered_books()
# Filter hidden books ONLY for GUI log(f"[WEB] Registered books: {registered_books}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"] reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template( return render_template(
"dashboard/dashboard.html", "dashboard/dashboard.html",
books=list_active_books(), # Redis books=list_active_books(),
registered=reg, # SQLite (filtered) registered=reg,
logs=logs_list, logs=logs_list,
) )
@app.route("/book/<book_id>") @app.route("/book/<book_idx>")
@logcall @logcall
def book_detail(book_id): def book_detail(book_idx):
title = r.get(f"book:{book_id}:title") or book_id title = r.get(f"book:{book_idx}:title") or book_idx
return render_template( return render_template(
"dashboard/book_detail.html", "dashboard/book_detail.html",
book_id=book_id, book_id=book_idx,
title=title, title=title,
logs=get_ui_logs(), logs=get_ui_logs(),
) )
@ -102,13 +102,6 @@ def book_detail(book_id):
@app.route("/init", methods=["POST"]) @app.route("/init", methods=["POST"])
@logcall @logcall
def init_book(): def init_book():
"""
INIT-flow:
- user enters URL
- metadata fetch
- insert into SQLite as 'registered'
- return dashboard
"""
url = request.form.get("url", "").strip() url = request.form.get("url", "").strip()
if not url: if not url:
@ -146,47 +139,64 @@ def init_book():
) )
@app.route("/hide/<book_id>", methods=["POST"]) @app.route("/hide/<book_idx>", methods=["POST"])
@logcall @logcall
def hide_registered_book(book_id): def hide_registered_book(book_idx):
""" # intentionally left disabled
Soft-delete/hide voor GUI. pass
De DB blijft intact.
"""
# try:
# hide_book(book_id)
# return redirect("/dashboard")
# # return jsonify({"status": "ok", "hidden": book_id})
# except Exception as e:
# return jsonify({"status": "error", "message": str(e)}), 500
@app.route("/start", methods=["POST"]) @app.route("/start", methods=["POST"])
@logcall @logcall
def start_scraping(): def start_scraping():
""" # 1) Form field: book_idx
Start FULL scraping vanuit een geregistreerd INIT-record. book_idx = request.form.get("book_idx")
""" log(f"[WEB][START] Received start request for book_idx={book_idx}")
book_id = request.form.get("book_id") if not book_idx:
if not book_id: msg = "book_idx ontbreekt in formulier"
return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400 log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 400
book = fetch_book(book_id)
# 2) Fetch boek uit SQLite
try:
book = fetch_book(book_idx)
log(f"[WEB][START] Fetched book from DB: {book}")
except Exception as e:
log(f"[WEB][START] DB ERROR: {e}")
return jsonify({"status": "error", "message": "DB fout"}), 500
if not book: if not book:
return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404 msg = f"Boek '{book_idx}' niet gevonden in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 404
# 3) Boek moet een URL hebben
url = book.get("book_url") url = book.get("book_url")
if not url: if not url:
return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500 msg = f"Boek '{book_idx}' heeft geen book_url in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 500
# 4) Reset UI logs
reset_ui_logs() reset_ui_logs()
log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}")
async_result = celery_app.send_task( # 5) Logging
"scraper.tasks.scraping.start_scrape_book", log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
args=[url], log_debug(f"[WEB][START] DEBUG: book data = {book}")
queue="scraping",
) # 6) Celery controller taak starten
try:
async_result = celery_app.send_task(
"scraper.tasks.controller_tasks.start_full_scrape",
args=[book_idx],
queue="controller",
)
except Exception as e:
log(f"[WEB][START] Celery ERROR: {e}")
return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500
# 7) Successfully dispatched task
log(f"[WEB][START] Task dispatched: {async_result.id}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"] reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
@ -199,12 +209,12 @@ def start_scraping():
) )
@app.route("/abort/<book_id>", methods=["POST"]) @app.route("/abort/<book_idx>", methods=["POST"])
@logcall @logcall
def abort_download(book_id): def abort_download(book_idx):
log_debug(f"[WEB] Abort requested for book: {book_id}") log_debug(f"[WEB] Abort requested for book: {book_idx}")
set_abort(book_id) set_abort(book_idx)
return jsonify({"status": "ok", "aborted": book_id}) return jsonify({"status": "ok", "aborted": book_idx})
# ===================================================== # =====================================================
@ -218,23 +228,23 @@ def api_books():
return jsonify(list_active_books()) return jsonify(list_active_books())
@app.route("/api/book/<book_id>/status") @app.route("/api/book/<book_idx>/status")
@logcall @logcall
def api_book_status(book_id): def api_book_status(book_idx):
return jsonify(getStatus(book_id)) return jsonify(getStatus(book_idx))
@app.route("/api/book/<book_id>/logs") @app.route("/api/book/<book_idx>/logs")
@logcall @logcall
def api_book_logs(book_id): def api_book_logs(book_idx):
logs = r.lrange(f"logs:{book_id}", 0, -1) or [] logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
return jsonify(logs) return jsonify(logs)
@app.route("/progress/<book_id>") @app.route("/progress/<book_idx>")
@logcall @logcall
def progress(book_id): def progress(book_idx):
return jsonify(get_progress(book_id)) return jsonify(get_progress(book_idx))
@app.route("/celery-result/<task_id>") @app.route("/celery-result/<task_id>")
@ -258,16 +268,13 @@ def clear_logs():
@app.route("/logs", methods=["GET"]) @app.route("/logs", methods=["GET"])
@logcall @logcall
def logs(): def logs():
# LAST_LOG_INDEX vanuit de client (default = -1 bij eerste call)
try: try:
last_index = int(request.args.get("last_index", -1)) last_index = int(request.args.get("last_index", -1))
except: except:
last_index = -1 last_index = -1
# Haal volledige huidige loglijst op
all_logs = get_ui_logs() or [] all_logs = get_ui_logs() or []
# Delta: alle regels met index > last_index
new_lines = [] new_lines = []
new_last = last_index new_last = last_index
@ -282,6 +289,8 @@ def logs():
# ===================================================== # =====================================================
# SECTION 4 — DEBUG ROUTES # SECTION 4 — DEBUG ROUTES
# ===================================================== # =====================================================
@app.route("/debug/sync_state", methods=["GET"]) @app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state(): def debug_sync_state():
results = sync_books_from_redis() results = sync_books_from_redis()
@ -293,13 +302,6 @@ from scraper.utils.state_sync import inspect_books_state
@app.route("/debug/inspect_state", methods=["GET"]) @app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state(): def debug_inspect_state():
"""
Shows:
- raw SQLite values,
- raw Redis values,
- what the merged result WOULD be.
No writes happen.
"""
results = inspect_books_state() results = inspect_books_state()
return render_template("debug/inspect_state.html", results=results) return render_template("debug/inspect_state.html", results=results)
@ -339,10 +341,10 @@ def api_db_books():
# ============================================= # =============================================
# DEBUG QUEUE VIEW (HTML) # DEBUG QUEUE VIEW (HTML)
# ============================================= # =============================================
from flask import render_template from flask import render_template
from urllib.parse import urlparse from urllib.parse import urlparse
import redis import redis
import os
from celery_app import celery_app from celery_app import celery_app
@ -354,11 +356,10 @@ def debug_queues():
workers_scheduled = insp.scheduled() or {} workers_scheduled = insp.scheduled() or {}
workers_reserved = insp.reserved() or {} workers_reserved = insp.reserved() or {}
# ---- Redis connection ----
redis_url = os.getenv("REDIS_BROKER") redis_url = os.getenv("REDIS_BROKER")
parsed = urlparse(redis_url) parsed = urlparse(redis_url)
r = redis.Redis( r2 = redis.Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=int(parsed.path.strip("/") or 0), db=int(parsed.path.strip("/") or 0),
@ -375,8 +376,8 @@ def debug_queues():
{ {
"name": q, "name": q,
"redis_key": key, "redis_key": key,
"length": r.llen(key), "length": r2.llen(key),
"items": r.lrange(key, 0, 30), # first 30 entries "items": r2.lrange(key, 0, 30),
} }
) )
except Exception as e: except Exception as e:
@ -404,17 +405,17 @@ def debug_queues():
@logcall @logcall
def getStatus(book_id): def getStatus(book_idx):
state = r.hgetall(f"book:{book_id}:state") state = r.hgetall(f"book:{book_idx}:state")
status = state.get("status") or "unknown" status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0)) dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0)) dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0)) dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0) au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id title = state.get("title") or book_idx
return { return {
"book_id": book_id, "book_id": book_idx,
"title": title, "title": title,
"status": status, "status": status,
"download_done": dl_done, "download_done": dl_done,
@ -431,8 +432,8 @@ def list_active_books():
for key in r.scan_iter(match="book:*:state", count=1000): for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":") first = key.find(":")
second = key.find(":", first + 1) second = key.find(":", first + 1)
book_id = key[first + 1 : second] book_idx = key[first + 1 : second]
books.append(getStatus(book_id)) books.append(getStatus(book_idx))
return books return books
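
The index slicing in list_active_books() above relies on the fixed book:{idx}:state key layout; the same extraction can be written with split(), shown here only as an illustration (this helper does not exist in the codebase):

# Sketch (not part of the commit)
def extract_book_idx(key: str) -> str:
    # "book:42:state" -> "42"
    _, book_idx, _ = key.split(":", 2)
    return book_idx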

@ -1,12 +1,11 @@
# ============================================================ # ============================================================
# File: db/db.py # File: db/db.py (UPDATED for book_idx-only architecture)
# Purpose: # Purpose:
# Raw SQLite engine for BookScraper. # Raw SQLite engine for BookScraper.
# Provides ONLY low-level DB primitives. # - Connection management
# - Connection management (existing DELETE journal mode)
# - init_db() schema creation + safe schema upgrade # - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write # - upsert_book() atomic write (now uses book_idx)
# - raw fetch helpers (private) # - raw fetch helpers
# ============================================================ # ============================================================
import os import os
@ -52,12 +51,12 @@ def init_db():
conn = get_db() conn = get_db()
# -------------------------------------------------------- # --------------------------------------------------------
# BASE SCHEMA (unchanged) # BASE SCHEMA — book_idx is now PRIMARY KEY
# -------------------------------------------------------- # --------------------------------------------------------
conn.execute( conn.execute(
""" """
CREATE TABLE IF NOT EXISTS books ( CREATE TABLE IF NOT EXISTS books (
book_id TEXT PRIMARY KEY, book_idx INTEGER PRIMARY KEY,
title TEXT, title TEXT,
author TEXT, author TEXT,
description TEXT, description TEXT,
@ -81,7 +80,7 @@ def init_db():
conn.commit() conn.commit()
# -------------------------------------------------------- # --------------------------------------------------------
# SCHEMA UPGRADE UTIL # SCHEMA UPGRADE UTILITY
# -------------------------------------------------------- # --------------------------------------------------------
def add_column(name, type_): def add_column(name, type_):
try: try:
@ -92,16 +91,16 @@ def init_db():
cols = conn.execute("PRAGMA table_info(books);").fetchall() cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols] colnames = [c[1] for c in cols]
# Existing: ensure new metadata fields exist # --------------------------------------------------------
# UPGRADE NEW FIELDS — future-proof, matched with Redis state model
# --------------------------------------------------------
# (book_idx already exists as PRIMARY KEY — no need to add)
add_column("description", "TEXT") add_column("description", "TEXT")
add_column("cover_path", "TEXT") add_column("cover_path", "TEXT")
add_column("book_url", "TEXT") add_column("book_url", "TEXT")
# --------------------------------------------------------
# NEW FIELDS — MATCH REDIS STATE MODEL (future-proof)
# These do NOT change logic, but enable repository snapshot sync.
# --------------------------------------------------------
# Download counters # Download counters
add_column("chapters_download_done", "INTEGER DEFAULT 0") add_column("chapters_download_done", "INTEGER DEFAULT 0")
add_column("chapters_download_skipped", "INTEGER DEFAULT 0") add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
@ -116,13 +115,18 @@ def init_db():
# ------------------------------------------------------------ # ------------------------------------------------------------
# WRITE OPERATIONS # WRITE OPERATIONS (book_idx-based UPSERT)
# ------------------------------------------------------------ # ------------------------------------------------------------
def upsert_book(book_id, **fields): def upsert_book(book_idx, **fields):
"""
UPSERT by book_idx.
Replaces old upsert that used book_id.
"""
conn = get_db() conn = get_db()
keys = ["book_id"] + list(fields.keys()) keys = ["book_idx"] + list(fields.keys())
values = [book_id] + list(fields.values()) values = [book_idx] + list(fields.values())
placeholders = ",".join(["?"] * len(values)) placeholders = ",".join(["?"] * len(values))
updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()]) updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()])
@ -130,7 +134,7 @@ def upsert_book(book_id, **fields):
sql = f""" sql = f"""
INSERT INTO books ({','.join(keys)}) INSERT INTO books ({','.join(keys)})
VALUES ({placeholders}) VALUES ({placeholders})
ON CONFLICT(book_id) ON CONFLICT(book_idx)
DO UPDATE SET {updates}, DO UPDATE SET {updates},
last_update = CURRENT_TIMESTAMP; last_update = CURRENT_TIMESTAMP;
""" """
@ -140,11 +144,13 @@ def upsert_book(book_id, **fields):
# ------------------------------------------------------------ # ------------------------------------------------------------
# RAW READ OPERATIONS (PRIVATE) # RAW READ OPERATIONS
# ------------------------------------------------------------ # ------------------------------------------------------------
def _raw_get_book(book_id): def _raw_get_book(book_idx):
conn = get_db() conn = get_db()
row = conn.execute("SELECT * FROM books WHERE book_id = ?;", (book_id,)).fetchone() row = conn.execute(
"SELECT * FROM books WHERE book_idx = ?;", (book_idx,)
).fetchone()
return dict(row) if row else None return dict(row) if row else None
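
For a concrete call, the UPSERT statement assembled by upsert_book() above expands roughly as follows (illustration only, values hypothetical):

# upsert_book(42, title="Example", status="registered") builds roughly:
#
#   INSERT INTO books (book_idx, title, status)
#   VALUES (?, ?, ?)
#   ON CONFLICT(book_idx)
#   DO UPDATE SET title = excluded.title, status = excluded.status,
#       last_update = CURRENT_TIMESTAMP;
#
# executed with the parameters (42, "Example", "registered").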

@ -8,6 +8,9 @@
# - Route counters → Redis (live) + SQLite (snapshot) # - Route counters → Redis (live) + SQLite (snapshot)
# - Provide a clean API for tasks and Flask UI # - Provide a clean API for tasks and Flask UI
# ============================================================ # ============================================================
# ============================================================
# File: db/repository.py (UPDATED for book_idx-only architecture)
# ============================================================
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
from logbus.publisher import log from logbus.publisher import log
@ -53,43 +56,44 @@ _r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================ # ============================================================
# INTERNAL — legacy progress helpers # INTERNAL — LEGACY PROGRESS HELPERS (kept for UI)
# Keys remain: progress:{book_idx}:*
# ============================================================ # ============================================================
def _legacy_set_total(book_id, total): def _legacy_set_total(book_idx, total):
_r.set(f"progress:{book_id}:total", total) _r.set(f"progress:{book_idx}:total", total)
def _legacy_inc_completed(book_id): def _legacy_inc_completed(book_idx):
_r.incr(f"progress:{book_id}:completed") _r.incr(f"progress:{book_idx}:completed")
def _legacy_inc_skipped(book_id): def _legacy_inc_skipped(book_idx):
_r.incr(f"progress:{book_id}:skipped") _r.incr(f"progress:{book_idx}:skipped")
def _legacy_inc_failed(book_id): def _legacy_inc_failed(book_idx):
_r.incr(f"progress:{book_id}:failed") _r.incr(f"progress:{book_idx}:failed")
def _legacy_add_failed_chapter(book_id, chapter, reason): def _legacy_add_failed_chapter(book_idx, chapter, reason):
entry = f"Chapter {chapter}: {reason}" entry = f"Chapter {chapter}: {reason}"
_r.rpush(f"progress:{book_id}:failed_list", entry) _r.rpush(f"progress:{book_idx}:failed_list", entry)
def _legacy_get_failed_list(book_id): def _legacy_get_failed_list(book_idx):
return _r.lrange(f"progress:{book_id}:failed_list", 0, -1) return _r.lrange(f"progress:{book_idx}:failed_list", 0, -1)
def _legacy_get_progress(book_id): def _legacy_get_progress(book_idx):
total = int(_r.get(f"progress:{book_id}:total") or 0) total = int(_r.get(f"progress:{book_idx}:total") or 0)
completed = int(_r.get(f"progress:{book_id}:completed") or 0) completed = int(_r.get(f"progress:{book_idx}:completed") or 0)
skipped = int(_r.get(f"progress:{book_id}:skipped") or 0) skipped = int(_r.get(f"progress:{book_idx}:skipped") or 0)
failed = int(_r.get(f"progress:{book_id}:failed") or 0) failed = int(_r.get(f"progress:{book_idx}:failed") or 0)
abort = _r.exists(f"abort:{book_id}") == 1 abort = _r.exists(f"abort:{book_idx}") == 1
failed_list = _legacy_get_failed_list(book_id) failed_list = _legacy_get_failed_list(book_idx)
return { return {
"book_id": book_id, "book_idx": book_idx,
"total": total, "total": total,
"completed": completed, "completed": completed,
"skipped": skipped, "skipped": skipped,
@ -100,29 +104,29 @@ def _legacy_get_progress(book_id):
# ============================================================ # ============================================================
# PUBLIC — UI-ready legacy progress access # PUBLIC — PROGRESS API
# ============================================================ # ============================================================
@logcall @logcall
def get_progress(book_id): def get_progress(book_idx):
return _legacy_get_progress(book_id) return _legacy_get_progress(book_idx)
@logcall @logcall
def add_failed_chapter(book_id, chapter, reason): def add_failed_chapter(book_idx, chapter, reason):
_legacy_add_failed_chapter(book_id, chapter, reason) _legacy_add_failed_chapter(book_idx, chapter, reason)
@logcall @logcall
def get_failed_list(book_id): def get_failed_list(book_idx):
return _legacy_get_failed_list(book_id) return _legacy_get_failed_list(book_idx)
# ============================================================ # ============================================================
# FETCH OPERATIONS (SQLite snapshot) # FETCH OPERATIONS (SQLite snapshot)
# ============================================================ # ============================================================
@logcall @logcall
def fetch_book(book_id): def fetch_book(book_idx):
return sql_fetch_book(book_id) return sql_fetch_book(book_idx)
@logcall @logcall
@ -135,7 +139,7 @@ def fetch_all_books():
# ============================================================ # ============================================================
@logcall @logcall
def register_book( def register_book(
book_id, book_idx,
title, title,
author=None, author=None,
description=None, description=None,
@ -145,6 +149,7 @@ def register_book(
): ):
fields = { fields = {
"book_idx": book_idx,
"title": title, "title": title,
"author": author, "author": author,
"description": description, "description": description,
@ -154,19 +159,24 @@ def register_book(
"chapters_total": 0, "chapters_total": 0,
"status": "registered", "status": "registered",
} }
log(f"[DB] Registering new book={book_id} title='{title}'")
sql_register_book(book_id, fields) log(f"[DB] Registering new book_idx={book_idx} title='{title}'")
sql_register_book(book_idx, fields)
# ============================================================
# SCRAPE-FLOW UPDATE
# ============================================================
@logcall @logcall
def update_book_after_full_scrape( def update_book_after_full_scrape(
book_id, book_idx,
title=None, title=None,
author=None, author=None,
description=None, description=None,
cover_url=None, cover_url=None,
chapters_total=None, chapters_total=None,
): ):
fields = {} fields = {}
if title is not None: if title is not None:
@ -182,8 +192,8 @@ def update_book_after_full_scrape(
fields["status"] = "active" fields["status"] = "active"
log(f"[DB] update full scrape metadata book={book_id}") log(f"[DB] update metadata for book_idx={book_idx}")
sql_update_book(book_id, fields) sql_update_book(book_idx, fields)
# ============================================================ # ============================================================
@ -206,98 +216,82 @@ def get_active_books():
# STATUS MANAGEMENT # STATUS MANAGEMENT
# ============================================================ # ============================================================
@logcall @logcall
def set_status(book_id, status): def set_status(book_idx, status):
log(f"[DB] Setting status for {book_id} to '{status}'") log(f"[DB] Setting status for {book_idx} to '{status}'")
redis_set_status(book_id, status) redis_set_status(book_idx, status)
sql_set_status(book_id, status) sql_set_status(book_idx, status)
# ============================================================ # ============================================================
# CHAPTER TOTALS # CHAPTER TOTALS
# ============================================================ # ============================================================
@logcall @logcall
def set_chapters_total(book_id, total): def set_chapters_total(book_idx, total):
log(f"[DB] Setting chapter total for {book_id} to {total}") log(f"[DB] Setting chapter total for {book_idx} to {total}")
redis_set_chapters_total(book_id, total) redis_set_chapters_total(book_idx, total)
sql_set_chapters_total(book_id, total) sql_set_chapters_total(book_idx, total)
_legacy_set_total(book_id, total) # integrate legacy progress _legacy_set_total(book_idx, total)
# ============================================================ # ============================================================
# COUNTERS — DOWNLOAD # COUNTERS — DOWNLOAD
# ============================================================ # ============================================================
@logcall @logcall
def inc_download_done(book_id, amount=1): def inc_download_done(book_idx, amount=1):
log(f"[DB] Incrementing download done for {book_id} by {amount}") log(f"[DB] Incrementing download done for {book_idx} by {amount}")
redis_inc_download_done(book_id, amount) redis_inc_download_done(book_idx, amount)
sql_inc_downloaded(book_id, amount) sql_inc_downloaded(book_idx, amount)
_legacy_inc_completed(book_id) _legacy_inc_completed(book_idx)
@logcall @logcall
def inc_download_skipped(book_id, amount=1): def inc_download_skipped(book_idx, amount=1):
log(f"[DB] Incrementing download skipped for {book_id} by {amount}") log(f"[DB] Incrementing download skipped for {book_idx} by {amount}")
redis_inc_download_skipped(book_id, amount) redis_inc_download_skipped(book_idx, amount)
_legacy_inc_skipped(book_id) _legacy_inc_skipped(book_idx)
# ============================================================ # ============================================================
# COUNTERS — PARSE # COUNTERS — PARSE
# ============================================================ # ============================================================
@logcall @logcall
def inc_parsed_done(book_id, amount=1): def inc_parsed_done(book_idx, amount=1):
log(f"[DB] Incrementing parsed done for {book_id} by {amount}") log(f"[DB] Incrementing parsed done for {book_idx} by {amount}")
redis_inc_parsed_done(book_id, amount) redis_inc_parsed_done(book_idx, amount)
sql_inc_parsed(book_id, amount) sql_inc_parsed(book_idx, amount)
# ============================================================ # ============================================================
# COUNTERS — AUDIO # COUNTERS — AUDIO
# ============================================================ # ============================================================
# ============================================================
# COUNTERS — AUDIO SKIPPED
# ============================================================
@logcall @logcall
def inc_audio_skipped(book_id, amount=1): def inc_audio_skipped(book_idx, amount=1):
log(f"[DB] Incrementing audio skipped for {book_id} by {amount}") log(f"[DB] Incrementing audio skipped for {book_idx} by {amount}")
# Redis live counter (maak deze functie in state_redis wanneer nodig) sql_inc_audio_skipped(book_idx, amount)
sql_inc_audio_skipped(book_id, amount) redis_inc_audio_skipped(book_idx, amount)
redis_inc_audio_skipped(book_id, amount)
# Geen SQLite kolom? Dan overslaan.
@logcall @logcall
def inc_audio_done(book_id, amount=1): def inc_audio_done(book_idx, amount=1):
log(f"[DB] Incrementing audio done for {book_id} by {amount}") log(f"[DB] Incrementing audio done for {book_idx} by {amount}")
redis_inc_audio_done(book_id, amount) redis_inc_audio_done(book_idx, amount)
sql_inc_audio_done(book_id, amount) sql_inc_audio_done(book_idx, amount)
# ============================================================ # ============================================================
# BACKWARDS COMPATIBILITY SHIMS (old task API) # BACKWARDS COMPATIBILITY SHIMS
# These map the old API (book_id) to the new book_idx-only system
# ============================================================ # ============================================================
@logcall @logcall
def inc_downloaded(book_id, amount=1): def inc_downloaded(book_idx, amount=1):
""" return inc_download_done(book_idx, amount)
Old name used by older tasks.
Redirects to new unified counter.
"""
return inc_download_done(book_id, amount)
@logcall @logcall
def inc_parsed(book_id, amount=1): def inc_parsed(book_idx, amount=1):
""" return inc_parsed_done(book_idx, amount)
Old name used by older tasks.
"""
return inc_parsed_done(book_id, amount)
@logcall @logcall
def inc_audio_done_legacy(book_id, amount=1): def inc_audio_done_legacy(book_idx, amount=1):
""" return inc_audio_done(book_idx, amount)
Old audio name used by older tasks.
"""
return inc_audio_done(book_id, amount)
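
A download task is expected to call this façade rather than Redis or SQLite directly; a hypothetical caller, for illustration only:

# Sketch (not part of the commit)
from db.repository import inc_download_done, inc_download_skipped

def record_chapter_result(book_idx, downloaded: bool):
    # One façade call fans out to the Redis live state, the legacy
    # progress:{idx}:* keys and (for completed downloads) the SQLite
    # snapshot column.
    if downloaded:
        inc_download_done(book_idx)
    else:
        inc_download_skipped(book_idx)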

@ -1,5 +1,5 @@
# ============================================================ # ============================================================
# File: db/state_redis.py # File: db/state_redis.py (UPDATED for book_idx-only architecture)
# Purpose: # Purpose:
# Low-level Redis counters/state for BookScraper. # Low-level Redis counters/state for BookScraper.
# Used ONLY by db.repository façade. # Used ONLY by db.repository façade.
@ -15,11 +15,18 @@ REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True) r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# INTERNAL KEY BUILDER
# ------------------------------------------------------------
def _key(book_idx: str) -> str:
return f"book:{book_idx}:state"
# ------------------------------------------------------------ # ------------------------------------------------------------
# STATUS # STATUS
# ------------------------------------------------------------ # ------------------------------------------------------------
def redis_set_status(book_id: str, status: str): def redis_set_status(book_idx: str, status: str):
key = f"book:{book_id}:state" key = _key(book_idx)
r.hset(key, "status", status) r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
@ -27,8 +34,8 @@ def redis_set_status(book_id: str, status: str):
# ------------------------------------------------------------ # ------------------------------------------------------------
# TOTAL CHAPTERS # TOTAL CHAPTERS
# ------------------------------------------------------------ # ------------------------------------------------------------
def redis_set_chapters_total(book_id: str, total: int): def redis_set_chapters_total(book_idx: str, total: int):
key = f"book:{book_id}:state" key = _key(book_idx)
r.hset(key, "chapters_total", total) r.hset(key, "chapters_total", total)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
@ -36,15 +43,15 @@ def redis_set_chapters_total(book_id: str, total: int):
# ------------------------------------------------------------ # ------------------------------------------------------------
# DOWNLOAD COUNTERS # DOWNLOAD COUNTERS
# ------------------------------------------------------------ # ------------------------------------------------------------
def redis_inc_download_done(book_id: str, amount: int = 1): def redis_inc_download_done(book_idx: str, amount: int = 1):
key = f"book:{book_id}:state" key = _key(book_idx)
r.hincrby(key, "chapters_download_done", amount) r.hincrby(key, "chapters_download_done", amount)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_id: str, amount: int = 1): def redis_inc_download_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_id} by {amount}") log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}")
key = f"book:{book_id}:state" key = _key(book_idx)
r.hincrby(key, "chapters_download_skipped", amount) r.hincrby(key, "chapters_download_skipped", amount)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
@ -52,8 +59,8 @@ def redis_inc_download_skipped(book_id: str, amount: int = 1):
# ------------------------------------------------------------ # ------------------------------------------------------------
# PARSE COUNTERS # PARSE COUNTERS
# ------------------------------------------------------------ # ------------------------------------------------------------
def redis_inc_parsed_done(book_id: str, amount: int = 1): def redis_inc_parsed_done(book_idx: str, amount: int = 1):
key = f"book:{book_id}:state" key = _key(book_idx)
r.hincrby(key, "chapters_parsed_done", amount) r.hincrby(key, "chapters_parsed_done", amount)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
@ -61,19 +68,64 @@ def redis_inc_parsed_done(book_id: str, amount: int = 1):
# ------------------------------------------------------------ # ------------------------------------------------------------
# AUDIO COUNTERS # AUDIO COUNTERS
# ------------------------------------------------------------ # ------------------------------------------------------------
def redis_inc_audio_done(book_id: str, amount: int = 1): def redis_inc_audio_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_id} by {amount}") log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}")
key = f"book:{book_id}:state" key = _key(book_idx)
r.hincrby(key, "audio_done", amount) r.hincrby(key, "audio_done", amount)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_id: str, amount: int = 1): def redis_inc_audio_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_id} by {amount}") log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}")
""" """
New: Count skipped audio chapters (timeouts, pre-existing files, abort, etc.) New: Count skipped audio chapters (timeouts, pre-existing files, abort, etc.)
SQL does NOT track this; Redis-only metric. SQL does NOT track this; Redis-only metric.
""" """
key = f"book:{book_id}:state" key = _key(book_idx)
r.hincrby(key, "audio_skipped", amount) r.hincrby(key, "audio_skipped", amount)
r.hset(key, "last_update", int(time.time())) r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# INITIALISE BOOK STATE
# ------------------------------------------------------------
def init_book_state(book_id: str, title: str, url: str, chapters_total: int):
"""
Initialiseert de complete Redis state voor een nieuw boek.
LET OP:
- Als een key al bestaat NIET resetten (progress behouden).
- Alleen missende velden worden toegevoegd.
"""
key = f"book:{book_id}:state"
# Bestaat al? Dan vullen we alleen missende velden aan.
exists = r.exists(key)
pipeline = r.pipeline()
# Basis metadata
pipeline.hsetnx(key, "book_id", book_id)
pipeline.hsetnx(key, "title", title or "")
pipeline.hsetnx(key, "url", url or "")
# State
pipeline.hsetnx(key, "status", "registered")
# Counters
pipeline.hsetnx(key, "chapters_total", chapters_total)
pipeline.hsetnx(key, "chapters_download_done", 0)
pipeline.hsetnx(key, "chapters_download_skipped", 0)
pipeline.hsetnx(key, "chapters_parsed_done", 0)
pipeline.hsetnx(key, "audio_done", 0)
pipeline.hsetnx(key, "audio_skipped", 0)
# Timestamp
pipeline.hset(key, "last_update", int(time.time()))
pipeline.execute()
if exists:
log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_id}")
else:
log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_id}")

@ -1,5 +1,5 @@
# ============================================================ # ============================================================
# File: db/state_sql.py # File: db/state_sql.py (UPDATED for book_idx-only architecture)
# Purpose: # Purpose:
# Low-level SQLite snapshot layer for BookScraper metadata. # Low-level SQLite snapshot layer for BookScraper metadata.
# Used ONLY through db.repository façade. # Used ONLY through db.repository façade.
@ -10,7 +10,8 @@ import os
from logbus.publisher import log from logbus.publisher import log
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/db/books.db") # Must match db/db.py
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db")
# ------------------------------------------------------------ # ------------------------------------------------------------
@ -25,10 +26,10 @@ def _connect():
# ------------------------------------------------------------ # ------------------------------------------------------------
# FETCH # FETCH
# ------------------------------------------------------------ # ------------------------------------------------------------
def sql_fetch_book(book_id): def sql_fetch_book(book_idx):
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,)) cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
row = cur.fetchone() row = cur.fetchone()
conn.close() conn.close()
return dict(row) if row else None return dict(row) if row else None
@ -37,7 +38,7 @@ def sql_fetch_book(book_id):
def sql_fetch_all_books(): def sql_fetch_all_books():
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute("SELECT * FROM books ORDER BY rowid DESC") cur.execute("SELECT * FROM books ORDER BY created_at DESC")
rows = cur.fetchall() rows = cur.fetchall()
conn.close() conn.close()
return [dict(r) for r in rows] return [dict(r) for r in rows]
@ -46,22 +47,27 @@ def sql_fetch_all_books():
# ------------------------------------------------------------ # ------------------------------------------------------------
# REGISTER / UPDATE # REGISTER / UPDATE
# ------------------------------------------------------------ # ------------------------------------------------------------
def sql_register_book(book_id, fields: dict): def sql_register_book(book_idx, fields: dict):
"""
Insert or replace entire book record.
book_idx is the PRIMARY KEY.
"""
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cols = ", ".join(["book_id"] + list(fields.keys())) cols = ", ".join(["book_idx"] + list(fields.keys()))
placeholders = ", ".join(["?"] * (1 + len(fields))) placeholders = ", ".join(["?"] * (1 + len(fields)))
values = [book_id] + list(fields.values()) values = [book_idx] + list(fields.values())
cur.execute( cur.execute(
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", values f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})",
values,
) )
conn.commit() conn.commit()
conn.close() conn.close()
def sql_update_book(book_id, fields: dict): def sql_update_book(book_idx, fields: dict):
if not fields: if not fields:
return return
@ -69,9 +75,12 @@ def sql_update_book(book_id, fields: dict):
cur = conn.cursor() cur = conn.cursor()
set_clause = ", ".join([f"{k} = ?" for k in fields]) set_clause = ", ".join([f"{k} = ?" for k in fields])
params = list(fields.values()) + [book_id] params = list(fields.values()) + [book_idx]
cur.execute(f"UPDATE books SET {set_clause} WHERE book_id = ?", params) cur.execute(
f"UPDATE books SET {set_clause} WHERE book_idx = ?",
params,
)
conn.commit() conn.commit()
conn.close() conn.close()
@ -79,10 +88,13 @@ def sql_update_book(book_id, fields: dict):
# ------------------------------------------------------------ # ------------------------------------------------------------
# STATUS # STATUS
# ------------------------------------------------------------ # ------------------------------------------------------------
def sql_set_status(book_id, status: str): def sql_set_status(book_idx, status: str):
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute("UPDATE books SET status = ? WHERE book_id = ?", (status, book_id)) cur.execute(
"UPDATE books SET status = ? WHERE book_idx = ?",
(status, book_idx),
)
conn.commit() conn.commit()
conn.close() conn.close()
@ -90,11 +102,12 @@ def sql_set_status(book_id, status: str):
# ------------------------------------------------------------ # ------------------------------------------------------------
# CHAPTER TOTAL (snapshot) # CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------ # ------------------------------------------------------------
def sql_set_chapters_total(book_id, total: int): def sql_set_chapters_total(book_idx, total: int):
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
"UPDATE books SET chapters_total = ? WHERE book_id = ?", (total, book_id) "UPDATE books SET chapters_total = ? WHERE book_idx = ?",
(total, book_idx),
) )
conn.commit() conn.commit()
conn.close() conn.close()
@ -103,63 +116,63 @@ def sql_set_chapters_total(book_id, total: int):
# ------------------------------------------------------------ # ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY) # COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------ # ------------------------------------------------------------
def sql_inc_downloaded(book_id, amount=1): def sql_inc_downloaded(book_idx, amount=1):
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
""" """
UPDATE books UPDATE books
SET downloaded = COALESCE(downloaded,0) + ? SET downloaded = COALESCE(downloaded,0) + ?
WHERE book_id = ? WHERE book_idx = ?
""", """,
(amount, book_id), (amount, book_idx),
) )
conn.commit() conn.commit()
conn.close() conn.close()
def sql_inc_parsed(book_id, amount=1): def sql_inc_parsed(book_idx, amount=1):
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
""" """
UPDATE books UPDATE books
SET parsed = COALESCE(parsed,0) + ? SET parsed = COALESCE(parsed,0) + ?
WHERE book_id = ? WHERE book_idx = ?
""", """,
(amount, book_id), (amount, book_idx),
) )
conn.commit() conn.commit()
conn.close() conn.close()
def sql_inc_audio_done(book_id, amount=1): def sql_inc_audio_done(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio done for {book_id} by {amount}") log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}")
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
""" """
UPDATE books UPDATE books
SET audio_done = COALESCE(audio_done,0) + ? SET audio_done = COALESCE(audio_done,0) + ?
WHERE book_id = ? WHERE book_idx = ?
""", """,
(amount, book_id), (amount, book_idx),
) )
conn.commit() conn.commit()
conn.close() conn.close()
def sql_inc_audio_skipped(book_id, amount=1): def sql_inc_audio_skipped(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio skipped for {book_id} by {amount}") log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}")
conn = _connect() conn = _connect()
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
""" """
UPDATE books UPDATE books
SET audio_skipped = COALESCE(audio_skipped,0) + ? SET audio_skipped = COALESCE(audio_skipped,0) + ?
WHERE book_id = ? WHERE book_idx = ?
""", """,
(amount, book_id), (amount, book_idx),
) )
conn.commit() conn.commit()
conn.close() conn.close()
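
The four snapshot counters above share one UPDATE shape; if more counters are added, they could be funnelled through a single helper along these lines (sketch only, not part of the diff; _connect() is the module's own helper):

# Sketch (not part of the commit)
_COUNTER_COLUMNS = {"downloaded", "parsed", "audio_done", "audio_skipped"}

def _sql_inc_counter(book_idx, column, amount=1):
    # Whitelist the column name so it can be interpolated safely.
    if column not in _COUNTER_COLUMNS:
        raise ValueError(f"unknown counter column: {column}")
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        f"UPDATE books SET {column} = COALESCE({column},0) + ? WHERE book_idx = ?",
        (amount, book_idx),
    )
    conn.commit()
    conn.close()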

@ -2,8 +2,6 @@ import os
import redis import redis
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
# GUI log (non-breaking)
from scraper.ui_log import push_ui from scraper.ui_log import push_ui
# --------------------------------------------------------- # ---------------------------------------------------------
@ -15,55 +13,58 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# Debug mode (optional) # Debug mode (optional)
ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1" ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1"
# Internal flag to avoid spamming the same message # Avoid duplicate spam
_seen_debug_keys = set() _seen_debug_keys = set()
# ========================================================= # =========================================================
# ABORT FLAG # INTERNAL DEBUGGING
# ========================================================= # =========================================================
def _debug(msg: str): def _debug(msg: str):
"""Print + GUI log (non-breaking, minimal noise)."""
print(msg) print(msg)
push_ui(msg) push_ui(msg)
def set_abort(book_id: str): # =========================================================
"""Enable abort mode for this book.""" # ABORT FLAG — unified book_idx
key = f"abort:{book_id}" # =========================================================
def set_abort(book_idx: str):
"""Enable abort mode for book_idx."""
key = f"abort:{book_idx}"
r.set(key, "1") r.set(key, "1")
if ABORT_DEBUG: if ABORT_DEBUG:
_debug(f"[ABORT] SET {key}") _debug(f"[ABORT] SET {key}")
def clear_abort(book_id: str): def clear_abort(book_idx: str):
"""Clear abort flag.""" """Clear abort flag."""
key = f"abort:{book_id}" key = f"abort:{book_idx}"
r.delete(key) r.delete(key)
if ABORT_DEBUG: if ABORT_DEBUG:
_debug(f"[ABORT] CLEAR {key}") _debug(f"[ABORT] CLEAR {key}")
def abort_requested(book_id: str, redis_client=None) -> bool: def abort_requested(book_idx: str, redis_client=None) -> bool:
""" """
Return True if abort flag is set. Check whether abort flag is active for book_idx.
redis_client: redis_client:
- Docker workers None use default Redis (r) - Docker workers None use default Redis (r)
- Local macOS audio passes Redis(host=127.0.0.1) - Local macOS audio worker passes Redis(host=127.0.0.1)
""" """
client = redis_client or r client = redis_client or r
key = f"abort:{book_id}" key = f"abort:{book_idx}"
try: try:
exists = client.exists(key) exists = client.exists(key)
if ABORT_DEBUG: if ABORT_DEBUG:
# Log once per key
# Log only once per book
if key not in _seen_debug_keys: if key not in _seen_debug_keys:
try: try:
conn = client.connection_pool.connection_kwargs conn = client.connection_pool.connection_kwargs
@ -71,54 +72,54 @@ def abort_requested(book_id: str, redis_client=None) -> bool:
port = conn.get("port") port = conn.get("port")
db = conn.get("db") db = conn.get("db")
_debug( _debug(
f"[ABORT_DEBUG] first check book_id={book_id} " f"[ABORT_DEBUG] first check book_idx={book_idx} "
f"redis={host}:{port} db={db}" f"redis={host}:{port} db={db}"
) )
except Exception: except Exception:
_debug(f"[ABORT_DEBUG] first check book_id={book_id}") _debug(f"[ABORT_DEBUG] first check book_idx={book_idx}")
_seen_debug_keys.add(key) _seen_debug_keys.add(key)
# Only log abort ACTIVE # Log ACTIVE state
if exists == 1: if exists == 1:
_debug(f"[ABORT] ACTIVE for {book_id}") _debug(f"[ABORT] ACTIVE for {book_idx}")
return exists == 1 return exists == 1
except Exception as e: except Exception as e:
if ABORT_DEBUG: if ABORT_DEBUG:
_debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}") _debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}")
return False return False
# ========================================================= # =========================================================
# PER-CHAPTER STATE # PER-CHAPTER STATE — unified book_idx
# ========================================================= # =========================================================
def mark_chapter_started(book_id: str, chapter_num: int): def mark_chapter_started(book_idx: str, chapter_num: int):
key = f"started:{book_id}:{chapter_num}" key = f"started:{book_idx}:{chapter_num}"
r.set(key, "1") r.set(key, "1")
def chapter_started(book_id: str, chapter_num: int) -> bool: def chapter_started(book_idx: str, chapter_num: int) -> bool:
key = f"started:{book_id}:{chapter_num}" key = f"started:{book_idx}:{chapter_num}"
return r.exists(key) == 1 return r.exists(key) == 1
# ========================================================= # =========================================================
# UTILITY: RESET FOR A BOOK # RESET STATE FOR BOOK_IDX
# ========================================================= # =========================================================
def reset_book_state(book_id: str): def reset_book_state(book_idx: str):
""" """
Remove abort flag and all chapter-start markers. Remove abort flag and all per-chapter started markers.
""" """
key = f"abort:{book_id}" # abort flag
r.delete(key) r.delete(f"abort:{book_idx}")
pattern = f"started:{book_id}:*" # chapter markers
pattern = f"started:{book_idx}:*"
for k in r.scan_iter(pattern): for k in r.scan_iter(pattern):
r.delete(k) r.delete(k)
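
A worker loop would typically poll abort_requested() between chapters; a hypothetical caller, for illustration only (the per-chapter download callable is passed in, since its real name lives in the task modules):

# Sketch (not part of the commit)
from scraper.abort import abort_requested, clear_abort

def download_all(book_idx, chapters, download_chapter):
    for chapter in chapters:
        if abort_requested(book_idx):
            # Stop scheduling further work for this book.
            break
        download_chapter(book_idx, chapter)
    clear_abort(book_idx)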

@ -4,10 +4,9 @@
# Backwards-compatible wrapper giving the SAME public API # Backwards-compatible wrapper giving the SAME public API
# as the old BookScraper, but internally uses ScrapeEngine. # as the old BookScraper, but internally uses ScrapeEngine.
# #
# execute() → full metadata + chapterlist # execute() → full metadata + chapterlist (NO book_idx creation)
# #
# (* Chapter downloading komt later in ScrapeEngine, # ID management is now handled exclusively by InitService.
# maar deze wrapper hoeft NIET aangepast te worden.)
# ============================================================ # ============================================================
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
@ -18,21 +17,15 @@ class BookScraper:
""" """
Backwards-compatible BookScraper façade. Backwards-compatible BookScraper façade.
In het oude systeem deed BookScraper ALLES: Old responsibilities (metadata, chapters, covers, downloads)
- metadata ophalen are now split:
- cover ophalen
- hoofdstukkenlijst
- hoofdstukken downloaden
- volume folders
- skip logic
In het nieuwe systeem is dát opgesplitst: ScrapeEngine metadata + chapterlist
Download tasks handle download/parse/save
InitService determines book_idx (single source of truth)
ScrapeEngine metadata / chapterlist / download engine (in ontwikkeling) This wrapper intentionally does NOT generate a book_idx or book_id.
BookScraper behoudt dezelfde API als voorheen It only returns metadata/chapters in legacy-compatible dict format.
Daardoor kunnen Celery-tasks en oudere modules blijven werken
zonder refactor-chaos.
""" """
@logcall @logcall
@ -43,18 +36,14 @@ class BookScraper:
@logcall @logcall
def execute(self): def execute(self):
""" """
Public legacy API. Legacy public API:
Retourneert metadata + chapters EXACT zoals de oude BookScraper Return metadata + chapter list EXACTLY as before,
vóór downloadfase. but without generating any book_id.
Dit is belangrijk:
- INIT-flow gebruikt metadata only
- scraping tasks gebruiken chapterlist
""" """
data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url) data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)
# Legacy output structuur volledig repliceren: # Legacy structure preserved, unchanged:
return { return {
"title": data.get("title"), "title": data.get("title"),
"author": data.get("author"), "author": data.get("author"),
@ -62,5 +51,5 @@ class BookScraper:
"cover_url": data.get("cover_url"), "cover_url": data.get("cover_url"),
"chapters": data.get("chapters", []), "chapters": data.get("chapters", []),
"chapters_total": data.get("chapters_total", 0), "chapters_total": data.get("chapters_total", 0),
"book_url": data.get("book_url"), "book_url": data.get("book_url"), # used later by parse/save tasks
} }
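
Typical use of the façade keeps its legacy call shape; a sketch only, with the module path and constructor argument names assumed from self.site / self.url in the class body:

# Sketch (not part of the commit); module path and arguments assumed
from scraper.book_scraper import BookScraper

scraper = BookScraper(site="piaotia", url="https://example.org/book/123")
data = scraper.execute()
# data contains: title, author, cover_url, chapters, chapters_total, book_url
# — the legacy dict shape shown above, with no book_id/book_idx generated here.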

@ -1,55 +1,54 @@
# ========================================================= # =========================================================
# File: scraper/download_controller.py # File: scraper/download_controller.py
# Purpose: # Purpose:
# Build Celery pipelines for all chapters # Build Celery pipelines for all chapters using book_idx
# and pass book_id for abort/progress/log functionality. # Handles:
# + Download and replicate cover image to all volume folders # • volume assignment
# + Generate scripts (allinone.txt, makebook, say) # • cover download + replication
# + Initialize Redis Book State Model (status + counters) # • script generation
# • Redis Book State Model init
# • abort tracking
# ========================================================= # =========================================================
from celery import group from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline from scraper.tasks.pipeline import build_chapter_pipeline
from scraper.scriptgen import generate_all_scripts
# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen
from logbus.publisher import log from logbus.publisher import log
import os import os
import requests import requests
import shutil import shutil
from scraper.abort import abort_requested # DEBUG allowed
from scraper.abort import abort_requested
from db.state_redis import init_book_state
from db.repository import set_status, set_chapters_total
class DownloadController: class DownloadController:
""" """
Coordinates all chapter pipelines (download parse save), Coordinates all chapter pipelines (download parse save).
including:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
- cover download + volume replication
- script generation (allinone.txt, makebook, say)
- Redis book state initialisation and status updates
""" """
def __init__(self, book_id: str, scrape_result: dict): def __init__(self, book_idx: str, scrape_result: dict):
self.book_id = book_id self.book_idx = str(book_idx)
self.scrape_result = scrape_result self.scrape_result = scrape_result
# Core metadata # Metadata
self.title = scrape_result.get("title", "UnknownBook") self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or [] self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url") self.cover_url = scrape_result.get("cover_url")
# Output base dir # Output folder
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
# Volume size
self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200")) self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
# Base folder for the whole book
self.book_base = os.path.join(root, self.title) self.book_base = os.path.join(root, self.title)
os.makedirs(self.book_base, exist_ok=True) os.makedirs(self.book_base, exist_ok=True)
# Meta passed to parse/save stage # Meta passed downstream
self.meta = { self.meta = {
"title": self.title, "title": self.title,
"author": scrape_result.get("author"), "author": scrape_result.get("author"),
@ -57,200 +56,120 @@ class DownloadController:
"book_url": scrape_result.get("book_url"), "book_url": scrape_result.get("book_url"),
} }
# ------------------------------------------------- log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")
# DEBUG — bevestig dat controller correct book_id ziet
# -------------------------------------------------
log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
try: # Init Redis Book State Model
abort_state = abort_requested(book_id)
log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")
# -------------------------------------------------
# NEW: Initialize Redis Book State Model
# -------------------------------------------------
try: try:
init_book_state( init_book_state(
book_id=self.book_id, book_id=self.book_idx,
title=self.title, title=self.title,
url=self.scrape_result.get("book_url"), url=self.meta["book_url"],
chapters_total=len(self.chapters), chapters_total=len(self.chapters),
) )
log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
except Exception as e: except Exception as e:
log(f"[CTRL_STATE] init_book_state FAILED: {e}") log(f"[CTRL_STATE] init_book_state FAILED: {e}")
# ---------------------------------------------------------
# Cover Download
# --------------------------------------------------------- # ---------------------------------------------------------
def download_cover(self): def download_cover(self):
"""Download one cover image into the root of the book folder."""
if not self.cover_url: if not self.cover_url:
log(f"[CTRL] No cover URL found for '{self.title}'") return log(f"[CTRL] No cover URL for '{self.title}'")
return
cover_path = os.path.join(self.book_base, "cover.jpg") cover_path = os.path.join(self.book_base, "cover.jpg")
headers = { headers = {
"User-Agent": ( "User-Agent": "Mozilla/5.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) " "Referer": self.scrape_result.get("book_url") or "",
"Gecko/20100101 Firefox/118.0"
),
"Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
} }
try: try:
log(f"[CTRL] Downloading cover: {self.cover_url}") log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers) resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status() resp.raise_for_status()
with open(cover_path, "wb") as f: with open(cover_path, "wb") as f:
f.write(resp.content) f.write(resp.content)
log(f"[CTRL] Cover saved to: {cover_path}") log(f"[CTRL] Cover saved: {cover_path}")
except Exception as e: except Exception as e:
log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})") log(f"[CTRL] Cover download failed: {e}")
# ---------------------------------------------------------
# Cover Replication to Volumes
# --------------------------------------------------------- # ---------------------------------------------------------
def replicate_cover_to_volumes(self): def replicate_cover_to_volumes(self):
"""Copy cover.jpg into each existing Volume_xxx directory."""
src = os.path.join(self.book_base, "cover.jpg") src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src): if not os.path.exists(src):
log("[CTRL] No cover.jpg found, replication skipped")
return return
try: for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"):
for entry in os.listdir(self.book_base): dst = os.path.join(self.book_base, entry, "cover.jpg")
if entry.lower().startswith("volume_"): try:
vol_dir = os.path.join(self.book_base, entry)
dst = os.path.join(vol_dir, "cover.jpg")
shutil.copyfile(src, dst) shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated into: {dst}") log(f"[CTRL] Cover replicated → {dst}")
except Exception as e:
except Exception as e: log(f"[CTRL] Cover replication failed: {e}")
log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
def store_cover_in_static(self): def store_cover_in_static(self):
"""
Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
This allows the Flask web UI to serve the cover directly.
"""
src = os.path.join(self.book_base, "cover.jpg") src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src): if not os.path.exists(src):
log("[CTRL] No cover.jpg found, cannot store in static/covers")
return return
# static/covers/<book_id>.jpg os.makedirs("static/covers", exist_ok=True)
static_dir = os.path.join("static", "covers") dst = os.path.join("static/covers", f"{self.book_idx}.jpg")
os.makedirs(static_dir, exist_ok=True)
dst = os.path.join(static_dir, f"{self.book_id}.jpg")
try: try:
shutil.copyfile(src, dst) shutil.copyfile(src, dst)
log(f"[CTRL] Cover stored for UI: {dst}") log(f"[CTRL] Cover stored for UI: {dst}")
except Exception as e: except Exception as e:
log(f"[CTRL] Failed to store cover in static: {e}") log(f"[CTRL] Failed storing cover: {e}")
# ---------------------------------------------------------
# Volume isolation
# --------------------------------------------------------- # ---------------------------------------------------------
def get_volume_path(self, chapter_num: int) -> str: def get_volume_path(self, chapter_num: int) -> str:
"""Returns the correct volume directory for a chapter."""
vol_index = (chapter_num - 1) // self.max_vol + 1 vol_index = (chapter_num - 1) // self.max_vol + 1
vol_name = f"Volume_{vol_index:03d}" vol_name = f"Volume_{vol_index:03d}"
vol_path = os.path.join(self.book_base, vol_name) vol_path = os.path.join(self.book_base, vol_name)
os.makedirs(vol_path, exist_ok=True) os.makedirs(vol_path, exist_ok=True)
return vol_path return vol_path
# ---------------------------------------------------------
# Pipeline launcher
# --------------------------------------------------------- # ---------------------------------------------------------
def start(self): def start(self):
total = len(self.chapters) total = len(self.chapters)
log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")
log( # Update Redis/SQLite state
f"[CTRL] Initialising pipeline for '{self.title}' "
f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
)
log(f"[CTRL] Output root: {self.book_base}")
# -------------------------------------
# NEW: Redis state update
# -------------------------------------
try: try:
set_status(self.book_id, "downloading") set_status(self.book_idx, "downloading")
set_chapter_total(self.book_id, total) set_chapters_total(self.book_idx, total)
log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
except Exception as e: except Exception as e:
log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}") log(f"[CTRL_STATE] Unable to set state: {e}")
# ------------------------------------- # Download cover
# 1) Download cover
# -------------------------------------
self.download_cover() self.download_cover()
# Build pipeline tasks
tasks = [] tasks = []
for ch in self.chapters: for ch in self.chapters:
num = ch["num"]
# Build chapter_dict (NEW) chapter_info = {
chapter_num = ch["num"] "num": num,
chapter_url = ch["url"] "url": ch["url"],
chapter_title = ch.get("title") "title": ch.get("title"),
"volume_path": self.get_volume_path(num),
volume_path = self.get_volume_path(chapter_num)
chapter_dict = {
"num": chapter_num,
"url": chapter_url,
"title": chapter_title,
"volume_path": volume_path,
} }
tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))
# Dispatch pipeline with chapter_dict
tasks.append(
build_chapter_pipeline(
self.book_id,
chapter_dict,
self.meta,
)
)
async_result = group(tasks).apply_async() async_result = group(tasks).apply_async()
log( # Replicate cover + place in static
f"[CTRL] Pipelines dispatched for '{self.title}' "
f"(book_id={self.book_id}, group_id={async_result.id})"
)
# Debug abort state
try:
abort_state = abort_requested(self.book_id)
log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")
# -------------------------------------------------------
self.replicate_cover_to_volumes() self.replicate_cover_to_volumes()
self.store_cover_in_static() self.store_cover_in_static()
# -------------------------------------------------------
# Generate scripts (LATE IMPORT to avoid circular)
try: try:
generate_all_scripts( scriptgen.generate_all_scripts(
self.book_base, self.book_base, self.title, self.meta["author"]
self.title,
self.meta.get("author"),
) )
log(f"[CTRL] Scripts generated for '{self.title}'") log("[CTRL] Scripts generated")
except Exception as e: except Exception as e:
log(f"[CTRL] Script generation failed: {e}") log(f"[CTRL] Script generation failed: {e}")
@ -54,6 +54,8 @@ Copyright=
章节出错= 章节出错=
点此举报= 点此举报=
举报原因= 举报原因=
求收藏=
推荐票=
www.piaotia.com= www.piaotia.com=
www.piaotian.com= www.piaotian.com=
www.= www.=
@ -4,7 +4,7 @@
# Orchestrate INIT-flow: # Orchestrate INIT-flow:
# - resolve site # - resolve site
# - fetch minimal metadata # - fetch minimal metadata
# - derive book_id # - derive book_idx
# - register in SQLite # - register in SQLite
# - store main cover # - store main cover
# ============================================================ # ============================================================
@ -21,33 +21,47 @@ from scraper.logger_decorators import logcall
class InitService: class InitService:
# ------------------------------------------------------------
# BOOK IDX DERIVATION
# ------------------------------------------------------------
@staticmethod @staticmethod
@logcall @logcall
def derive_book_id(url: str) -> str: def derive_book_id(url: str) -> str:
""" """
PTWXZ URL format ends with /{id}.html. PTWXZ URL format ends with /{id}.html.
If there is no match, fall back to a sanitized URL. If there is no match, fall back to a sanitized URL.
Returns:
book_idx (string)
""" """
m = re.search(r"/(\d+)\.html$", url) m = re.search(r"/(\d+)\.html$", url)
if m: if m:
return m.group(1) return m.group(1)
return url.replace("/", "_")
# Fallback — ensures deterministic ID for unknown formats
return url.replace("/", "_").replace(":", "_")
# ------------------------------------------------------------
# MAIN INIT FLOW
# ------------------------------------------------------------
@staticmethod @staticmethod
@logcall @logcall
def execute(url: str) -> dict: def execute(url: str) -> dict:
""" """
Main INIT-flow entry point. INIT entry point.
Returns complete metadata + registration info. Returns complete metadata + registration result.
""" """
# 1) Determine site # 1) Resolve site handler
site = SiteResolver.resolve(url) site = SiteResolver.resolve(url)
book_id = InitService.derive_book_id(url) # 2) Create unified book_idx
book_idx = InitService.derive_book_id(url)
# Some site objects historically expect .book_id — we support it but DO NOT rely on it.
site.book_id = book_idx
site.book_id = book_id # 3) Fetch initial metadata (title/author/description/cover)
# 2) Metadata only
meta = ScrapeEngine.fetch_metadata_only(site, url) meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown" title = meta.get("title") or "Unknown"
@ -55,27 +69,27 @@ class InitService:
description = meta.get("description") description = meta.get("description")
cover_url = meta.get("cover_url") cover_url = meta.get("cover_url")
# 4) Download UI cover (NEW: capture returned local path) # 4) Download & store main cover for UI
cover_path = CoverService.download_main_cover(cover_url, book_id) cover_path = CoverService.download_main_cover(cover_url, book_idx)
# 5) SQLite registration INCLUDING cover_path ← ★ FIX # 5) Register in SQLite (book_idx is the SOLE primary ID)
register_book( register_book(
book_id=book_id, book_idx=book_idx,
title=title, title=title,
author=author, author=author,
description=description, description=description,
cover_url=cover_url, cover_url=cover_url,
cover_path=cover_path, # ← ★ IMPORTANT cover_path=cover_path,
book_url=url, book_url=url,
) )
# 6) Output for UI # 6) Return metadata for UI / API
return { return {
"book_id": book_id, "book_idx": book_idx,
"title": title, "title": title,
"author": author, "author": author,
"description": description, "description": description,
"cover_url": cover_url, "cover_url": cover_url,
"cover_path": cover_path, # ← handig voor UI "cover_path": cover_path,
"status": "registered", "status": "registered",
} }
@ -1,8 +1,8 @@
# ============================================================ # ============================================================
# File: scraper/services/scrape_engine.py # File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose: # Purpose:
# Unified scraping engine for INIT-flow and Celery tasks. # Unified scraping engine for INIT-flow and Celery tasks.
# All functions are fully logged via @logcall. # ScrapeEngine does NOT determine book_idx itself.
# ============================================================ # ============================================================
import os import os
@ -23,6 +23,10 @@ class ScrapeEngine:
Central scraping engine. Central scraping engine.
Metadata + chapterlist scraping. Metadata + chapterlist scraping.
All methods logged with @logcall. All methods logged with @logcall.
IMPORTANT:
- ScrapeEngine NEVER decides book_idx.
- No dependency on InitService (prevents circular import).
""" """
# ------------------------------------------------------------ # ------------------------------------------------------------
@ -140,26 +144,23 @@ class ScrapeEngine:
return "\n".join(parts) return "\n".join(parts)
# ------------------------------------------------------------ # ------------------------------------------------------------
# COVER PARSER # COVER PARSER (NO InitService dependency)
# ------------------------------------------------------------ # ------------------------------------------------------------
@staticmethod @staticmethod
@logcall @logcall
def _parse_cover(soup, site): def _parse_cover(soup, site):
""" """
Find the cover via book_id substring matching: Extract book index from URL heuristically instead of InitService
- take book_id from site.url (prevents circular import).
- find IMG tags whose filename contains book_id
- pick the shortest filename as the best match
""" """
# Typical Chinese novel sites embed numeric ID in URL path
try: try:
parsed = urlparse(site.url) parsed = urlparse(site.url)
m = re.search(r"/(\d+)\.html$", parsed.path) digits = re.findall(r"\d+", parsed.path)
if m: book_idx = digits[-1] if digits else None
book_id = m.group(1)
else:
book_id = parsed.path.rstrip("/").split("/")[-1]
except Exception: except Exception:
return None book_idx = None
imgs = soup.find_all("img", src=True) imgs = soup.find_all("img", src=True)
candidates = [] candidates = []
@ -167,16 +168,14 @@ class ScrapeEngine:
for img in imgs: for img in imgs:
src = img["src"].strip() src = img["src"].strip()
filename = os.path.basename(src) filename = os.path.basename(src)
if book_id in filename: if book_idx and book_idx in filename:
candidates.append((filename, src)) candidates.append((filename, src))
if not candidates: if not candidates:
return None return None
candidates.sort(key=lambda t: len(t[0])) # shortest filename wins candidates.sort(key=lambda t: len(t[0])) # smallest filename
best_src = candidates[0][1] return urljoin(site.root, candidates[0][1])
return urljoin(site.root, best_src)
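The heuristic boils down to "shortest image filename containing the book index wins"; a small sketch with made-up filenames:

import os

def pick_cover(book_idx, img_srcs):
    # Mirrors _parse_cover's candidate selection (inputs invented, same rule).
    candidates = [(os.path.basename(s), s) for s in img_srcs
                  if book_idx and book_idx in os.path.basename(s)]
    if not candidates:
        return None
    candidates.sort(key=lambda t: len(t[0]))
    return candidates[0][1]

# pick_cover("8343", ["/files/8343_banner.jpg", "/files/8343s.jpg"]) -> "/files/8343s.jpg"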
# ------------------------------------------------------------ # ------------------------------------------------------------
# RESOLVE CHAPTER PAGE # RESOLVE CHAPTER PAGE
@ -233,7 +232,7 @@ class ScrapeEngine:
def fetch_metadata_only(site, url: str) -> dict: def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site) ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site) soup = ScrapeEngine._get_doc(url, site)
site.url = url # NEEDED for cover parsing site.url = url # needed for cover parsing
return { return {
"title": ScrapeEngine._parse_title(soup), "title": ScrapeEngine._parse_title(soup),
@ -1,109 +1,167 @@
# ============================================================ # ============================================================
# File: scraper/tasks/controller_tasks.py # File: scraper/tasks/controller_tasks.py
# Purpose: # Purpose:
# Start the download → parse → save pipeline for a scraped book, # FULL scrape entrypoint + launching download/parse/save pipelines.
# including progress/abort tracking via book_id. # NO result.get() anywhere. Scraping is done inline.
# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from logbus.publisher import log from logbus.publisher import log
from scraper.download_controller import DownloadController import os
import time
import redis
from urllib.parse import urlparse from urllib.parse import urlparse
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
import redis
import os
from scraper.abort import abort_requested from scraper.abort import abort_requested
from db.repository import set_chapters_total
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.site_resolver import SiteResolver
from db.repository import fetch_book, set_chapters_total
from scraper.download_controller import DownloadController
print(">>> [IMPORT] controller_tasks.py loaded") print(">>> [IMPORT] controller_tasks.py loaded")
@celery_app.task(bind=True, queue="controller", ignore_result=False) # =============================================================
# 1) PUBLIC ENTRYPOINT — CALLED FROM /start
# =============================================================
@celery_app.task(
bind=True,
queue="controller",
ignore_result=False,
name="scraper.tasks.controller_tasks.start_full_scrape",
)
@logcall @logcall
def launch_downloads(self, book_id: str, scrape_result: dict): def start_full_scrape(self, book_idx: str):
""" """
Launch the entire pipeline (download parse save), FULL SCRAPE ENTRYPOINT.
AND initialize progress counters. Scraping is done inline no Celery .get() needed.
"""
log(f"[CTRL] start_full_scrape(book_idx={book_idx})")
# Abort before doing anything
if abort_requested(book_idx):
log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
# --------------------------------------------------------
# 1) Load book metadata from SQLite
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
msg = f"[CTRL] Book '{book_idx}' not found in DB"
log(msg)
raise ValueError(msg)
url = book.get("book_url")
if not url:
msg = f"[CTRL] No book_url stored for {book_idx}"
log(msg)
raise ValueError(msg)
# --------------------------------------------------------
# 2) INLINE SCRAPE (fast, no Celery wait)
# --------------------------------------------------------
site = SiteResolver.resolve(url)
try:
scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
except Exception as e:
log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
raise
Chapter-level progress is updated INSIDE the download/parse/save tasks. # --------------------------------------------------------
This task MUST NOT call .get() on async subtasks (Celery restriction). # 3) Continue → dispatch pipelines
# --------------------------------------------------------
return launch_downloads(book_idx, scrape_result)
# =============================================================
# 2) PIPELINE DISPATCH (NOT a Celery task)
# =============================================================
@logcall
def launch_downloads(book_idx: str, scrape_result: dict):
"""
Launches the entire processing pipeline:
- initialize Redis UI state
- initialize SQLite totals
- dispatch per-chapter pipelines via DownloadController
""" """
title = scrape_result.get("title", "UnknownBook") title = scrape_result.get("title", "UnknownBook")
chapters = scrape_result.get("chapters", []) or [] chapters = scrape_result.get("chapters", []) or []
total = len(chapters) total = len(chapters)
# ------------------------------------------------------------ # ------------------------------------------------------------
# INIT BOOK STATE MODEL (required for Active Books dashboard) # INIT REDIS STATE
# ------------------------------------------------------------ # ------------------------------------------------------------
broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0") broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
parsed = urlparse(broker_url) parsed = urlparse(broker_url)
state = redis.Redis( r = redis.Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=int(parsed.path.strip("/")), db=int(parsed.path.strip("/")),
decode_responses=True, decode_responses=True,
) )
# Book metadata base = f"book:{book_idx}:state"
state.set(f"book:{book_id}:title", title)
state.set(f"book:{book_id}:status", "starting")
# Download counters r.hset(base, "title", title)
state.set(f"book:{book_id}:download:total", total) r.hset(base, "status", "starting")
state.set(f"book:{book_id}:download:done", 0) r.hset(base, "chapters_total", total)
r.hset(base, "chapters_download_done", 0)
# Audio counters (start at zero) r.hset(base, "chapters_download_skipped", 0)
state.set(f"book:{book_id}:audio:done", 0) r.hset(base, "chapters_parsed_done", 0)
r.hset(base, "audio_done", 0)
r.hset(base, "audio_skipped", 0)
r.hset(base, "last_update", int(time.time()))
# ------------------------------------------------------------ # ------------------------------------------------------------
# INIT PROGRESS # INIT SQLITE SNAPSHOT
# ------------------------------------------------------------ # ------------------------------------------------------------
set_chapters_total(book_id, total) try:
set_chapters_total(book_idx, total)
except Exception as e:
log(f"[CTRL] ERROR updating SQLite totals: {e}")
raise
log(f"[CTRL] Progress initialized for {book_id}: total={total}") log(f"[CTRL] Initialized totals for {book_idx}: {total}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# BUILD CONTROLLER # ABORT CHECK BEFORE LAUNCHING JOBS
# ------------------------------------------------------------ # ------------------------------------------------------------
ctl = DownloadController(book_id, scrape_result) if abort_requested(book_idx):
log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
r.hset(base, "status", "aborted")
return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}
# ------------------------------------------------------------ # ------------------------------------------------------------
# START PIPELINES (ASYNC) # BUILD + DISPATCH PER-CHAPTER PIPELINES
# Returns a celery group AsyncResult. We DO NOT iterate or get().
# Progress & failures are handled by the worker subtasks.
# ------------------------------------------------------------ # ------------------------------------------------------------
try: controller = DownloadController(book_idx, scrape_result)
group_result = ctl.start()
log(
f"[CTRL] Pipelines dispatched for '{title}' "
f"(book_id={book_id}, group_id={group_result.id})"
)
# Abort flag set BEFORE tasks start?
if abort_requested(book_id):
log(f"[CTRL] ABORT requested before tasks start")
return {"book_id": book_id, "aborted": True}
except Exception as exc: try:
log(f"[CTRL] ERROR while dispatching pipelines: {exc}") group_result = controller.start()
gid = getattr(group_result, "id", None)
log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
except Exception as e:
log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}")
raise raise
# ------------------------------------------------------------ # Update UI state to "downloading"
# CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS r.hset(base, "status", "downloading")
# (Download/parse/save tasks update progress themselves) r.hset(base, "last_update", int(time.time()))
# ------------------------------------------------------------
log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
return { return {
"book_id": book_id, "book_idx": book_idx,
"total": total, "total": total,
"started": True, "started": True,
"group_id": group_result.id, "group_id": gid,
} }
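A hedged sketch of how a dashboard poller could read the hash written above — the field names follow the hset calls in this hunk, and REDIS_BROKER carries the same default:

import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
                         decode_responses=True)

def read_book_state(book_idx: str) -> dict:
    # HGETALL on the unified key; missing fields fall back to safe defaults.
    state = r.hgetall(f"book:{book_idx}:state")
    return {
        "status": state.get("status", "unknown"),
        "download_done": int(state.get("chapters_download_done", 0)),
        "chapters_total": int(state.get("chapters_total", 0)),
        "audio_done": int(state.get("audio_done", 0)),
    }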
@ -1,12 +1,15 @@
# ============================================================ # ============================================================
# File: scraper/tasks/download_tasks.py # File: scraper/tasks/download_tasks.py
# Purpose:
# Download chapter HTML into payload["html"].
# Updated for book_idx unified ID model.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from scraper.utils.utils import get_save_path from scraper.utils.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started from scraper.abort import abort_requested, chapter_started, mark_chapter_started
# Repository façade — correct imports only # Unified repository façade
from db.repository import ( from db.repository import (
set_status, set_status,
inc_download_done, inc_download_done,
@ -30,9 +33,9 @@ print(">>> [IMPORT] download_tasks.py loaded")
# ----------------------------------------------------------- # -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER # TIMESTAMPED LOG WRAPPER
# ----------------------------------------------------------- # -----------------------------------------------------------
def log_msg(book_id: str, message: str): def log_msg(book_idx: str, message: str):
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
full = f"{ts} [{book_id}] {message}" full = f"{ts} [{book_idx}] {message}"
log(full) log(full)
push_ui(full) push_ui(full)
@ -85,19 +88,27 @@ def release_global_slot():
# ============================================================ # ============================================================
# CELERY TASK — Unified payload v3 # CELERY TASK — Payload v3 (book_idx model)
# ============================================================ # ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False) @celery_app.task(bind=True, queue="download", ignore_result=False)
@logcall @logcall
def download_chapter(self, payload: dict): def download_chapter(self, payload: dict):
""" """
Payload: Payload format:
{ {
"book_id": str, "book_idx": str,
"chapter": { "num", "url", "title", "volume_path" }, "chapter": {
"num": int,
"title": str,
"url": str,
"volume_path": str
},
"book_meta": dict, "book_meta": dict,
# fields filled during pipeline:
"html": None | str, "html": None | str,
"parsed": None | dict, "parsed": None | str,
"skipped": bool, "skipped": bool,
"path": None | str "path": None | str
} }
@ -106,7 +117,7 @@ def download_chapter(self, payload: dict):
if not payload: if not payload:
raise ValueError("download_chapter received empty payload") raise ValueError("download_chapter received empty payload")
book_id = payload["book_id"] book_idx = payload["book_idx"]
chapter = payload["chapter"] chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {} book_meta = payload.get("book_meta") or {}
@ -115,44 +126,55 @@ def download_chapter(self, payload: dict):
chapter_title = chapter.get("title") or f"Chapter {chapter_num}" chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
volume_path = chapter["volume_path"] volume_path = chapter["volume_path"]
# STATUS UPDATE # -----------------------------------------------------------
set_status(book_id, "downloading") # STATUS UPDATE (book is now in 'downloading')
# -----------------------------------------------------------
set_status(book_idx, "downloading")
# ABORT CHECK # -----------------------------------------------------------
if abort_requested(book_id) and not chapter_started(book_id, chapter_num): # ABORT CHECK (skip if not yet started)
log_msg(book_id, f"[ABORT] Skip chapter {chapter_num}") # -----------------------------------------------------------
if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}")
inc_download_skipped(book_id) inc_download_skipped(book_idx)
payload["html"] = None payload["html"] = None
payload["skipped"] = True payload["skipped"] = True
payload["path"] = None payload["path"] = None
return payload return payload
mark_chapter_started(book_id, chapter_num) mark_chapter_started(book_idx, chapter_num)
# SKIP IF FILE ALREADY SAVED # -----------------------------------------------------------
# SKIP IF FILE ALREADY EXISTS
# -----------------------------------------------------------
save_path = get_save_path(chapter_num, volume_path) save_path = get_save_path(chapter_num, volume_path)
if os.path.exists(save_path): if os.path.exists(save_path):
log_msg(book_id, f"[DL] SKIP {chapter_num}{save_path}") log_msg(book_idx, f"[DL] SKIP {chapter_num}{save_path}")
inc_download_skipped(book_id) inc_download_skipped(book_idx)
payload["html"] = None payload["html"] = None
payload["skipped"] = True payload["skipped"] = True
payload["path"] = save_path payload["path"] = save_path
return payload return payload
# GLOBAL DELAY + SLOT # -----------------------------------------------------------
# GLOBAL DELAY + CONCURRENCY
# -----------------------------------------------------------
if GLOBAL_DELAY > 0: if GLOBAL_DELAY > 0:
time.sleep(GLOBAL_DELAY) time.sleep(GLOBAL_DELAY)
wait_for_global_delay() wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY) acquire_global_slot(MAX_CONCURRENCY)
# -----------------------------------------------------------
# HTTP DOWNLOAD # HTTP DOWNLOAD
# -----------------------------------------------------------
try: try:
log_msg(book_id, f"[DL] Downloading {chapter_num} ({chapter_title})") log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})")
resp = requests.get( resp = requests.get(
chapter_url, chapter_url,
@ -164,11 +186,10 @@ def download_chapter(self, payload: dict):
resp.encoding = resp.apparent_encoding or "gb2312" resp.encoding = resp.apparent_encoding or "gb2312"
html = resp.text html = resp.text
log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes") log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes")
inc_download_done(book_id) inc_download_done(book_idx)
# --- attach results ---
payload["html"] = html payload["html"] = html
payload["skipped"] = False payload["skipped"] = False
payload["path"] = save_path payload["path"] = save_path
@ -178,13 +199,15 @@ def download_chapter(self, payload: dict):
attempt = self.request.retries attempt = self.request.retries
delay = BASE_DELAY * (BACKOFF**attempt) delay = BASE_DELAY * (BACKOFF**attempt)
# Handle 429
if getattr(getattr(exc, "response", None), "status_code", None) == 429: if getattr(getattr(exc, "response", None), "status_code", None) == 429:
log_msg(book_id, f"[DL] 429 → WAIT {DELAY_429}s") log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s")
time.sleep(DELAY_429) time.sleep(DELAY_429)
set_global_delay() set_global_delay()
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
log_msg(book_id, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") # General retry with backoff
log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s")
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
finally: finally:
@ -2,7 +2,7 @@
# File: scraper/tasks/parse_tasks.py # File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text. # Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline. # Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3. # Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
@ -14,11 +14,11 @@ from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done from db.repository import inc_parsed_done
print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)") print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")
# ============================================================ # ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original) # PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================ # ============================================================
def extract_piaotia_content(soup): def extract_piaotia_content(soup):
h1 = soup.find("h1") h1 = soup.find("h1")
@ -44,39 +44,32 @@ def extract_piaotia_content(soup):
if hasattr(sib, "get_text"): if hasattr(sib, "get_text"):
text = sib.get_text(strip=True) text = sib.get_text(strip=True)
# STOP CONDITIONS # Stop conditions
# <!-- 翻页 -->
if isinstance(sib, Comment) and ("翻页" in sib): if isinstance(sib, Comment) and ("翻页" in sib):
break break
# explicit footer blocks
if name == "div": if name == "div":
sid = sib.get("id", "") sid = sib.get("id", "")
cls = sib.get("class", []) cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"): if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break break
# copyright block
if text and ("重要声明" in text or "Copyright" in text): if text and ("重要声明" in text or "Copyright" in text):
break break
# navigation blocks
if text and (text.startswith(("推荐阅读", "目录", "目 录"))): if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break break
if name in ("script", "style"): if name in ("script", "style"):
continue continue
if name == "center": if name == "center":
continue continue
# ACCUMULATE # Accumulate
if isinstance(sib, NavigableString): if isinstance(sib, NavigableString):
s = sib.strip() s = sib.strip()
if s: if s:
parts.append(s) parts.append(s)
elif hasattr(sib, "get_text"): elif hasattr(sib, "get_text"):
t = sib.get_text(separator="\n").strip() t = sib.get_text(separator="\n").strip()
if t: if t:
@ -86,7 +79,7 @@ def extract_piaotia_content(soup):
# ============================================================ # ============================================================
# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT) # PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================ # ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False) @celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall @logcall
@ -95,7 +88,8 @@ def parse_chapter(self, payload: dict):
if not payload: if not payload:
return {"skipped": True, "reason": "empty_payload"} return {"skipped": True, "reason": "empty_payload"}
book_id = payload["book_id"] # NEW MODEL
book_idx = payload["book_idx"]
chapter = payload["chapter"] chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {} book_meta = payload.get("book_meta") or {}
@ -103,24 +97,26 @@ def parse_chapter(self, payload: dict):
title = chapter.get("title") or f"Chapter {num}" title = chapter.get("title") or f"Chapter {num}"
html = payload.get("html") html = payload.get("html")
# SKIPPED DOWNLOAD → SKIP PARSE # ------------------------------------------------------------
# DOWNLOAD SKIPPED → PARSE SKIP
# ------------------------------------------------------------
if payload.get("skipped"): if payload.get("skipped"):
log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)") log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
return payload return payload
if not html: if not html:
log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP") log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
payload["parsed"] = None payload["parsed"] = None
payload["skipped"] = True payload["skipped"] = True
return payload return payload
log_msg(book_id, f"[PARSE] Parsing chapter {num}") log_msg(book_idx, f"[PARSE] Parsing chapter {num}")
soup = BeautifulSoup(html, "lxml") soup = BeautifulSoup(html, "lxml")
# ============================================================ # ------------------------------------------------------------
# STRICT SELECTORS # STRICT SELECTORS
# ============================================================ # ------------------------------------------------------------
selectors = [ selectors = [
"#content", "#content",
"div#content", "div#content",
@ -142,7 +138,7 @@ def parse_chapter(self, payload: dict):
raw = None raw = None
# --- STRICT SELECTOR FAILED → Piaotia extractor --- # strict selectors failed → piaotia extractor
if node is None: if node is None:
raw = extract_piaotia_content(soup) raw = extract_piaotia_content(soup)
else: else:
@ -154,55 +150,56 @@ def parse_chapter(self, payload: dict):
tag.decompose() tag.decompose()
raw = soup.get_text(separator="\n") raw = soup.get_text(separator="\n")
# ============================================================ # ------------------------------------------------------------
# MULTIPASS CLEANING via replacement files # MULTIPASS CLEANING VIA replacement-block files
# ============================================================ # ------------------------------------------------------------
REPL = load_all_replacements() REPL = load_all_replacements()
text = raw text = raw
for _ in range(5): for _ in range(5):
text = clean_text(text, REPL) text = clean_text(text, REPL)
# ============================================================ # ------------------------------------------------------------
# Collapse double blank lines # Collapse double blank lines
# ============================================================ # ------------------------------------------------------------
cleaned = [] cleaned = []
prev_blank = False prev_blank = False
for line in text.split("\n"): for line in text.split("\n"):
stripped = line.rstrip() s = line.rstrip()
if stripped == "": if s == "":
if prev_blank: if prev_blank:
continue continue
prev_blank = True prev_blank = True
cleaned.append("") cleaned.append("")
else: else:
prev_blank = False prev_blank = False
cleaned.append(stripped) cleaned.append(s)
text = "\n".join(cleaned) text = "\n".join(cleaned)
text = f"{title}\n{text}" text = f"{title}\n{text}"
# ============================================================ # ------------------------------------------------------------
# Add header to chapter 1 # Header on chapter 1
# ============================================================ # ------------------------------------------------------------
if num == 1: if num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN" book_url = book_meta.get("book_url") or "UNKNOWN"
header = ( header = (
f"{book_meta.get('title', '')}\n" f"{book_meta.get('title','')}\n"
f"Author: {book_meta.get('author','')}\n" f"Author: {book_meta.get('author','')}\n"
f"Description:\n{book_meta.get('description','')}\n" f"Description:\n{book_meta.get('description','')}\n"
f"Book URL: {book_url}\n" + "-" * 50 + "\n\n" f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
) )
text = header + text text = header + text
log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars") log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars")
# ============================================================ # ------------------------------------------------------------
# PAYLOAD OUTPUT (v3) # OUTPUT PAYLOAD
# ============================================================ # ------------------------------------------------------------
payload["parsed"] = text payload["parsed"] = text
payload["skipped"] = False payload["skipped"] = False
inc_parsed_done(book_id) inc_parsed_done(book_idx)
return payload return payload
@ -7,6 +7,10 @@
# download_chapter(payload) # download_chapter(payload)
# → parse_chapter(payload) # → parse_chapter(payload)
# → save_chapter(payload) # → save_chapter(payload)
#
# NOTE:
# - book_idx is the single authoritative key for all tasks
# - payload travels unchanged through the entire pipeline
# ========================================================= # =========================================================
from celery import chain from celery import chain
@ -19,18 +23,23 @@ from scraper.logger_decorators import logcall
@logcall @logcall
def build_chapter_pipeline(book_id: str, chapter_dict: dict, book_meta: dict): def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
""" """
Payload model passed through entire pipeline. Create a payload object passed through the pipeline.
Consistent with the chapter_dict-based task signature.
""" """
payload = { payload = {
"book_id": book_id, "book_idx": book_idx,
"chapter": chapter_dict, "chapter": chapter_dict,
"book_meta": book_meta, "book_meta": book_meta,
# Will be filled by download_chapter
"html": None, "html": None,
# Will be filled by parse_chapter
"parsed": None, "parsed": None,
# Set by download or parse on skip/404/etc
"skipped": False, "skipped": False,
# Final path written by save_chapter
"path": None, "path": None,
} }
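The chain construction itself sits below this hunk; given the three task signatures and the single travelling payload, the wiring presumably looks something like this (a sketch, not the committed code):

from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter

def build_chapter_pipeline_sketch(payload: dict):
    # Each task returns the (mutated) payload, which Celery feeds to the next task.
    return chain(
        download_chapter.s(payload),  # fills payload["html"] / "skipped" / "path"
        parse_chapter.s(),            # fills payload["parsed"]
        save_chapter.s(),             # writes the chapter file and queues audio
    )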
@ -1,5 +1,5 @@
# ============================================================ # ============================================================
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC) # File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx)
# ============================================================ # ============================================================
print(">>> [IMPORT] save_tasks.py loaded") print(">>> [IMPORT] save_tasks.py loaded")
@ -24,7 +24,9 @@ def save_chapter(self, payload: dict):
log("[SAVE] ERROR: payload is None") log("[SAVE] ERROR: payload is None")
return {"error": True} return {"error": True}
book_id = payload["book_id"] # NEW unified ID
book_idx = payload["book_idx"]
chapter = payload["chapter"] chapter = payload["chapter"]
parsed = payload.get("parsed") parsed = payload.get("parsed")
path = payload.get("path") path = payload.get("path")
@ -36,20 +38,19 @@ def save_chapter(self, payload: dict):
volume_name = os.path.basename(volume.rstrip("/")) volume_name = os.path.basename(volume.rstrip("/"))
# ============================================================ # ============================================================
# SKIPPED CASE (restore old behavior) # SKIPPED CASE (old behavior restored)
# ============================================================ # ============================================================
if skipped or not parsed: if skipped or not parsed:
log_msg(book_id, f"[SAVE] SKIP chapter {num}") log_msg(book_idx, f"[SAVE] SKIP chapter {num}")
inc_download_skipped(book_id) inc_download_skipped(book_idx)
# Restore old behavior: # OLD behavior: even skipped chapters still queue audio
# If file already exists, STILL trigger audio.
if path and os.path.exists(path): if path and os.path.exists(path):
log_msg(book_id, f"[AUDIO] Queueing audio for SKIPPED chapter {num}") log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
try: try:
generate_audio.delay(book_id, volume_name, num, title, path) generate_audio.delay(book_idx, volume_name, num, title, path)
except Exception as exc: except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing skipped audio: {exc}") log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}")
return payload return payload
@ -63,21 +64,21 @@ def save_chapter(self, payload: dict):
with open(save_path, "w", encoding="utf-8") as f: with open(save_path, "w", encoding="utf-8") as f:
f.write(parsed) f.write(parsed)
log_msg(book_id, f"[SAVE] Saved chapter {num}{save_path}") log_msg(book_idx, f"[SAVE] Saved chapter {num}{save_path}")
inc_download_done(book_id) inc_download_done(book_idx)
# Restore old behavior → ALWAYS queue audio # OLD behavior: ALWAYS queue audio
try: try:
generate_audio.delay(book_id, volume_name, num, title, save_path) generate_audio.delay(book_idx, volume_name, num, title, save_path)
log_msg(book_id, f"[AUDIO] Task queued for chapter {num}") log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}")
except Exception as exc: except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing chapter {num}: {exc}") log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
payload["path"] = save_path payload["path"] = save_path
payload["skipped"] = False payload["skipped"] = False
return payload return payload
except Exception as exc: except Exception as exc:
log_msg(book_id, f"[SAVE] ERROR saving chapter {num}: {exc}") log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}")
raise raise
@ -1,7 +1,9 @@
# ============================================================ # ============================================================
# File: scraper/tasks/scraping.py # File: scraper/tasks/scraping.py
# Purpose: Scrape metadata + chapter list and initialise # Purpose:
# Redis progress tracking + launch download controller # Scrape ONLY metadata + chapter list.
# Does NOT launch download controller anymore.
# Controller decides when pipelines start.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
@ -12,86 +14,88 @@ import redis
from scraper.logger_decorators import logcall from scraper.logger_decorators import logcall
from scraper.sites import BookSite from scraper.sites import BookSite
from scraper.book_scraper import BookScraper from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps from scraper.abort import clear_abort
from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT from scraper.ui_log import reset_ui_logs
from scraper.services.init_service import InitService
print(">>> [IMPORT] scraping.py loaded") print(">>> [IMPORT] scraping.py loaded")
# Redis connection (same as Celery broker) # Redis connection (same DB as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True) r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task(bind=True, queue="scraping", ignore_result=False) @celery_app.task(
bind=True,
queue="scraping",
ignore_result=False,
name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str): def start_scrape_book(self, url: str):
"""Scrapes metadata + chapters and prepares download tracking.""" """
Scrapes metadata + chapters.
DOES NOT START download / pipeline controller.
The controller_tasks.start_full_scrape() task will call this one.
"""
# ------------------------------------------------------------ # ------------------------------------------------------------
# NEW: clear UI log buffer at start of new run # CLEAR UI LOG BUFFER
# ------------------------------------------------------------ # ------------------------------------------------------------
reset_ui_logs() reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}") log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# Book scrape # SCRAPE (old engine)
# ------------------------------------------------------------ # ------------------------------------------------------------
site = BookSite() site = BookSite()
scraper = BookScraper(site, url) scraper = BookScraper(site, url)
result = scraper.execute() # returns dict with metadata + chapters result = scraper.execute() # → { title, author, chapters, cover_url, ... }
chapters = result.get("chapters", []) chapters = result.get("chapters", [])
full_count = len(chapters) full_count = len(chapters)
# ------------------------------------------------------------ # ------------------------------------------------------------
# DRY RUN # Compute unified book_idx
# ------------------------------------------------------------
book_idx = InitService.derive_book_id(url)
result["book_idx"] = book_idx
log(f"[SCRAPING] Assigned book_idx = {book_idx}")
# ------------------------------------------------------------
# DRY RUN TEST LIMIT
# ------------------------------------------------------------ # ------------------------------------------------------------
DRY_RUN = os.getenv("DRY_RUN", "0") == "1" DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5")) TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
if DRY_RUN: if DRY_RUN:
log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}") log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}")
chapters = chapters[:TEST_LIMIT] result["chapters"] = chapters[:TEST_LIMIT]
result["chapters"] = chapters
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# BOOK RUN ID (using title as ID) # LOG RESULTS
# ------------------------------------------------------------ # ------------------------------------------------------------
title = result.get("title") or "UnknownBook" log(
book_id = title # user requirement f"[SCRAPING] Completed scrape: "
f"{len(result['chapters'])}/{full_count} chapters"
result["book_id"] = book_id )
log(f"[SCRAPING] Assigned book_id = '{book_id}'")
# ------------------------------------------------------------ # ------------------------------------------------------------
# RESET ABORT + INITIALISE PROGRESS # RESET ABORT + INITIALIZE LEGACY PROGRESS
# ------------------------------------------------------------ # ------------------------------------------------------------
clear_abort(book_id) clear_abort(book_idx)
r.set(f"progress:{book_id}:total", len(chapters)) r.set(f"progress:{book_idx}:total", len(result["chapters"]))
r.set(f"progress:{book_id}:done", 0) r.set(f"progress:{book_idx}:done", 0)
r.delete(f"logs:{book_id}") # clear old logs if any
r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}") r.delete(f"logs:{book_idx}")
r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters") r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}")
r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# DISPATCH DOWNLOAD CONTROLLER # IMPORTANT: DO NOT DISPATCH any pipelines here
# Controller will receive scrape_result and continue.
# ------------------------------------------------------------ # ------------------------------------------------------------
celery_app.send_task( return result
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],
queue="controller",
)
log(f"[SCRAPING] Dispatched download controller for '{book_id}'")
return {
"book_id": book_id,
"title": result.get("title"),
"author": result.get("author"),
"chapters": len(chapters),
}
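With scraping.py no longer dispatching anything, a caller (e.g. a /start route) presumably just enqueues the controller entrypoint by its registered name — a sketch under that assumption:

from celery_app import celery_app

def start_full_scrape_for(book_idx: str):
    # Hypothetical caller: the controller task loads the book from SQLite,
    # scrapes inline, and dispatches the per-chapter pipelines itself.
    return celery_app.send_task(
        "scraper.tasks.controller_tasks.start_full_scrape",
        args=[book_idx],
        queue="controller",
    )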
@ -1,10 +1,8 @@
# ============================================================ # ============================================================
# File: scraper/utils/state_sync.py # File: scraper/utils/state_sync.py
# Purpose: # Purpose:
# State inspection + optional sync logic for book progress. # State inspection + optional sync logic for unified book_idx model.
# This version provides: # Generates full book-card compatible dicts for debug UI.
# • inspect_books_state() → NO writes, just a dry-run
# • sync_books_from_redis() → NOT USED YET (kept commented)
# ============================================================ # ============================================================
import os import os
@ -12,17 +10,53 @@ import redis
from db.db import get_db from db.db import get_db
def inspect_books_state(): def _build_card(sqlite_row, redis_state, merged):
"""
Creates a dict that matches the fields required by components/bookcard.html:
b.book_idx
b.title
b.author
b.cover_path
b.status
b.created_at
b.download_done
b.download_total
b.audio_done
b.audio_total
""" """
Reads all books from SQLite and fetches Redis progress,
but performs NO writes. Only shows:
- sqlite row
- redis state
- merged result (dry-run)
Returns a list of inspection dicts. return {
"book_idx": sqlite_row.get("book_idx"),
"title": sqlite_row.get("title") or "Unknown",
"author": sqlite_row.get("author"),
"cover_path": sqlite_row.get("cover_path"),
# Use merged status (Redis > SQLite)
"status": merged.get("status") or sqlite_row.get("status") or "unknown",
# Meta
"created_at": sqlite_row.get("created_at"),
# Download counters
"download_done": merged.get("downloaded", 0),
"download_total": merged.get("chapters_total", 0),
# Audio counters
"audio_done": merged.get("audio_done", 0),
"audio_total": merged.get("chapters_total", 0),
}
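For reference, a card built by this helper would look roughly like the dict below (all values invented):

example_card = {
    "book_idx": "8343",
    "title": "UnknownBook",
    "author": None,
    "cover_path": "static/covers/8343.jpg",
    "status": "downloading",
    "created_at": "2025-01-01 12:00:00",
    "download_done": 12,
    "download_total": 200,
    "audio_done": 3,
    "audio_total": 200,
}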
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state():
"""
Reads all books from SQLite and fetches Redis progress.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
""" """
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db() db = get_db()
cur = db.cursor() cur = db.cursor()
@ -32,110 +66,125 @@ def inspect_books_state():
results = [] results = []
for row in rows: for row in rows:
book_id = row["book_id"]
sqlite_row = dict(row) sqlite_row = dict(row)
book_idx = sqlite_row["book_idx"]
# Read redis state redis_key = f"book:{book_idx}:state"
redis_key = f"book:{book_id}:state" redis_state = r.hgetall(redis_key) or {}
progress = r.hgetall(redis_key)
if progress:
decoded = {k.decode(): v.decode() for k, v in progress.items()}
else:
decoded = {}
# Determine dry-run merged result # ================================
# DRY-RUN MERGE LOGIC
# ================================
merged = sqlite_row.copy() merged = sqlite_row.copy()
if decoded: if redis_state:
merged["downloaded"] = int( merged["downloaded"] = int(
decoded.get("download_done", merged.get("downloaded", 0)) redis_state.get("chapters_download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(
redis_state.get("chapters_parsed_done", merged.get("parsed", 0))
) )
merged["parsed"] = int(decoded.get("parsed_done", merged.get("parsed", 0)))
merged["audio_done"] = int( merged["audio_done"] = int(
decoded.get("audio_done", merged.get("audio_done", 0)) redis_state.get("audio_done", merged.get("audio_done", 0))
) )
merged["chapters_total"] = int( merged["chapters_total"] = int(
decoded.get("chapters_total", merged.get("chapters_total", 0)) redis_state.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = redis_state.get(
"status", merged.get("status", "unknown")
) )
merged["status"] = decoded.get("status", merged.get("status", "unknown"))
# ================================
# Build book-card data
# ================================
card = _build_card(sqlite_row, redis_state, merged)
# ================================
# Append final result entry
# ================================
results.append( results.append(
{ {
"book_id": book_id, "book_idx": book_idx,
"title": sqlite_row.get("title"),
"sqlite": sqlite_row, "sqlite": sqlite_row,
"redis": decoded, "redis": redis_state,
"would_merge_to": merged, "would_merge_to": merged,
"card": card,
} }
) )
return results return results
# ============================================================
# SYNC REDIS → SQLITE (writes)
# ============================================================
def sync_books_from_redis(): def sync_books_from_redis():
""" """
Reads all books from SQLite, fetches Redis progress, Writes Redis progress values back into SQLite.
and updates SQLite rows accordingly. Uses unified book_idx as identifier.
Returns a list of {
"book_id": ...,
"before": ...,
"redis": ...,
"after": ...
}
""" """
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db() db = get_db()
cur = db.cursor() cur = db.cursor()
# Fetch all books
cur.execute("SELECT * FROM books") cur.execute("SELECT * FROM books")
rows = cur.fetchall() rows = cur.fetchall()
results = [] results = []
for row in rows: for row in rows:
book_id = row["book_id"]
before = dict(row) before = dict(row)
book_idx = before["book_idx"]
redis_key = f"book:{book_id}:state" redis_key = f"book:{book_idx}:state"
progress = r.hgetall(redis_key) redis_state = r.hgetall(redis_key)
if not progress: if not redis_state:
results.append( results.append(
{"book_id": book_id, "before": before, "redis": {}, "after": before} {
"book_idx": book_idx,
"before": before,
"redis": {},
"after": before,
}
) )
continue continue
# Decode Redis bytes → string dictionary # Extract progress from Redis
decoded = {k.decode(): v.decode() for k, v in progress.items()} downloaded = int(redis_state.get("chapters_download_done", 0))
parsed = int(redis_state.get("chapters_parsed_done", 0))
# Extract counters audio_done = int(redis_state.get("audio_done", 0))
downloaded = int(decoded.get("download_done", 0)) total = int(redis_state.get("chapters_total", 0))
parsed = int(decoded.get("parsed_done", 0)) status = redis_state.get("status", before.get("status"))
audio_done = int(decoded.get("audio_done", 0))
chapters_total = int(decoded.get("chapters_total", 0))
# Redis status wins # Update SQLite
status = decoded.get("status", before["status"])
# Write back to SQLite
cur.execute( cur.execute(
""" """
UPDATE books UPDATE books
SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now') SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
WHERE book_id = ? WHERE book_idx = ?
""", """,
(downloaded, parsed, audio_done, chapters_total, status, book_id), (downloaded, parsed, audio_done, total, status, book_idx),
) )
db.commit() db.commit()
# Fetch updated row cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,))
after = dict(cur.fetchone()) after = dict(cur.fetchone())
results.append( results.append(
{"book_id": book_id, "before": before, "redis": decoded, "after": after} {
"book_idx": book_idx,
"before": before,
"redis": redis_state,
"after": after,
}
) )
return results return results
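For context on the Redis side of this sync, here is a minimal, illustrative sketch of how a worker task could maintain the unified book:{idx}:state hash that sync_books_from_redis() mirrors into SQLite. The hash field names come from the sync code above; the helper names and the fallback Redis URL are assumptions, not code from this commit.

```python
# Illustrative sketch only -- not part of this commit.
# Producer side of the unified book:{idx}:state hash read by sync_books_from_redis();
# helper names are hypothetical.
import os
import redis

r = redis.Redis.from_url(
    os.getenv("REDIS_BROKER", "redis://localhost:6379/0"),
    decode_responses=True,
)

def init_book_state(book_idx: str, chapters_total: int) -> None:
    """Seed the hash when a pipeline starts (hypothetical helper)."""
    r.hset(f"book:{book_idx}:state", mapping={
        "status": "queued",
        "chapters_total": chapters_total,
        "chapters_download_done": 0,
        "chapters_parsed_done": 0,
        "audio_done": 0,
    })

def mark_chapter_downloaded(book_idx: str) -> None:
    """Bump the download counter for one finished chapter (hypothetical helper)."""
    key = f"book:{book_idx}:state"
    r.hincrby(key, "chapters_download_done", 1)
    r.hset(key, "status", "downloading")
```

With hashes shaped like this, the UPDATE above maps chapters_download_done to downloaded, chapters_parsed_done to parsed, and chapters_total straight through to the SQLite row.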

@ -2,7 +2,7 @@
File: static/js/dashboard.js File: static/js/dashboard.js
Purpose: Purpose:
Dashboard interactions: Dashboard interactions:
- Select active book - Select active book_idx
- Live logs & progress - Live logs & progress
- Bookcard AJAX start/abort - Bookcard AJAX start/abort
NOTE: NOTE:
@ -26,7 +26,7 @@ async function apiGet(url) {
/* --------------------------------------------------------- /* ---------------------------------------------------------
Dashboard state Dashboard state
--------------------------------------------------------- */ --------------------------------------------------------- */
let ACTIVE_BOOK = null; let ACTIVE_BOOK_IDX = null;
let REFRESH_INTERVAL = null; let REFRESH_INTERVAL = null;
console.log(">>> dashboard.js LOADED"); console.log(">>> dashboard.js LOADED");
@ -37,51 +37,51 @@ console.log(">>> dashboard.js LOADED");
document.addEventListener("DOMContentLoaded", () => { document.addEventListener("DOMContentLoaded", () => {
console.log(">>> dashboard.js DOMContentLoaded"); console.log(">>> dashboard.js DOMContentLoaded");
// Fallback: fetch global logs if no active book // Fallback: global logs when no active book_idx
setInterval(() => { setInterval(() => {
if (!ACTIVE_BOOK) refreshBook(null); if (!ACTIVE_BOOK_IDX) refreshBook(null);
}, 2000); }, 2000);
// Sidebar items // Sidebar items
const items = $$(".book-list-item"); const items = $$(".book-list-item");
items.forEach((item) => { items.forEach((item) => {
item.addEventListener("click", () => { item.addEventListener("click", () => {
selectBook(item.dataset.bookId); selectBook(item.dataset.bookIdx);
}); });
}); });
// Auto-select // Auto-select first book
if (!ACTIVE_BOOK && items[0]) { if (!ACTIVE_BOOK_IDX && items[0]) {
selectBook(items[0].dataset.bookId); selectBook(items[0].dataset.bookIdx);
} }
// Initial binding of book-card buttons // Bind start/abort buttons inside cards
bindBookCardButtons(); bindBookCardButtons();
// Refresh sidebar every 2 seconds // Refresh sidebar every few seconds
setInterval(refreshActiveBooks, 2800); setInterval(refreshActiveBooks, 2800);
}); });
/* --------------------------------------------------------- /* ---------------------------------------------------------
Select a book Select a book_idx
--------------------------------------------------------- */ --------------------------------------------------------- */
function selectBook(bookId) { function selectBook(bookIdx) {
ACTIVE_BOOK = bookId; ACTIVE_BOOK_IDX = bookIdx;
console.log(">>> Selecting book", bookId); console.log(">>> Selecting book_idx", bookIdx);
// Highlight sidebar // Highlight sidebar
$$(".book-list-item").forEach((el) => { $$(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookId === bookId); el.classList.toggle("active", el.dataset.bookIdx === bookIdx);
}); });
// Reset polling // Reset polling
if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL); if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL);
REFRESH_INTERVAL = setInterval(() => { REFRESH_INTERVAL = setInterval(() => {
refreshBook(ACTIVE_BOOK); refreshBook(ACTIVE_BOOK_IDX);
}, 2000); }, 2000);
refreshBook(ACTIVE_BOOK); refreshBook(ACTIVE_BOOK_IDX);
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
@ -99,7 +99,7 @@ async function refreshActiveBooks() {
books.forEach((b) => { books.forEach((b) => {
const div = document.createElement("div"); const div = document.createElement("div");
div.className = "book-list-item"; div.className = "book-list-item";
div.dataset.bookId = b.book_id; div.dataset.bookIdx = b.book_idx;
div.innerHTML = ` div.innerHTML = `
<div class="book-title">${b.title}</div> <div class="book-title">${b.title}</div>
@ -110,27 +110,27 @@ async function refreshActiveBooks() {
</div> </div>
`; `;
div.addEventListener("click", () => selectBook(b.book_id)); div.addEventListener("click", () => selectBook(b.book_idx));
container.appendChild(div); container.appendChild(div);
}); });
if (!ACTIVE_BOOK && books.length > 0) { if (!ACTIVE_BOOK_IDX && books.length > 0) {
selectBook(books[0].book_id); selectBook(books[0].book_idx);
} }
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
Fetch logs + progress Fetch logs + progress
--------------------------------------------------------- */ --------------------------------------------------------- */
async function refreshBook(bookId) { async function refreshBook(bookIdx) {
if (!bookId) { if (!bookIdx) {
const data = await apiGet("/logs"); const data = await apiGet("/logs");
if (data) updateLogs(data); if (data) updateLogs(data);
return; return;
} }
const state = await apiGet(`/api/book/${bookId}/status`); const state = await apiGet(`/api/book/${bookIdx}/status`);
const logs = await apiGet(`/api/book/${bookId}/logs`); const logs = await apiGet(`/api/book/${bookIdx}/logs`);
if (state) { if (state) {
updateProgressBars(state); updateProgressBars(state);
@ -140,24 +140,30 @@ async function refreshBook(bookId) {
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
BOOKCARD BUTTON BINDING idempotent BOOKCARD BUTTON BINDING (idempotent)
--------------------------------------------------------- */ --------------------------------------------------------- */
function bindBookCardButtons() { function bindBookCardButtons() {
console.log(">>> bindBookCardButtons() scanning…"); console.log(">>> bindBookCardButtons() scanning…");
// START BUTTONS // START BUTTONS
document.querySelectorAll(".book-card .icon-start").forEach((btn) => { document.querySelectorAll(".book-card .icon-start").forEach((btn) => {
if (btn.dataset.bound === "1") return; // prevent double-binding if (btn.dataset.bound === "1") return;
btn.dataset.bound = "1"; btn.dataset.bound = "1";
btn.addEventListener("click", (ev) => { btn.addEventListener("click", (ev) => {
ev.preventDefault(); ev.preventDefault();
if (btn.disabled) return; if (btn.disabled) return;
const bookId = btn.closest(".book-card").dataset.bookId; const card = btn.closest(".book-card");
console.log(">>> START clicked:", bookId); const bookIdx = card?.dataset.bookIdx;
console.log(">>> START clicked:", bookIdx);
if (!bookIdx) {
console.error(">>> ERROR: bookIdx missing on .book-card dataset");
return;
}
startBook(bookId); startBook(bookIdx);
}); });
}); });
@ -170,10 +176,16 @@ function bindBookCardButtons() {
ev.preventDefault(); ev.preventDefault();
if (btn.disabled) return; if (btn.disabled) return;
const bookId = btn.closest(".book-card").dataset.bookId; const card = btn.closest(".book-card");
console.log(">>> ABORT clicked:", bookId); const bookIdx = card?.dataset.bookIdx;
abortBookAjax(bookId); console.log(">>> ABORT clicked:", bookIdx);
if (!bookIdx) {
console.error(">>> ERROR: bookIdx missing on .book-card dataset");
return;
}
abortBookAjax(bookIdx);
}); });
}); });
} }
@ -181,13 +193,13 @@ function bindBookCardButtons() {
/* --------------------------------------------------------- /* ---------------------------------------------------------
AJAX START AJAX START
--------------------------------------------------------- */ --------------------------------------------------------- */
function startBook(bookId) { function startBook(bookIdx) {
console.log(">>> startBook():", bookId); console.log(">>> startBook():", bookIdx);
fetch("/start", { fetch("/start", {
method: "POST", method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" }, headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_id=${bookId}`, body: `book_idx=${bookIdx}`, // backend expects field name book_idx
}) })
.then(async (r) => { .then(async (r) => {
console.log(">>> /start status:", r.status); console.log(">>> /start status:", r.status);
@ -199,7 +211,7 @@ function startBook(bookId) {
console.log(">>> /start response:", data); console.log(">>> /start response:", data);
refreshBookCards(); refreshBookCards();
refreshBook(bookId); refreshBook(bookIdx);
}) })
.catch((err) => console.error("Start failed:", err)); .catch((err) => console.error("Start failed:", err));
} }
@ -207,12 +219,12 @@ function startBook(bookId) {
/* --------------------------------------------------------- /* ---------------------------------------------------------
AJAX ABORT AJAX ABORT
--------------------------------------------------------- */ --------------------------------------------------------- */
function abortBookAjax(bookId) { function abortBookAjax(bookIdx) {
if (!confirm(`Abort tasks for book ${bookId}?`)) return; if (!confirm(`Abort tasks for book ${bookIdx}?`)) return;
console.log(">>> abortBookAjax():", bookId); console.log(">>> abortBookAjax():", bookIdx);
fetch(`/abort/${bookId}`, { method: "POST" }) fetch(`/abort/${bookIdx}`, { method: "POST" })
.then(async (r) => { .then(async (r) => {
let data = null; let data = null;
try { try {
@ -221,7 +233,7 @@ function abortBookAjax(bookId) {
console.log(">>> /abort response:", data); console.log(">>> /abort response:", data);
refreshBookCards(); refreshBookCards();
refreshBook(bookId); refreshBook(bookIdx);
}) })
.catch((err) => console.error("Abort failed:", err)); .catch((err) => console.error("Abort failed:", err));
} }
@ -234,8 +246,8 @@ async function refreshBookCards() {
if (!books) return; if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => { document.querySelectorAll(".book-card").forEach((card) => {
const id = card.dataset.bookId; const idx = card.dataset.bookIdx;
const info = books.find((b) => b.book_id === id); const info = books.find((b) => b.book_idx === idx);
if (!info) return; if (!info) return;
// Status CSS // Status CSS
@ -255,5 +267,5 @@ async function refreshBookCards() {
].includes(info.status); ].includes(info.status);
}); });
bindBookCardButtons(); // rebind new DOM bindBookCardButtons();
} }
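The route handlers behind these fetches are not part of this diff. The sketch below shows one plausible shape for /api/book/<book_idx>/status that would give updateProgressBars() (progress.js, further down) the download_*/audio_* fields it reads. It assumes the Flask app object and the decoded Redis client r from the web module, and reuses the Redis field names from the sync code; the real endpoint may differ.

```python
# Assumed sketch of the status endpoint polled by refreshBook().
# "app" and "r" stand for the Flask app and a decode_responses Redis client.
from flask import jsonify

@app.route("/api/book/<book_idx>/status")
def api_book_status(book_idx):
    state = r.hgetall(f"book:{book_idx}:state") or {}
    total = int(state.get("chapters_total", 0) or 0)
    return jsonify({
        "book_idx": book_idx,
        "status": state.get("status", "unknown"),
        "download_done": int(state.get("chapters_download_done", 0) or 0),
        "download_total": total,
        "audio_done": int(state.get("audio_done", 0) or 0),
        "audio_total": total,
    })
```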

@ -15,7 +15,7 @@ console.log(">>> log_view.js LOADING…");
--------------------------------------------------------- */ --------------------------------------------------------- */
let LOG_FILTER = "ALL"; let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // delta offset let LAST_LOG_INDEX = -1; // delta offset
const MAX_LOG_LINES = 600; // safe rolling window const MAX_LOG_LINES = 600;
/* --------------------------------------------------------- /* ---------------------------------------------------------
Apply filter on existing log lines Apply filter on existing log lines
@ -35,38 +35,25 @@ function applyLogFilter() {
document.addEventListener("DOMContentLoaded", () => { document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded"); console.log(">>> log_view.js DOMContentLoaded");
const filterSel = $("#log-filter");
const clearBtn = $("#log-clear"); const clearBtn = $("#log-clear");
const output = $("#log-output"); const output = $("#log-output");
if (!output) { if (!output) {
console.log( console.log(">>> log_view.js: No #log-output → viewer disabled");
">>> log_view.js: No #log-output on this page → viewer disabled"
);
return; return;
} }
console.log(">>> log_view.js: log viewer detected.");
// Filter dropdown (currently disabled in your UI)
// if (filterSel) {
// filterSel.addEventListener("change", () => {
// LOG_FILTER = filterSel.value;
// applyLogFilter();
// });
// }
if (clearBtn) { if (clearBtn) {
clearBtn.addEventListener("click", () => { clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js: Clear log viewer"); console.log(">>> log_view.js: Clear log viewer");
output.innerHTML = ""; output.innerHTML = "";
LAST_LOG_INDEX = -1; // reset delta polling LAST_LOG_INDEX = -1;
}); });
} }
}); });
/* --------------------------------------------------------- /* ---------------------------------------------------------
Append ONE line (smart class assignment) Append ONE line
--------------------------------------------------------- */ --------------------------------------------------------- */
function rollingAppend(lineText) { function rollingAppend(lineText) {
const output = $("#log-output"); const output = $("#log-output");
@ -86,7 +73,6 @@ function rollingAppend(lineText) {
else div.classList.add("default"); else div.classList.add("default");
div.textContent = lineText; div.textContent = lineText;
output.appendChild(div); output.appendChild(div);
// Rolling limit // Rolling limit
@ -96,31 +82,24 @@ function rollingAppend(lineText) {
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
Primary API entry: updateLogs() Primary entry: updateLogs()
Used by dashboard.js AND delta polling
Accepts: Accepts:
{ logs: [...], last_index: N } { logs:[...], last:N }
OR legacy: OR legacy:
{ lines: [...], total: N } { lines:[...], last:N }
--------------------------------------------------------- */ --------------------------------------------------------- */
function updateLogs(packet) { function updateLogs(packet) {
const output = $("#log-output"); const output = $("#log-output");
if (!output) return; if (!output || !packet) return;
if (!packet) return;
// Normalized log arrays
let lines = packet.logs || packet.lines || []; let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return; if (!Array.isArray(lines)) return;
// Append only new lines
lines.forEach((line) => rollingAppend(line)); lines.forEach((line) => rollingAppend(line));
// Update delta index // Correct unified delta index handling
if (packet.last_index !== undefined) { if (packet.last !== undefined) {
LAST_LOG_INDEX = packet.last_index; LAST_LOG_INDEX = packet.last;
} else if (packet.total !== undefined) {
LAST_LOG_INDEX = packet.total - 1;
} }
applyLogFilter(); applyLogFilter();
@ -128,18 +107,17 @@ function updateLogs(packet) {
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
Delta polling: ONLY global logs use this Delta polling global logs ONLY
Dashboard overrides logs per book. (dashboard.js overrides logs per-book)
--------------------------------------------------------- */ --------------------------------------------------------- */
function pollLogs() { function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`) fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
.then((r) => r.json()) .then((r) => r.json())
.then((data) => { .then((data) => {
const lines = data.lines || []; const lines = data.lines || [];
if (lines.length > 0) { if (lines.length > 0) {
lines.forEach((line) => logAppend(line)); lines.forEach((line) => rollingAppend(line));
LAST_LOG_INDEX = data.last; // <-- THE CORRECT INDEX! LAST_LOG_INDEX = data.last;
} }
}) })
.catch((err) => { .catch((err) => {
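The delta contract pollLogs() now relies on is {lines: [...], last: N} (updateLogs() also accepts a logs key). Below is a minimal sketch of a matching handler under the assumption that get_ui_logs() returns the full ordered list of log lines; the actual /logs route is not visible in this diff and may look different.

```python
# Assumed sketch of the /logs delta endpoint used by pollLogs().
from flask import request, jsonify

@app.route("/logs")
def logs_delta():
    last_seen = int(request.args.get("last_index", -1))
    all_lines = get_ui_logs() or []          # full log buffer
    new_lines = all_lines[last_seen + 1:]    # only lines the client has not seen
    return jsonify({
        "lines": new_lines,
        "logs": new_lines,                   # updateLogs() accepts either key
        "last": len(all_lines) - 1,          # cursor for the next poll
    })
```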

@ -2,7 +2,7 @@
File: static/js/progress.js File: static/js/progress.js
Purpose: Purpose:
Update progress bars dynamically for the current book. Update progress bars dynamically for the current book.
Expects data from API endpoints via dashboard.js or start.js. Only updates the main progress box (book_detail page).
======================================================================= */ ======================================================================= */
console.log(">>> progress.js LOADED"); console.log(">>> progress.js LOADED");
@ -15,19 +15,20 @@ function updateProgressBars(data) {
return; return;
} }
// Data format expected: // We always update inside the main progress box:
// { const container = document.querySelector("#progressSection");
// download_done, if (!container) {
// download_total, console.warn(">>> progress.js: #progressSection NOT FOUND");
// audio_done, return;
// audio_total }
// }
const barDL = $(".progress-bar-fill");
const barAU = $(".progress-bar-fill.audio-fill");
console.log(">>> progress.js barDL =", barDL); // Select bars ONLY inside the correct section
console.log(">>> progress.js barAU =", barAU); const barDL = container.querySelector(
".progress-bar:not(.audio) .progress-bar-fill"
);
const barAU = container.querySelector(
".progress-bar.audio .progress-bar-fill"
);
const pctDL = const pctDL =
data.download_total > 0 data.download_total > 0
@ -39,23 +40,22 @@ function updateProgressBars(data) {
if (barDL) { if (barDL) {
barDL.style.width = pctDL.toFixed(1) + "%"; barDL.style.width = pctDL.toFixed(1) + "%";
console.log(">>> progress.js updated DL bar to", pctDL.toFixed(1) + "%"); console.log(">>> progress.js DL bar =", pctDL.toFixed(1) + "%");
} else { } else {
console.warn(">>> progress.js: barDL NOT FOUND"); console.warn(">>> progress.js: barDL NOT FOUND INSIDE #progressSection");
} }
if (barAU) { if (barAU) {
barAU.style.width = pctAU.toFixed(1) + "%"; barAU.style.width = pctAU.toFixed(1) + "%";
console.log(">>> progress.js updated AU bar to", pctAU.toFixed(1) + "%"); console.log(">>> progress.js AU bar =", pctAU.toFixed(1) + "%");
} else { } else {
console.warn(">>> progress.js: barAU NOT FOUND"); console.warn(">>> progress.js: barAU NOT FOUND INSIDE #progressSection");
} }
// Update textual stats // Textual stats — only update inside progress box
const stats = $$(".progress-stats span"); const stats = container.querySelectorAll(".progress-stats span");
console.log(">>> progress.js stats elements found:", stats.length);
// Expected structure: [DL "x/y", DL "pct", AU "x/y", AU "pct"] // Expected: [DL x/y, DL %, AU x/y, AU %]
if (stats.length >= 4) { if (stats.length >= 4) {
stats[0].innerText = `${data.download_done} / ${data.download_total}`; stats[0].innerText = `${data.download_done} / ${data.download_total}`;
stats[1].innerText = pctDL.toFixed(1) + "%"; stats[1].innerText = pctDL.toFixed(1) + "%";
@ -65,7 +65,7 @@ function updateProgressBars(data) {
console.log(">>> progress.js stats updated"); console.log(">>> progress.js stats updated");
} else { } else {
console.warn( console.warn(
">>> progress.js: not enough stats spans, found", ">>> progress.js: not enough stats spans in the container, found",
stats.length stats.length
); );
} }
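A quick worked example of the percentage math applied above; the payload values are invented for illustration only.

```python
# Example status payload; numbers are made up.
data = {"download_done": 45, "download_total": 120,
        "audio_done": 12, "audio_total": 120}

pct_dl = data["download_done"] / data["download_total"] * 100 if data["download_total"] > 0 else 0
pct_au = data["audio_done"] / data["audio_total"] * 100 if data["audio_total"] > 0 else 0

print(f"{pct_dl:.1f}%")  # 37.5% -> width of the download bar
print(f"{pct_au:.1f}%")  # 10.0% -> width of the audio bar
```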

@ -3,17 +3,17 @@
Purpose: Purpose:
Dashboard view of a single book in the list. Dashboard view of a single book in the list.
Variables come in via: Variables come in via:
book.<veld>
So all fields must be accessed via "book.<veld>". The book now uses book_idx exclusively as its primary key
======================================================================= --> ======================================================================= -->
<div class="book-list-item" data-book-id="{{ book.book_id }}"> <div class="book-list-item" data-book-idx="{{ book.book_idx }}">
<!-- Left area: title + metadata --> <!-- Left area: title + metadata -->
<div class="book-info"> <div class="book-info">
<div class="book-title">{{ book.title }}</div> <div class="book-title">{{ book.title }}</div>
<div class="book-meta"> <div class="book-meta">
<span class="meta-label">ID:</span> {{ book.book_id }} {% if <span class="meta-label">IDX:</span> {{ book.book_idx }} {% if
book.last_update %} book.last_update %}
<span class="meta-separator"></span> <span class="meta-separator"></span>
<span class="meta-label">Updated:</span> {{ book.last_update }} {% endif <span class="meta-label">Updated:</span> {{ book.last_update }} {% endif
@ -56,8 +56,10 @@
<span class="mini-value">{{ pct_au }}%</span> <span class="mini-value">{{ pct_au }}%</span>
</div> </div>
</div> </div>
<!-- Abort button -->
<div class="book-abort-area"> <div class="book-abort-area">
<button class="abort-btn" onclick="abortBook('{{ book.book_id }}')"> <button class="abort-btn" onclick="abortBookAjax('{{ book.book_idx }}')">
Abort Abort
</button> </button>
</div> </div>

@ -12,13 +12,13 @@
variable "b" in context variable "b" in context
============================================================ #} ============================================================ #}
<div class="book-card {{ b.status }}" data-book-id="{{ b.book_id }}"> <div class="book-card {{ b.status }}" data-book-idx="{{ b.book_idx }}">
<!-- ====================================================== <!-- ======================================================
HIDE BUTTON (icon-only) HIDE BUTTON (icon-only)
====================================================== --> ====================================================== -->
<form <form
action="/hide/{{ b.book_id }}" action="/hide/{{ b.book_idx }}"
method="POST" method="POST"
onsubmit="return confirm('Dit boek verbergen?')" onsubmit="return confirm('Dit boek verbergen?')"
class="hide-form" class="hide-form"
@ -50,7 +50,7 @@
<div class="book-actions"> <div class="book-actions">
<!-- START --> <!-- START -->
<form action="/start" method="POST"> <form action="/start" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" /> <input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button <button
class="icon-btn icon-start" class="icon-btn icon-start"
title="Start scraping" title="Start scraping"
@ -63,8 +63,8 @@
</form> </form>
<!-- ABORT --> <!-- ABORT -->
<form action="/abort/{{ b.book_id }}" method="POST"> <form action="/abort/{{ b.book_idx }}" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" /> <input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button <button
class="icon-btn icon-abort" class="icon-btn icon-abort"
title="Stoppen (abort)" title="Stoppen (abort)"

@ -3,7 +3,7 @@
Purpose: Reusable progress overview (download + audio) for any book. Purpose: Reusable progress overview (download + audio) for any book.
Notes: Notes:
- Expects the following variables from Flask: - Expects the following variables from Flask:
book_id: str book_idx: str
title: str title: str
download_total: int download_total: int
download_done: int download_done: int
@ -18,8 +18,8 @@
<h2>Progress</h2> <h2>Progress</h2>
{% if title %} {% if title %}
<div class="progress-subtitle">{{ title }}</div> <div class="progress-subtitle">{{ title }}</div>
{% endif %} {% if book_id %} {% endif %} {% if book_idx %}
<div class="progress-bookid">Book ID: <span>{{ book_id }}</span></div> <div class="progress-bookid">Book IDX: <span>{{ book_idx }}</span></div>
{% endif %} {% endif %}
</div> </div>
@ -57,5 +57,6 @@
<span>{{ pct2 }}%</span> <span>{{ pct2 }}%</span>
</div> </div>
</div> </div>
<script src="/static/js/progress.js"></script> <script src="/static/js/progress.js"></script>
</div> </div>

@ -1,7 +1,7 @@
<!-- ======================================================================= <!-- =======================================================================
File: templates/dashboard/book_detail.html File: templates/dashboard/book_detail.html
Purpose: Purpose:
Detail page for a single book_id. Detail page for a single book_idx.
Shows progress (download/audio) + filters + live logs. Shows progress (download/audio) + filters + live logs.
======================================================================= --> ======================================================================= -->
@ -15,7 +15,9 @@
<!-- Progress box --> <!-- Progress box -->
<section id="progressSection"> <section id="progressSection">
{% include "components/progress_box.html" %} {% include "components/progress_box.html" with book_idx=book_idx,
title=title, download_total=download_total, download_done=download_done,
audio_total=audio_total, audio_done=audio_done %}
</section> </section>
<!-- Log view --> <!-- Log view -->
@ -27,13 +29,10 @@
<!-- PAGE-SPECIFIC JS --> <!-- PAGE-SPECIFIC JS -->
<script> <script>
const BOOK_ID = "{{ book_id }}"; const BOOK_IDX = "{{ book_idx }}";
</script> </script>
<!-- Shared log viewer -->
<script src="/static/js/log_view.js"></script>
<!-- Dashboard behaviour (only does something if dashboard HTML is present) --> <script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script> <script src="/static/js/dashboard.js"></script>
<!-- Existing global app logic -->
{% endblock %} {% endblock %}

@ -1,26 +1,27 @@
{% extends "layout.html" %} {% block content %} {# ============================================================ File:
templates/debug/inspect_state.html Purpose: Inspect SQLite vs Redis state per
book_idx. Left side: full book-card UI (same component as dashboard) Right side:
SQL / Redis / merged comparison table.
============================================================ #} {% extends
"layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1> <h1>State Inspection (SQL vs Redis)</h1>
<style> <style>
.state-card { .state-block {
border: 1px solid #444; display: grid;
grid-template-columns: 380px 1fr;
gap: 20px;
margin-bottom: 35px;
padding: 18px; padding: 18px;
margin-bottom: 30px; border: 1px solid #444;
background: #222; background: #222;
border-radius: 8px; border-radius: 8px;
} }
.state-title { .state-table {
font-size: 1.4em;
margin-bottom: 14px;
color: #9cf;
}
table.state-table {
width: 100%; width: 100%;
border-collapse: collapse; border-collapse: collapse;
margin-bottom: 12px;
} }
.state-table th, .state-table th,
@ -34,17 +35,20 @@
color: #fff; color: #fff;
} }
.same { .state-table td {
color: #9f9; background: #2a2a2a;
color: #ddd;
} }
.same {
color: #9f9 !important;
}
.diff { .diff {
color: #ff7b7b; color: #ff7b7b !important;
font-weight: bold; font-weight: bold;
} }
.empty { .empty {
color: #aaa; color: #aaa !important;
font-style: italic; font-style: italic;
} }
</style> </style>
@ -56,33 +60,36 @@
<td class="diff">{{ sqlval }}</td> <td class="diff">{{ sqlval }}</td>
<td class="diff">{{ redisval }}</td> <td class="diff">{{ redisval }}</td>
{% endif %} {% endmacro %} {% for entry in results %} {% endif %} {% endmacro %} {% for entry in results %}
<div class="state-card"> <div class="state-block">
<div class="state-title">📘 {{ entry.book_id }}</div> <!-- LEFT COLUMN: book-card preview -->
<div>
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged = {% with b = entry.card %} {% include "components/bookcard.html" %} {%
entry.would_merge_to %} endwith %}
</div>
<table class="state-table">
<tr> <!-- RIGHT COLUMN: SQL vs Redis comparison -->
<th>Field</th> <div>
<th>SQLite</th> <table class="state-table">
<th>Redis</th> <tr>
<th>Merged Result</th> <th>Field</th>
</tr> <th>SQLite</th>
<th>Redis</th>
{% for field in [ "status", "chapters_total", "downloaded", <th>Merged Result</th>
"chapters_download_done", "chapters_download_skipped", "parsed", </tr>
"chapters_parsed_done", "audio_done", "audio_skipped", "last_update" ] %}
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged =
<tr> entry.would_merge_to %} {% for field in [ "status", "chapters_total",
<th>{{ field }}</th> "downloaded", "chapters_download_done", "chapters_download_skipped",
"parsed", "chapters_parsed_done", "audio_done", "audio_skipped",
<td>{{ sql.get(field, '') }}</td> "last_update" ] %}
<td>{{ redis.get(field, '') }}</td> <tr>
<td>{{ merged.get(field, '') }}</td> <th>{{ field }}</th>
</tr> <td>{{ sql.get(field, '') }}</td>
<td>{{ redis.get(field, '') }}</td>
{% endfor %} <td>{{ merged.get(field, '') }}</td>
</table> </tr>
{% endfor %}
</table>
</div>
</div> </div>
{% endfor %} {% endblock %} {% endfor %} {% endblock %}
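_build_card() itself is not in this diff; the sketch below only mirrors what the template needs, namely a dict the bookcard include can use as b (at least book_idx, title and status). Every field beyond those, and the overall shape, is an assumption rather than the real implementation.

```python
# Assumed shape of the card dict passed to components/bookcard.html via
# {% with b = entry.card %}.  Not the actual _build_card() from this commit.
def _build_card(sqlite_row, redis_state, merged):
    return {
        "book_idx": sqlite_row.get("book_idx"),
        "title": sqlite_row.get("title") or sqlite_row.get("book_idx"),
        "status": merged.get("status", "unknown"),
        "chapters_total": merged.get("chapters_total", 0),
        "chapters_download_done": merged.get("chapters_download_done", 0),
        "audio_done": merged.get("audio_done", 0),
    }
```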

@ -32,7 +32,6 @@
font-size: 13px; font-size: 13px;
} }
/* NEW: Clear button */
#clearLogBtn { #clearLogBtn {
margin-bottom: 10px; margin-bottom: 10px;
padding: 8px 16px; padding: 8px 16px;
@ -70,7 +69,7 @@
<body> <body>
<a href="/">&larr; Terug</a> <a href="/">&larr; Terug</a>
<h1>Scrape Resultaat--</h1> <h1>Scrape Resultaat</h1>
{% if error %} {% if error %}
<div <div
@ -79,7 +78,9 @@
> >
<strong>Fout:</strong> {{ error }} <strong>Fout:</strong> {{ error }}
</div> </div>
{% endif %} {% if message %} {% endif %}
{% if message %}
<div class="box">{{ message }}</div> <div class="box">{{ message }}</div>
{% endif %} {% endif %}
@ -113,127 +114,29 @@
class="box hidden" class="box hidden"
style="background: #ffefef; border-left: 5px solid #cc0000" style="background: #ffefef; border-left: 5px solid #cc0000"
> >
<strong>Failed chapters:</strong> <strong>Mislukte hoofdstukken:</strong>
<ul id="failedList" style="margin-top: 10px"></ul> <ul id="failedList" style="margin-top: 10px"></ul>
</div> </div>
<div class="box"> <div class="box">
<strong>Live log:</strong><br /> <strong>Live log:</strong><br />
<!-- NEW BUTTON -->
<button id="clearLogBtn" onclick="clearLogs()">Clear logs</button> <button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
<div id="logbox" class="logbox"></div> <div id="logbox" class="logbox"></div>
</div> </div>
<script> <script>
// comes back from the Celery scraping task
const scrapingTaskId = "{{ scraping_task_id or '' }}"; const scrapingTaskId = "{{ scraping_task_id or '' }}";
let bookId = null; let bookIdx = null;
let polling = true; let polling = true;
if (scrapingTaskId) pollForBookId(); if (scrapingTaskId) pollForBookIdx();
function pollForBookId() {
fetch(`/celery-result/${scrapingTaskId}`)
.then((r) => r.json())
.then((data) => {
if (data.ready && data.result && data.result.book_id) {
bookId = data.result.book_id;
startLiveUI();
} else setTimeout(pollForBookId, 800);
})
.catch(() => setTimeout(pollForBookId, 1200));
}
function startLiveUI() {
document.getElementById("statusBox").classList.remove("hidden");
document.getElementById("abortBtn").classList.remove("hidden");
document.getElementById("abortBtn").onclick = () => {
fetch(`/abort/${bookId}`, { method: "POST" });
};
pollProgress();
pollLogs();
}
function pollProgress() {
if (!bookId) return;
fetch(`/progress/${bookId}`)
.then((r) => r.json())
.then((p) => {
const done = p.completed || 0;
const total = p.total || 0;
document.getElementById(
"progressText"
).innerText = `Completed: ${done} / ${total} | Skipped: ${
p.skipped || 0
} | Failed: ${p.failed || 0}`;
const failedBox = document.getElementById("failedBox");
const failedList = document.getElementById("failedList");
if (p.failed_list && p.failed_list.length > 0) {
failedBox.classList.remove("hidden");
failedList.innerHTML = "";
p.failed_list.forEach((entry) => {
const li = document.createElement("li");
li.textContent = entry;
failedList.appendChild(li);
});
}
if (p.abort) {
document.getElementById("statusLine").innerText = "ABORTED";
polling = false;
} else if (done >= total && total > 0) {
document.getElementById("statusLine").innerText = "KLAAR ✔";
polling = false;
} else {
document.getElementById("statusLine").innerText = "Bezig…";
}
if (polling) setTimeout(pollProgress, 1000);
})
.catch(() => {
if (polling) setTimeout(pollProgress, 1500);
});
}
function pollLogs() { // -----------------------------------------------------
if (!polling) return; // Fetch the Celery result and wait until the scraper returns a book_idx
// -----------------------------------------------------
fetch(`/logs`) function pollForBookIdx() {
.then((r) => r.json()) fetch(`/celery-result/${scrapingTaskId}`)
.then((data) => {
const logbox = document.getElementById("logbox");
logbox.innerHTML = "";
data.logs.forEach((line) => {
const div = document.createElement("div");
div.textContent = line;
logbox.appendChild(div);
});
logbox.scrollTop = logbox.scrollHeight;
setTimeout(pollLogs, 1000);
})
.catch(() => setTimeout(pollLogs, 1500));
}
// =========================================================
// NEW: Clear logs button handler
// =========================================================
function clearLogs() {
fetch("/clear-logs", { method: "POST" })
.then(() => {
document.getElementById("logbox").innerHTML = "";
})
.catch((e) => console.error("Clear logs failed:", e));
}
</script>
</body>
</html>
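The polling loop above depends on /celery-result/<task_id> reporting readiness plus a result payload that now carries book_idx. A sketch of that contract, assuming standard Celery AsyncResult usage against the project's configured Celery app; the real route may well differ.

```python
# Assumed sketch of the /celery-result endpoint polled by pollForBookIdx().
from celery.result import AsyncResult
from flask import jsonify

@app.route("/celery-result/<task_id>")
def celery_result(task_id):
    res = AsyncResult(task_id)
    payload = {"ready": res.ready(), "result": None}
    if res.ready() and res.successful():
        payload["result"] = res.result   # expected to include {"book_idx": ...}
    return jsonify(payload)
```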
