Refactor: unified book_idx state model + controller rewrite + Redis/SQLite sync overhaul

- Removed old scraping→controller chain (no more async .get)
- New DownloadController pipeline structure
- Unified Redis Book State Model (book:{idx}:state)
- Updated all Celery tasks for unified IDs
- Removed old scraper/db.py
- Updated templates and dashboard components
- Added debug Inspect State system with bookcard preview
- Updated JS dashboard pipeline refresh
- Updated init_service + scrape_engine
- Improved abort logic
feature/bookstate-progress-fix
peter.fong 1 week ago
parent feb8ca60d7
commit 3a62dfae79
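
For reference, a minimal sketch of reading the unified Book State Model, assuming the same REDIS_BROKER env var and decode_responses=True client used throughout this diff; field names follow getStatus() and init_book_state() below, and book_idx is illustrative.

import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
                         decode_responses=True)

def read_book_state(book_idx):
    # One hash per book: book:{book_idx}:state holds status plus all counters.
    state = r.hgetall(f"book:{book_idx}:state")
    return {
        "status": state.get("status", "unknown"),
        "chapters_total": int(state.get("chapters_total", 0)),
        "chapters_download_done": int(state.get("chapters_download_done", 0)),
        "chapters_download_skipped": int(state.get("chapters_download_skipped", 0)),
        "chapters_parsed_done": int(state.get("chapters_parsed_done", 0)),
        "audio_done": int(state.get("audio_done", 0)),
        "audio_skipped": int(state.get("audio_skipped", 0)),
    }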

@ -28,9 +28,10 @@ from db.repository import (
get_progress,
)
from logbus.publisher import log
from scraper.logger import log_debug
from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.ui_log import get_ui_logs, reset_ui_logs
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis
@ -42,7 +43,6 @@ init_db()
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING
# =====================================================
@ -70,25 +70,25 @@ def index():
@logcall
def dashboard():
logs_list = get_ui_logs() or []
# Filter hidden books ONLY for GUI
registered_books = get_registered_books()
log(f"[WEB] Registered books: {registered_books}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
books=list_active_books(), # Redis
registered=reg, # SQLite (filtered)
books=list_active_books(),
registered=reg,
logs=logs_list,
)
@app.route("/book/<book_id>")
@app.route("/book/<book_idx>")
@logcall
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
def book_detail(book_idx):
title = r.get(f"book:{book_idx}:title") or book_idx
return render_template(
"dashboard/book_detail.html",
book_id=book_id,
book_id=book_idx,
title=title,
logs=get_ui_logs(),
)
@ -102,13 +102,6 @@ def book_detail(book_id):
@app.route("/init", methods=["POST"])
@logcall
def init_book():
"""
INIT-flow:
- user enters URL
- metadata fetch
- insert into SQLite as 'registered'
- return dashboard
"""
url = request.form.get("url", "").strip()
if not url:
@ -146,47 +139,64 @@ def init_book():
)
@app.route("/hide/<book_id>", methods=["POST"])
@app.route("/hide/<book_idx>", methods=["POST"])
@logcall
def hide_registered_book(book_id):
"""
Soft-delete/hide for the GUI.
The DB stays intact.
"""
# try:
# hide_book(book_id)
# return redirect("/dashboard")
# # return jsonify({"status": "ok", "hidden": book_id})
# except Exception as e:
# return jsonify({"status": "error", "message": str(e)}), 500
def hide_registered_book(book_idx):
# Intentionally disabled; return an empty 204 so Flask does not error on a None response
return "", 204
@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
"""
Start a FULL scrape from a registered INIT record.
"""
book_id = request.form.get("book_id")
if not book_id:
return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400
book = fetch_book(book_id)
# 1) Form field: book_idx
book_idx = request.form.get("book_idx")
log(f"[WEB][START] Received start request for book_idx={book_idx}")
if not book_idx:
msg = "book_idx ontbreekt in formulier"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 400
# 2) Fetch book from SQLite
try:
book = fetch_book(book_idx)
log(f"[WEB][START] Fetched book from DB: {book}")
except Exception as e:
log(f"[WEB][START] DB ERROR: {e}")
return jsonify({"status": "error", "message": "DB fout"}), 500
if not book:
return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404
msg = f"Boek '{book_idx}' niet gevonden in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 404
# 3) The book must have a URL
url = book.get("book_url")
if not url:
return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500
msg = f"Boek '{book_idx}' heeft geen book_url in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 500
# 4) Reset UI logs
reset_ui_logs()
log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}")
# 5) Logging
log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
log_debug(f"[WEB][START] DEBUG: book data = {book}")
# 6) Start the Celery controller task
try:
async_result = celery_app.send_task(
"scraper.tasks.scraping.start_scrape_book",
args=[url],
queue="scraping",
"scraper.tasks.controller_tasks.start_full_scrape",
args=[book_idx],
queue="controller",
)
except Exception as e:
log(f"[WEB][START] Celery ERROR: {e}")
return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500
# 7) Successfully dispatched task
log(f"[WEB][START] Task dispatched: {async_result.id}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
@ -199,12 +209,12 @@ def start_scraping():
)
@app.route("/abort/<book_id>", methods=["POST"])
@app.route("/abort/<book_idx>", methods=["POST"])
@logcall
def abort_download(book_id):
log_debug(f"[WEB] Abort requested for book: {book_id}")
set_abort(book_id)
return jsonify({"status": "ok", "aborted": book_id})
def abort_download(book_idx):
log_debug(f"[WEB] Abort requested for book: {book_idx}")
set_abort(book_idx)
return jsonify({"status": "ok", "aborted": book_idx})
# =====================================================
@ -218,23 +228,23 @@ def api_books():
return jsonify(list_active_books())
@app.route("/api/book/<book_id>/status")
@app.route("/api/book/<book_idx>/status")
@logcall
def api_book_status(book_id):
return jsonify(getStatus(book_id))
def api_book_status(book_idx):
return jsonify(getStatus(book_idx))
@app.route("/api/book/<book_id>/logs")
@app.route("/api/book/<book_idx>/logs")
@logcall
def api_book_logs(book_id):
logs = r.lrange(f"logs:{book_id}", 0, -1) or []
def api_book_logs(book_idx):
logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
return jsonify(logs)
@app.route("/progress/<book_id>")
@app.route("/progress/<book_idx>")
@logcall
def progress(book_id):
return jsonify(get_progress(book_id))
def progress(book_idx):
return jsonify(get_progress(book_idx))
@app.route("/celery-result/<task_id>")
@ -258,16 +268,13 @@ def clear_logs():
@app.route("/logs", methods=["GET"])
@logcall
def logs():
# last_index comes from the client (default = -1 on the first call)
try:
last_index = int(request.args.get("last_index", -1))
except (TypeError, ValueError):
last_index = -1
# Fetch the full current log list
all_logs = get_ui_logs() or []
# Delta: every line with index > last_index
new_lines = []
new_last = last_index
@ -282,6 +289,8 @@ def logs():
# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
results = sync_books_from_redis()
@ -293,13 +302,6 @@ from scraper.utils.state_sync import inspect_books_state
@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
"""
Shows:
- raw SQLite values,
- raw Redis values,
- what the merged result WOULD be.
No writes happen.
"""
results = inspect_books_state()
return render_template("debug/inspect_state.html", results=results)
@ -339,10 +341,10 @@ def api_db_books():
# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
from flask import render_template
from urllib.parse import urlparse
import redis
import os
from celery_app import celery_app
@ -354,11 +356,10 @@ def debug_queues():
workers_scheduled = insp.scheduled() or {}
workers_reserved = insp.reserved() or {}
# ---- Redis connection ----
redis_url = os.getenv("REDIS_BROKER")
parsed = urlparse(redis_url)
r = redis.Redis(
r2 = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/") or 0),
@ -375,8 +376,8 @@ def debug_queues():
{
"name": q,
"redis_key": key,
"length": r.llen(key),
"items": r.lrange(key, 0, 30), # first 30 entries
"length": r2.llen(key),
"items": r2.lrange(key, 0, 30),
}
)
except Exception as e:
@ -404,17 +405,17 @@ def debug_queues():
@logcall
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
def getStatus(book_idx):
state = r.hgetall(f"book:{book_idx}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
title = state.get("title") or book_idx
return {
"book_id": book_id,
"book_id": book_idx,
"title": title,
"status": status,
"download_done": dl_done,
@ -431,8 +432,8 @@ def list_active_books():
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
book_idx = key[first + 1 : second]
books.append(getStatus(book_idx))
return books
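
Rough usage sketch of the routes above, assuming a local dev server on port 5000 and an already-registered book_idx of "12345" (both illustrative): /start takes a book_idx form field, and the status/progress APIs read from the unified Redis state.

import requests

BASE = "http://localhost:5000"  # assumed dev address

# Kick off the full scrape for a book registered via /init.
resp = requests.post(f"{BASE}/start", data={"book_idx": "12345"})
print(resp.status_code)

# Poll the live state that getStatus() builds from book:12345:state.
print(requests.get(f"{BASE}/api/book/12345/status").json())
print(requests.get(f"{BASE}/progress/12345").json())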

@ -1,12 +1,11 @@
# ============================================================
# File: db/db.py
# File: db/db.py (UPDATED for book_idx-only architecture)
# Purpose:
# Raw SQLite engine for BookScraper.
# Provides ONLY low-level DB primitives.
# - Connection management (existing DELETE journal mode)
# - Connection management
# - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write
# - raw fetch helpers (private)
# - upsert_book() atomic write (now uses book_idx)
# - raw fetch helpers
# ============================================================
import os
@ -52,12 +51,12 @@ def init_db():
conn = get_db()
# --------------------------------------------------------
# BASE SCHEMA (unchanged)
# BASE SCHEMA — book_idx is now PRIMARY KEY
# --------------------------------------------------------
conn.execute(
"""
CREATE TABLE IF NOT EXISTS books (
book_id TEXT PRIMARY KEY,
book_idx INTEGER PRIMARY KEY,
title TEXT,
author TEXT,
description TEXT,
@ -81,7 +80,7 @@ def init_db():
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE UTIL
# SCHEMA UPGRADE UTILITY
# --------------------------------------------------------
def add_column(name, type_):
try:
@ -92,16 +91,16 @@ def init_db():
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
# Existing: ensure new metadata fields exist
# --------------------------------------------------------
# UPGRADE NEW FIELDS — future-proof, matched with Redis state model
# --------------------------------------------------------
# (book_idx already exists as PRIMARY KEY — no need to add)
add_column("description", "TEXT")
add_column("cover_path", "TEXT")
add_column("book_url", "TEXT")
# --------------------------------------------------------
# NEW FIELDS — MATCH REDIS STATE MODEL (future-proof)
# These do NOT change logic, but enable repository snapshot sync.
# --------------------------------------------------------
# Download counters
add_column("chapters_download_done", "INTEGER DEFAULT 0")
add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
@ -116,13 +115,18 @@ def init_db():
# ------------------------------------------------------------
# WRITE OPERATIONS
# WRITE OPERATIONS (book_idx-based UPSERT)
# ------------------------------------------------------------
def upsert_book(book_id, **fields):
def upsert_book(book_idx, **fields):
"""
UPSERT by book_idx.
Replaces old upsert that used book_id.
"""
conn = get_db()
keys = ["book_id"] + list(fields.keys())
values = [book_id] + list(fields.values())
keys = ["book_idx"] + list(fields.keys())
values = [book_idx] + list(fields.values())
placeholders = ",".join(["?"] * len(values))
updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()])
@ -130,7 +134,7 @@ def upsert_book(book_id, **fields):
sql = f"""
INSERT INTO books ({','.join(keys)})
VALUES ({placeholders})
ON CONFLICT(book_id)
ON CONFLICT(book_idx)
DO UPDATE SET {updates},
last_update = CURRENT_TIMESTAMP;
"""
@ -140,11 +144,13 @@ def upsert_book(book_id, **fields):
# ------------------------------------------------------------
# RAW READ OPERATIONS (PRIVATE)
# RAW READ OPERATIONS
# ------------------------------------------------------------
def _raw_get_book(book_id):
def _raw_get_book(book_idx):
conn = get_db()
row = conn.execute("SELECT * FROM books WHERE book_id = ?;", (book_id,)).fetchone()
row = conn.execute(
"SELECT * FROM books WHERE book_idx = ?;", (book_idx,)
).fetchone()
return dict(row) if row else None
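
A small usage sketch of the book_idx-keyed upsert (values illustrative, assuming BOOKSCRAPER_DB points at a writable path): the second call only overwrites the fields it passes, thanks to the ON CONFLICT(book_idx) DO UPDATE clause.

from db.db import init_db, upsert_book

init_db()  # creates the books table with book_idx as PRIMARY KEY

# First call inserts the row; the second updates only the given fields
# and refreshes last_update via the ON CONFLICT clause.
upsert_book(12345, title="Example Book", book_url="https://example.com/12345.html",
            status="registered")
upsert_book(12345, status="downloading", chapters_total=1200)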

@ -8,6 +8,9 @@
# - Route counters → Redis (live) + SQLite (snapshot)
# - Provide a clean API for tasks and Flask UI
# ============================================================
# ============================================================
# File: db/repository.py (UPDATED for book_idx-only architecture)
# ============================================================
from scraper.logger_decorators import logcall
from logbus.publisher import log
@ -53,43 +56,44 @@ _r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================
# INTERNAL — legacy progress helpers
# INTERNAL — LEGACY PROGRESS HELPERS (kept for UI)
# Keys remain: progress:{book_idx}:*
# ============================================================
def _legacy_set_total(book_id, total):
_r.set(f"progress:{book_id}:total", total)
def _legacy_set_total(book_idx, total):
_r.set(f"progress:{book_idx}:total", total)
def _legacy_inc_completed(book_id):
_r.incr(f"progress:{book_id}:completed")
def _legacy_inc_completed(book_idx):
_r.incr(f"progress:{book_idx}:completed")
def _legacy_inc_skipped(book_id):
_r.incr(f"progress:{book_id}:skipped")
def _legacy_inc_skipped(book_idx):
_r.incr(f"progress:{book_idx}:skipped")
def _legacy_inc_failed(book_id):
_r.incr(f"progress:{book_id}:failed")
def _legacy_inc_failed(book_idx):
_r.incr(f"progress:{book_idx}:failed")
def _legacy_add_failed_chapter(book_id, chapter, reason):
def _legacy_add_failed_chapter(book_idx, chapter, reason):
entry = f"Chapter {chapter}: {reason}"
_r.rpush(f"progress:{book_id}:failed_list", entry)
_r.rpush(f"progress:{book_idx}:failed_list", entry)
def _legacy_get_failed_list(book_id):
return _r.lrange(f"progress:{book_id}:failed_list", 0, -1)
def _legacy_get_failed_list(book_idx):
return _r.lrange(f"progress:{book_idx}:failed_list", 0, -1)
def _legacy_get_progress(book_id):
total = int(_r.get(f"progress:{book_id}:total") or 0)
completed = int(_r.get(f"progress:{book_id}:completed") or 0)
skipped = int(_r.get(f"progress:{book_id}:skipped") or 0)
failed = int(_r.get(f"progress:{book_id}:failed") or 0)
abort = _r.exists(f"abort:{book_id}") == 1
failed_list = _legacy_get_failed_list(book_id)
def _legacy_get_progress(book_idx):
total = int(_r.get(f"progress:{book_idx}:total") or 0)
completed = int(_r.get(f"progress:{book_idx}:completed") or 0)
skipped = int(_r.get(f"progress:{book_idx}:skipped") or 0)
failed = int(_r.get(f"progress:{book_idx}:failed") or 0)
abort = _r.exists(f"abort:{book_idx}") == 1
failed_list = _legacy_get_failed_list(book_idx)
return {
"book_id": book_id,
"book_idx": book_idx,
"total": total,
"completed": completed,
"skipped": skipped,
@ -100,29 +104,29 @@ def _legacy_get_progress(book_id):
# ============================================================
# PUBLIC — UI-ready legacy progress access
# PUBLIC — PROGRESS API
# ============================================================
@logcall
def get_progress(book_id):
return _legacy_get_progress(book_id)
def get_progress(book_idx):
return _legacy_get_progress(book_idx)
@logcall
def add_failed_chapter(book_id, chapter, reason):
_legacy_add_failed_chapter(book_id, chapter, reason)
def add_failed_chapter(book_idx, chapter, reason):
_legacy_add_failed_chapter(book_idx, chapter, reason)
@logcall
def get_failed_list(book_id):
return _legacy_get_failed_list(book_id)
def get_failed_list(book_idx):
return _legacy_get_failed_list(book_idx)
# ============================================================
# FETCH OPERATIONS (SQLite snapshot)
# ============================================================
@logcall
def fetch_book(book_id):
return sql_fetch_book(book_id)
def fetch_book(book_idx):
return sql_fetch_book(book_idx)
@logcall
@ -135,7 +139,7 @@ def fetch_all_books():
# ============================================================
@logcall
def register_book(
book_id,
book_idx,
title,
author=None,
description=None,
@ -145,6 +149,7 @@ def register_book(
):
fields = {
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
@ -154,19 +159,24 @@ def register_book(
"chapters_total": 0,
"status": "registered",
}
log(f"[DB] Registering new book={book_id} title='{title}'")
sql_register_book(book_id, fields)
log(f"[DB] Registering new book_idx={book_idx} title='{title}'")
sql_register_book(book_idx, fields)
# ============================================================
# SCRAPE-FLOW UPDATE
# ============================================================
@logcall
def update_book_after_full_scrape(
book_id,
book_idx,
title=None,
author=None,
description=None,
cover_url=None,
chapters_total=None,
):
fields = {}
if title is not None:
@ -182,8 +192,8 @@ def update_book_after_full_scrape(
fields["status"] = "active"
log(f"[DB] update full scrape metadata book={book_id}")
sql_update_book(book_id, fields)
log(f"[DB] update metadata for book_idx={book_idx}")
sql_update_book(book_idx, fields)
# ============================================================
@ -206,98 +216,82 @@ def get_active_books():
# STATUS MANAGEMENT
# ============================================================
@logcall
def set_status(book_id, status):
log(f"[DB] Setting status for {book_id} to '{status}'")
redis_set_status(book_id, status)
sql_set_status(book_id, status)
def set_status(book_idx, status):
log(f"[DB] Setting status for {book_idx} to '{status}'")
redis_set_status(book_idx, status)
sql_set_status(book_idx, status)
# ============================================================
# CHAPTER TOTALS
# ============================================================
@logcall
def set_chapters_total(book_id, total):
log(f"[DB] Setting chapter total for {book_id} to {total}")
redis_set_chapters_total(book_id, total)
sql_set_chapters_total(book_id, total)
_legacy_set_total(book_id, total) # integrate legacy progress
def set_chapters_total(book_idx, total):
log(f"[DB] Setting chapter total for {book_idx} to {total}")
redis_set_chapters_total(book_idx, total)
sql_set_chapters_total(book_idx, total)
_legacy_set_total(book_idx, total)
# ============================================================
# COUNTERS — DOWNLOAD
# ============================================================
@logcall
def inc_download_done(book_id, amount=1):
log(f"[DB] Incrementing download done for {book_id} by {amount}")
redis_inc_download_done(book_id, amount)
sql_inc_downloaded(book_id, amount)
_legacy_inc_completed(book_id)
def inc_download_done(book_idx, amount=1):
log(f"[DB] Incrementing download done for {book_idx} by {amount}")
redis_inc_download_done(book_idx, amount)
sql_inc_downloaded(book_idx, amount)
_legacy_inc_completed(book_idx)
@logcall
def inc_download_skipped(book_id, amount=1):
log(f"[DB] Incrementing download skipped for {book_id} by {amount}")
redis_inc_download_skipped(book_id, amount)
_legacy_inc_skipped(book_id)
def inc_download_skipped(book_idx, amount=1):
log(f"[DB] Incrementing download skipped for {book_idx} by {amount}")
redis_inc_download_skipped(book_idx, amount)
_legacy_inc_skipped(book_idx)
# ============================================================
# COUNTERS — PARSE
# ============================================================
@logcall
def inc_parsed_done(book_id, amount=1):
log(f"[DB] Incrementing parsed done for {book_id} by {amount}")
redis_inc_parsed_done(book_id, amount)
sql_inc_parsed(book_id, amount)
def inc_parsed_done(book_idx, amount=1):
log(f"[DB] Incrementing parsed done for {book_idx} by {amount}")
redis_inc_parsed_done(book_idx, amount)
sql_inc_parsed(book_idx, amount)
# ============================================================
# COUNTERS — AUDIO
# ============================================================
# ============================================================
# COUNTERS — AUDIO SKIPPED
# ============================================================
@logcall
def inc_audio_skipped(book_id, amount=1):
log(f"[DB] Incrementing audio skipped for {book_id} by {amount}")
# Redis live counter (add this function in state_redis when needed)
sql_inc_audio_skipped(book_id, amount)
redis_inc_audio_skipped(book_id, amount)
# No SQLite column? Then skip.
def inc_audio_skipped(book_idx, amount=1):
log(f"[DB] Incrementing audio skipped for {book_idx} by {amount}")
sql_inc_audio_skipped(book_idx, amount)
redis_inc_audio_skipped(book_idx, amount)
@logcall
def inc_audio_done(book_id, amount=1):
log(f"[DB] Incrementing audio done for {book_id} by {amount}")
redis_inc_audio_done(book_id, amount)
sql_inc_audio_done(book_id, amount)
def inc_audio_done(book_idx, amount=1):
log(f"[DB] Incrementing audio done for {book_idx} by {amount}")
redis_inc_audio_done(book_idx, amount)
sql_inc_audio_done(book_idx, amount)
# ============================================================
# BACKWARDS COMPATIBILITY SHIMS (old task API)
# BACKWARDS COMPATIBILITY SHIMS
# These map the old API (book_id) to the new book_idx-only system
# ============================================================
@logcall
def inc_downloaded(book_id, amount=1):
"""
Old name used by older tasks.
Redirects to new unified counter.
"""
return inc_download_done(book_id, amount)
def inc_downloaded(book_idx, amount=1):
return inc_download_done(book_idx, amount)
@logcall
def inc_parsed(book_id, amount=1):
"""
Old name used by older tasks.
"""
return inc_parsed_done(book_id, amount)
def inc_parsed(book_idx, amount=1):
return inc_parsed_done(book_idx, amount)
@logcall
def inc_audio_done_legacy(book_id, amount=1):
"""
Old audio name used by older tasks.
"""
return inc_audio_done(book_id, amount)
def inc_audio_done_legacy(book_idx, amount=1):
return inc_audio_done(book_idx, amount)
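
Sketch of how a task is expected to use the façade (book_idx illustrative): each call fans out to the Redis state hash, the SQLite snapshot row, and the legacy progress:{book_idx}:* keys that the UI still reads.

from db.repository import set_status, set_chapters_total, inc_download_done, get_progress

book_idx = "12345"  # illustrative

set_status(book_idx, "downloading")   # book:{idx}:state + books.status
set_chapters_total(book_idx, 1200)    # state hash + SQLite snapshot + progress:{idx}:total
inc_download_done(book_idx)           # state hash + SQLite counter + progress:{idx}:completed

print(get_progress(book_idx))         # legacy dict served by the /progress route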

@ -1,5 +1,5 @@
# ============================================================
# File: db/state_redis.py
# File: db/state_redis.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level Redis counters/state for BookScraper.
# Used ONLY by db.repository façade.
@ -15,11 +15,18 @@ REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# INTERNAL KEY BUILDER
# ------------------------------------------------------------
def _key(book_idx: str) -> str:
return f"book:{book_idx}:state"
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def redis_set_status(book_id: str, status: str):
key = f"book:{book_id}:state"
def redis_set_status(book_idx: str, status: str):
key = _key(book_idx)
r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time()))
@ -27,8 +34,8 @@ def redis_set_status(book_id: str, status: str):
# ------------------------------------------------------------
# TOTAL CHAPTERS
# ------------------------------------------------------------
def redis_set_chapters_total(book_id: str, total: int):
key = f"book:{book_id}:state"
def redis_set_chapters_total(book_idx: str, total: int):
key = _key(book_idx)
r.hset(key, "chapters_total", total)
r.hset(key, "last_update", int(time.time()))
@ -36,15 +43,15 @@ def redis_set_chapters_total(book_id: str, total: int):
# ------------------------------------------------------------
# DOWNLOAD COUNTERS
# ------------------------------------------------------------
def redis_inc_download_done(book_id: str, amount: int = 1):
key = f"book:{book_id}:state"
def redis_inc_download_done(book_idx: str, amount: int = 1):
key = _key(book_idx)
r.hincrby(key, "chapters_download_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_id} by {amount}")
key = f"book:{book_id}:state"
def redis_inc_download_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_download_skipped", amount)
r.hset(key, "last_update", int(time.time()))
@ -52,8 +59,8 @@ def redis_inc_download_skipped(book_id: str, amount: int = 1):
# ------------------------------------------------------------
# PARSE COUNTERS
# ------------------------------------------------------------
def redis_inc_parsed_done(book_id: str, amount: int = 1):
key = f"book:{book_id}:state"
def redis_inc_parsed_done(book_idx: str, amount: int = 1):
key = _key(book_idx)
r.hincrby(key, "chapters_parsed_done", amount)
r.hset(key, "last_update", int(time.time()))
@ -61,19 +68,64 @@ def redis_inc_parsed_done(book_id: str, amount: int = 1):
# ------------------------------------------------------------
# AUDIO COUNTERS
# ------------------------------------------------------------
def redis_inc_audio_done(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_id} by {amount}")
key = f"book:{book_id}:state"
def redis_inc_audio_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "audio_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_id} by {amount}")
def redis_inc_audio_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}")
"""
New: Count skipped audio chapters (timeouts, pre-existing files, abort, etc.)
SQL does NOT track this; Redis-only metric.
"""
key = f"book:{book_id}:state"
key = _key(book_idx)
r.hincrby(key, "audio_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# INITIALISE BOOK STATE
# ------------------------------------------------------------
def init_book_state(book_id: str, title: str, url: str, chapters_total: int):
"""
Initialises the complete Redis state for a new book.
NOTE:
- If the key already exists → do NOT reset it (progress is preserved).
- Only missing fields are added.
"""
key = f"book:{book_id}:state"
# Already exists? Then we only fill in missing fields.
exists = r.exists(key)
pipeline = r.pipeline()
# Basic metadata
pipeline.hsetnx(key, "book_id", book_id)
pipeline.hsetnx(key, "title", title or "")
pipeline.hsetnx(key, "url", url or "")
# State
pipeline.hsetnx(key, "status", "registered")
# Counters
pipeline.hsetnx(key, "chapters_total", chapters_total)
pipeline.hsetnx(key, "chapters_download_done", 0)
pipeline.hsetnx(key, "chapters_download_skipped", 0)
pipeline.hsetnx(key, "chapters_parsed_done", 0)
pipeline.hsetnx(key, "audio_done", 0)
pipeline.hsetnx(key, "audio_skipped", 0)
# Timestamp
pipeline.hset(key, "last_update", int(time.time()))
pipeline.execute()
if exists:
log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_id}")
else:
log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_id}")

@ -1,5 +1,5 @@
# ============================================================
# File: db/state_sql.py
# File: db/state_sql.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level SQLite snapshot layer for BookScraper metadata.
# Used ONLY through db.repository façade.
@ -10,7 +10,8 @@ import os
from logbus.publisher import log
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/db/books.db")
# Must match db/db.py
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db")
# ------------------------------------------------------------
@ -25,10 +26,10 @@ def _connect():
# ------------------------------------------------------------
# FETCH
# ------------------------------------------------------------
def sql_fetch_book(book_id):
def sql_fetch_book(book_idx):
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,))
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
row = cur.fetchone()
conn.close()
return dict(row) if row else None
@ -37,7 +38,7 @@ def sql_fetch_book(book_id):
def sql_fetch_all_books():
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books ORDER BY rowid DESC")
cur.execute("SELECT * FROM books ORDER BY created_at DESC")
rows = cur.fetchall()
conn.close()
return [dict(r) for r in rows]
@ -46,22 +47,27 @@ def sql_fetch_all_books():
# ------------------------------------------------------------
# REGISTER / UPDATE
# ------------------------------------------------------------
def sql_register_book(book_id, fields: dict):
def sql_register_book(book_idx, fields: dict):
"""
Insert or replace entire book record.
book_idx is the PRIMARY KEY.
"""
conn = _connect()
cur = conn.cursor()
cols = ", ".join(["book_id"] + list(fields.keys()))
cols = ", ".join(["book_idx"] + list(fields.keys()))
placeholders = ", ".join(["?"] * (1 + len(fields)))
values = [book_id] + list(fields.values())
values = [book_idx] + list(fields.values())
cur.execute(
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", values
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})",
values,
)
conn.commit()
conn.close()
def sql_update_book(book_id, fields: dict):
def sql_update_book(book_idx, fields: dict):
if not fields:
return
@ -69,9 +75,12 @@ def sql_update_book(book_id, fields: dict):
cur = conn.cursor()
set_clause = ", ".join([f"{k} = ?" for k in fields])
params = list(fields.values()) + [book_id]
params = list(fields.values()) + [book_idx]
cur.execute(f"UPDATE books SET {set_clause} WHERE book_id = ?", params)
cur.execute(
f"UPDATE books SET {set_clause} WHERE book_idx = ?",
params,
)
conn.commit()
conn.close()
@ -79,10 +88,13 @@ def sql_update_book(book_id, fields: dict):
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def sql_set_status(book_id, status: str):
def sql_set_status(book_idx, status: str):
conn = _connect()
cur = conn.cursor()
cur.execute("UPDATE books SET status = ? WHERE book_id = ?", (status, book_id))
cur.execute(
"UPDATE books SET status = ? WHERE book_idx = ?",
(status, book_idx),
)
conn.commit()
conn.close()
@ -90,11 +102,12 @@ def sql_set_status(book_id, status: str):
# ------------------------------------------------------------
# CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------
def sql_set_chapters_total(book_id, total: int):
def sql_set_chapters_total(book_idx, total: int):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET chapters_total = ? WHERE book_id = ?", (total, book_id)
"UPDATE books SET chapters_total = ? WHERE book_idx = ?",
(total, book_idx),
)
conn.commit()
conn.close()
@ -103,63 +116,63 @@ def sql_set_chapters_total(book_id, total: int):
# ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------
def sql_inc_downloaded(book_id, amount=1):
def sql_inc_downloaded(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET downloaded = COALESCE(downloaded,0) + ?
WHERE book_id = ?
WHERE book_idx = ?
""",
(amount, book_id),
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_parsed(book_id, amount=1):
def sql_inc_parsed(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET parsed = COALESCE(parsed,0) + ?
WHERE book_id = ?
WHERE book_idx = ?
""",
(amount, book_id),
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_done(book_id, amount=1):
log(f"[DB-SQL] Incrementing audio done for {book_id} by {amount}")
def sql_inc_audio_done(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_done = COALESCE(audio_done,0) + ?
WHERE book_id = ?
WHERE book_idx = ?
""",
(amount, book_id),
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_skipped(book_id, amount=1):
log(f"[DB-SQL] Incrementing audio skipped for {book_id} by {amount}")
def sql_inc_audio_skipped(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_skipped = COALESCE(audio_skipped,0) + ?
WHERE book_id = ?
WHERE book_idx = ?
""",
(amount, book_id),
(amount, book_idx),
)
conn.commit()
conn.close()

@ -2,8 +2,6 @@ import os
import redis
from scraper.logger_decorators import logcall
# GUI log (non-breaking)
from scraper.ui_log import push_ui
# ---------------------------------------------------------
@ -15,55 +13,58 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# Debug mode (optional)
ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1"
# Internal flag to avoid spamming the same message
# Avoid duplicate spam
_seen_debug_keys = set()
# =========================================================
# ABORT FLAG
# INTERNAL DEBUGGING
# =========================================================
def _debug(msg: str):
"""Print + GUI log (non-breaking, minimal noise)."""
print(msg)
push_ui(msg)
def set_abort(book_id: str):
"""Enable abort mode for this book."""
key = f"abort:{book_id}"
# =========================================================
# ABORT FLAG — unified book_idx
# =========================================================
def set_abort(book_idx: str):
"""Enable abort mode for book_idx."""
key = f"abort:{book_idx}"
r.set(key, "1")
if ABORT_DEBUG:
_debug(f"[ABORT] SET {key}")
def clear_abort(book_id: str):
def clear_abort(book_idx: str):
"""Clear abort flag."""
key = f"abort:{book_id}"
key = f"abort:{book_idx}"
r.delete(key)
if ABORT_DEBUG:
_debug(f"[ABORT] CLEAR {key}")
def abort_requested(book_id: str, redis_client=None) -> bool:
def abort_requested(book_idx: str, redis_client=None) -> bool:
"""
Return True if abort flag is set.
Check whether abort flag is active for book_idx.
redis_client:
- Docker workers None use default Redis (r)
- Local macOS audio passes Redis(host=127.0.0.1)
- Local macOS audio worker passes Redis(host=127.0.0.1)
"""
client = redis_client or r
key = f"abort:{book_id}"
key = f"abort:{book_idx}"
try:
exists = client.exists(key)
if ABORT_DEBUG:
# Log once per key
# Log only once per book
if key not in _seen_debug_keys:
try:
conn = client.connection_pool.connection_kwargs
@ -71,54 +72,54 @@ def abort_requested(book_id: str, redis_client=None) -> bool:
port = conn.get("port")
db = conn.get("db")
_debug(
f"[ABORT_DEBUG] first check book_id={book_id} "
f"[ABORT_DEBUG] first check book_idx={book_idx} "
f"redis={host}:{port} db={db}"
)
except Exception:
_debug(f"[ABORT_DEBUG] first check book_id={book_id}")
_debug(f"[ABORT_DEBUG] first check book_idx={book_idx}")
_seen_debug_keys.add(key)
# Only log abort ACTIVE
# Log ACTIVE state
if exists == 1:
_debug(f"[ABORT] ACTIVE for {book_id}")
_debug(f"[ABORT] ACTIVE for {book_idx}")
return exists == 1
except Exception as e:
if ABORT_DEBUG:
_debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}")
return False
# =========================================================
# PER-CHAPTER STATE
# PER-CHAPTER STATE — unified book_idx
# =========================================================
def mark_chapter_started(book_id: str, chapter_num: int):
key = f"started:{book_id}:{chapter_num}"
def mark_chapter_started(book_idx: str, chapter_num: int):
key = f"started:{book_idx}:{chapter_num}"
r.set(key, "1")
def chapter_started(book_id: str, chapter_num: int) -> bool:
key = f"started:{book_id}:{chapter_num}"
def chapter_started(book_idx: str, chapter_num: int) -> bool:
key = f"started:{book_idx}:{chapter_num}"
return r.exists(key) == 1
# =========================================================
# UTILITY: RESET FOR A BOOK
# RESET STATE FOR BOOK_IDX
# =========================================================
def reset_book_state(book_id: str):
def reset_book_state(book_idx: str):
"""
Remove abort flag and all chapter-start markers.
Remove abort flag and all per-chapter started markers.
"""
key = f"abort:{book_id}"
r.delete(key)
# abort flag
r.delete(f"abort:{book_idx}")
pattern = f"started:{book_id}:*"
# chapter markers
pattern = f"started:{book_idx}:*"
for k in r.scan_iter(pattern):
r.delete(k)
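
Sketch of how a worker loop is expected to consume these helpers (book_idx and chapter numbers illustrative): the /abort/<book_idx> route sets abort:{book_idx}, and each pipeline step polls it before doing work.

from scraper.abort import clear_abort, abort_requested, mark_chapter_started

book_idx = "12345"        # illustrative
chapters = range(1, 11)   # illustrative chapter numbers

clear_abort(book_idx)
for num in chapters:
    if abort_requested(book_idx):
        break  # the /abort route set abort:{book_idx}; stop cleanly
    mark_chapter_started(book_idx, num)
    # ... download / parse / save the chapter here ...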

@ -4,10 +4,9 @@
# Backwards-compatible wrapper giving the SAME public API
# as the old BookScraper, but internally uses ScrapeEngine.
#
# execute() → full metadata + chapterlist
# execute() → full metadata + chapterlist (NO book_idx creation)
#
# (* Chapter downloading will come later in ScrapeEngine,
# but this wrapper does NOT need to change.)
# ID management is now handled exclusively by InitService.
# ============================================================
from scraper.logger_decorators import logcall
@ -18,21 +17,15 @@ class BookScraper:
"""
Backwards-compatible BookScraper façade.
In the old system BookScraper did EVERYTHING:
- fetch metadata
- fetch the cover
- chapter list
- download chapters
- volume folders
- skip logic
Old responsibilities (metadata, chapters, covers, downloads)
are now split:
In the new system that work has been split up:
ScrapeEngine metadata + chapterlist
Download tasks handle download/parse/save
InitService determines book_idx (single source of truth)
ScrapeEngine metadata / chapterlist / download engine (in ontwikkeling)
BookScraper behoudt dezelfde API als voorheen
Daardoor kunnen Celery-tasks en oudere modules blijven werken
zonder refactor-chaos.
This wrapper intentionally does NOT generate a book_idx or book_id.
It only returns metadata/chapters in legacy-compatible dict format.
"""
@logcall
@ -43,18 +36,14 @@ class BookScraper:
@logcall
def execute(self):
"""
Public legacy API.
Returns metadata + chapters EXACTLY like the old BookScraper
did before the download phase.
This matters:
- the INIT flow uses metadata only
- scraping tasks use the chapter list
Legacy public API:
Return metadata + chapter list EXACTLY as before,
but without generating any book_id.
"""
data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)
# Fully replicate the legacy output structure:
# Legacy structure preserved, unchanged:
return {
"title": data.get("title"),
"author": data.get("author"),
@ -62,5 +51,5 @@ class BookScraper:
"cover_url": data.get("cover_url"),
"chapters": data.get("chapters", []),
"chapters_total": data.get("chapters_total", 0),
"book_url": data.get("book_url"),
"book_url": data.get("book_url"), # used later by parse/save tasks
}
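
Legacy-style usage of the wrapper, with the module path and constructor arguments assumed from context (they are not shown in this hunk): execute() returns the old dict shape but never invents an ID.

from scraper.book_scraper import BookScraper  # module path assumed

scraper = BookScraper("https://www.piaotia.com/html/12/12345.html")  # constructor args assumed
result = scraper.execute()
print(result["title"], result["chapters_total"], len(result["chapters"]))
# book_idx is derived separately by InitService, never by this wrapper.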

@ -1,55 +1,54 @@
# =========================================================
# File: scraper/download_controller.py
# Purpose:
# Build Celery pipelines for all chapters
# and pass book_id for abort/progress/log functionality.
# + Download and replicate cover image to all volume folders
# + Generate scripts (allinone.txt, makebook, say)
# + Initialize Redis Book State Model (status + counters)
# Build Celery pipelines for all chapters using book_idx
# Handles:
# • volume assignment
# • cover download + replication
# • script generation
# • Redis Book State Model init
# • abort tracking
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from scraper.scriptgen import generate_all_scripts
# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen
from logbus.publisher import log
import os
import requests
import shutil
from scraper.abort import abort_requested # DEBUG allowed
from scraper.abort import abort_requested
from db.state_redis import init_book_state
from db.repository import set_status, set_chapters_total
class DownloadController:
"""
Coordinates all chapter pipelines (download parse save),
including:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
- cover download + volume replication
- script generation (allinone.txt, makebook, say)
- Redis book state initialisation and status updates
Coordinates all chapter pipelines (download parse save).
"""
def __init__(self, book_id: str, scrape_result: dict):
self.book_id = book_id
def __init__(self, book_idx: str, scrape_result: dict):
self.book_idx = str(book_idx)
self.scrape_result = scrape_result
# Core metadata
# Metadata
self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url")
# Output base dir
# Output folder
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
# Volume size
self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
# Base folder for the whole book
self.book_base = os.path.join(root, self.title)
os.makedirs(self.book_base, exist_ok=True)
# Meta passed to parse/save stage
# Meta passed downstream
self.meta = {
"title": self.title,
"author": scrape_result.get("author"),
@ -57,200 +56,120 @@ class DownloadController:
"book_url": scrape_result.get("book_url"),
}
# -------------------------------------------------
# DEBUG — confirm the controller sees the correct book_id
# -------------------------------------------------
log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")
try:
abort_state = abort_requested(book_id)
log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")
# -------------------------------------------------
# NEW: Initialize Redis Book State Model
# -------------------------------------------------
# Init Redis Book State Model
try:
init_book_state(
book_id=self.book_id,
book_id=self.book_idx,
title=self.title,
url=self.scrape_result.get("book_url"),
url=self.meta["book_url"],
chapters_total=len(self.chapters),
)
log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
except Exception as e:
log(f"[CTRL_STATE] init_book_state FAILED: {e}")
# ---------------------------------------------------------
# Cover Download
# ---------------------------------------------------------
def download_cover(self):
"""Download one cover image into the root of the book folder."""
if not self.cover_url:
log(f"[CTRL] No cover URL found for '{self.title}'")
return
return log(f"[CTRL] No cover URL for '{self.title}'")
cover_path = os.path.join(self.book_base, "cover.jpg")
headers = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
),
"Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
"User-Agent": "Mozilla/5.0",
"Referer": self.scrape_result.get("book_url") or "",
}
try:
log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status()
with open(cover_path, "wb") as f:
f.write(resp.content)
log(f"[CTRL] Cover saved to: {cover_path}")
log(f"[CTRL] Cover saved: {cover_path}")
except Exception as e:
log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
log(f"[CTRL] Cover download failed: {e}")
# ---------------------------------------------------------
# Cover Replication to Volumes
# ---------------------------------------------------------
def replicate_cover_to_volumes(self):
"""Copy cover.jpg into each existing Volume_xxx directory."""
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
log("[CTRL] No cover.jpg found, replication skipped")
return
try:
for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"):
vol_dir = os.path.join(self.book_base, entry)
dst = os.path.join(vol_dir, "cover.jpg")
dst = os.path.join(self.book_base, entry, "cover.jpg")
try:
shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated into: {dst}")
log(f"[CTRL] Cover replicated → {dst}")
except Exception as e:
log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
def store_cover_in_static(self):
"""
Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
This allows the Flask web UI to serve the cover directly.
"""
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
log("[CTRL] No cover.jpg found, cannot store in static/covers")
return
# static/covers/<book_id>.jpg
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst = os.path.join(static_dir, f"{self.book_id}.jpg")
os.makedirs("static/covers", exist_ok=True)
dst = os.path.join("static/covers", f"{self.book_idx}.jpg")
try:
shutil.copyfile(src, dst)
log(f"[CTRL] Cover stored for UI: {dst}")
except Exception as e:
log(f"[CTRL] Failed to store cover in static: {e}")
log(f"[CTRL] Failed storing cover: {e}")
# ---------------------------------------------------------
# Volume isolation
# ---------------------------------------------------------
def get_volume_path(self, chapter_num: int) -> str:
"""Returns the correct volume directory for a chapter."""
vol_index = (chapter_num - 1) // self.max_vol + 1
vol_name = f"Volume_{vol_index:03d}"
vol_path = os.path.join(self.book_base, vol_name)
os.makedirs(vol_path, exist_ok=True)
return vol_path
# ---------------------------------------------------------
# Pipeline launcher
# ---------------------------------------------------------
def start(self):
total = len(self.chapters)
log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")
log(
f"[CTRL] Initialising pipeline for '{self.title}' "
f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
)
log(f"[CTRL] Output root: {self.book_base}")
# -------------------------------------
# NEW: Redis state update
# -------------------------------------
# Update Redis/SQLite state
try:
set_status(self.book_id, "downloading")
set_chapter_total(self.book_id, total)
log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
set_status(self.book_idx, "downloading")
set_chapters_total(self.book_idx, total)
except Exception as e:
log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")
log(f"[CTRL_STATE] Unable to set state: {e}")
# -------------------------------------
# 1) Download cover
# -------------------------------------
# Download cover
self.download_cover()
# Build pipeline tasks
tasks = []
for ch in self.chapters:
# Build chapter_dict (NEW)
chapter_num = ch["num"]
chapter_url = ch["url"]
chapter_title = ch.get("title")
volume_path = self.get_volume_path(chapter_num)
chapter_dict = {
"num": chapter_num,
"url": chapter_url,
"title": chapter_title,
"volume_path": volume_path,
num = ch["num"]
chapter_info = {
"num": num,
"url": ch["url"],
"title": ch.get("title"),
"volume_path": self.get_volume_path(num),
}
# Dispatch pipeline with chapter_dict
tasks.append(
build_chapter_pipeline(
self.book_id,
chapter_dict,
self.meta,
)
)
tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))
async_result = group(tasks).apply_async()
log(
f"[CTRL] Pipelines dispatched for '{self.title}' "
f"(book_id={self.book_id}, group_id={async_result.id})"
)
# Debug abort state
try:
abort_state = abort_requested(self.book_id)
log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")
# -------------------------------------------------------
# Replicate cover + place in static
self.replicate_cover_to_volumes()
self.store_cover_in_static()
# -------------------------------------------------------
# Generate scripts (LATE IMPORT to avoid circular)
try:
generate_all_scripts(
self.book_base,
self.title,
self.meta.get("author"),
scriptgen.generate_all_scripts(
self.book_base, self.title, self.meta["author"]
)
log(f"[CTRL] Scripts generated for '{self.title}'")
log("[CTRL] Scripts generated")
except Exception as e:
log(f"[CTRL] Script generation failed: {e}")

@ -54,6 +54,8 @@ Copyright=
章节出错=
点此举报=
举报原因=
求收藏=
推荐票=
www.piaotia.com=
www.piaotian.com=
www.=

@ -4,7 +4,7 @@
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_id
# - derive book_idx
# - register in SQLite
# - store main cover
# ============================================================
@ -21,33 +21,47 @@ from scraper.logger_decorators import logcall
class InitService:
# ------------------------------------------------------------
# BOOK IDX DERIVATION
# ------------------------------------------------------------
@staticmethod
@logcall
def derive_book_id(url: str) -> str:
"""
PTWXZ URL format ends with /{id}.html.
If no match fallback to sanitized URL.
Returns:
book_idx (string)
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
return url.replace("/", "_")
# Fallback — ensures deterministic ID for unknown formats
return url.replace("/", "_").replace(":", "_")
# ------------------------------------------------------------
# MAIN INIT FLOW
# ------------------------------------------------------------
@staticmethod
@logcall
def execute(url: str) -> dict:
"""
Main INIT-flow entry point.
Returns complete metadata + registration info.
INIT entry point.
Returns complete metadata + registration result.
"""
# 1) Determine site
# 1) Resolve site handler
site = SiteResolver.resolve(url)
book_id = InitService.derive_book_id(url)
# 2) Create unified book_idx
book_idx = InitService.derive_book_id(url)
# Some site objects historically expect .book_id — we support it but DO NOT rely on it.
site.book_id = book_idx
site.book_id = book_id
# 2) Metadata only
# 3) Fetch initial metadata (title/author/description/cover)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
@ -55,27 +69,27 @@ class InitService:
description = meta.get("description")
cover_url = meta.get("cover_url")
# 4) Download UI cover (NEW: capture returned local path)
cover_path = CoverService.download_main_cover(cover_url, book_id)
# 4) Download & store main cover for UI
cover_path = CoverService.download_main_cover(cover_url, book_idx)
# 5) SQLite registration INCLUDING cover_path ← ★ FIX
# 5) Register in SQLite (book_idx is the SOLE primary ID)
register_book(
book_id=book_id,
book_idx=book_idx,
title=title,
author=author,
description=description,
cover_url=cover_url,
cover_path=cover_path,  # ← ★ IMPORTANT
cover_path=cover_path,
book_url=url,
)
# 6) Output for UI
# 6) Return metadata for UI / API
return {
"book_id": book_id,
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path, # ← handig voor UI
"cover_path": cover_path,
"status": "registered",
}
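
Worked example of derive_book_id(), with the module path and URLs assumed for illustration: a PTWXZ-style URL ending in /<digits>.html yields the digits, anything else falls back to a sanitised copy of the URL.

from scraper.services.init_service import InitService  # module path assumed

print(InitService.derive_book_id("https://www.ptwxz.com/bookinfo/12/12345.html"))
# -> "12345"

print(InitService.derive_book_id("https://example.com/books/foo"))
# -> "https___example.com_books_foo"  (fallback: "/" and ":" replaced by "_")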

@ -1,8 +1,8 @@
# ============================================================
# File: scraper/services/scrape_engine.py
# File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose:
# Unified scraping engine for INIT-flow and Celery tasks.
# All functions are fully logged via @logcall.
# ScrapeEngine does NOT determine book_idx itself.
# ============================================================
import os
@ -23,6 +23,10 @@ class ScrapeEngine:
Central scraping engine.
Metadata + chapterlist scraping.
All methods logged with @logcall.
IMPORTANT:
- ScrapeEngine NEVER decides book_idx.
- No dependency on InitService (prevents circular import).
"""
# ------------------------------------------------------------
@ -140,26 +144,23 @@ class ScrapeEngine:
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSER
# COVER PARSER (NO InitService dependency)
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_cover(soup, site):
"""
Find the cover via book_id substring matching:
- take the book_id from site.url
- look for IMG tags whose filename contains the book_id
- pick the shortest filename as the best match
Extract book index from URL heuristically instead of InitService
(prevents circular import).
"""
# Typical Chinese novel sites embed numeric ID in URL path
try:
parsed = urlparse(site.url)
m = re.search(r"/(\d+)\.html$", parsed.path)
if m:
book_id = m.group(1)
else:
book_id = parsed.path.rstrip("/").split("/")[-1]
digits = re.findall(r"\d+", parsed.path)
book_idx = digits[-1] if digits else None
except Exception:
return None
book_idx = None
imgs = soup.find_all("img", src=True)
candidates = []
@ -167,16 +168,14 @@ class ScrapeEngine:
for img in imgs:
src = img["src"].strip()
filename = os.path.basename(src)
if book_id in filename:
if book_idx and book_idx in filename:
candidates.append((filename, src))
if not candidates:
return None
candidates.sort(key=lambda t: len(t[0]))  # shortest filename wins
best_src = candidates[0][1]
return urljoin(site.root, best_src)
candidates.sort(key=lambda t: len(t[0])) # smallest filename
return urljoin(site.root, candidates[0][1])
# ------------------------------------------------------------
# RESOLVE CHAPTER PAGE
@ -233,7 +232,7 @@ class ScrapeEngine:
def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url  # NEEDED for cover parsing
site.url = url # needed for cover parsing
return {
"title": ScrapeEngine._parse_title(soup),

@ -1,109 +1,167 @@
# ============================================================
# File: scraper/tasks/controller_tasks.py
# Purpose:
# Start the download → parse → save pipeline for a scraped book,
# including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
# FULL scrape entrypoint + launching download/parse/save pipelines.
# NO result.get() anywhere. Scraping is done inline.
# ============================================================
from celery_app import celery_app
from logbus.publisher import log
from scraper.download_controller import DownloadController
import os
import time
import redis
from urllib.parse import urlparse
from scraper.logger_decorators import logcall
import redis
import os
from scraper.abort import abort_requested
from db.repository import set_chapters_total
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.site_resolver import SiteResolver
from db.repository import fetch_book, set_chapters_total
from scraper.download_controller import DownloadController
print(">>> [IMPORT] controller_tasks.py loaded")
@celery_app.task(bind=True, queue="controller", ignore_result=False)
# =============================================================
# 1) PUBLIC ENTRYPOINT — CALLED FROM /start
# =============================================================
@celery_app.task(
bind=True,
queue="controller",
ignore_result=False,
name="scraper.tasks.controller_tasks.start_full_scrape",
)
@logcall
def launch_downloads(self, book_id: str, scrape_result: dict):
def start_full_scrape(self, book_idx: str):
"""
FULL SCRAPE ENTRYPOINT.
Scraping is done inline no Celery .get() needed.
"""
Launch the entire pipeline (download parse save),
AND initialize progress counters.
Chapter-level progress is updated INSIDE the download/parse/save tasks.
This task MUST NOT call .get() on async subtasks (Celery restriction).
log(f"[CTRL] start_full_scrape(book_idx={book_idx})")
# Abort before doing anything
if abort_requested(book_idx):
log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
# --------------------------------------------------------
# 1) Load book metadata from SQLite
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
msg = f"[CTRL] Book '{book_idx}' not found in DB"
log(msg)
raise ValueError(msg)
url = book.get("book_url")
if not url:
msg = f"[CTRL] No book_url stored for {book_idx}"
log(msg)
raise ValueError(msg)
# --------------------------------------------------------
# 2) INLINE SCRAPE (fast, no Celery wait)
# --------------------------------------------------------
site = SiteResolver.resolve(url)
try:
scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
except Exception as e:
log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
raise
# --------------------------------------------------------
# 3) Continue → dispatch pipelines
# --------------------------------------------------------
return launch_downloads(book_idx, scrape_result)
# =============================================================
# 2) PIPELINE DISPATCH (NOT a Celery task)
# =============================================================
@logcall
def launch_downloads(book_idx: str, scrape_result: dict):
"""
Launches the entire processing pipeline:
- initialize Redis UI state
- initialize SQLite totals
- dispatch per-chapter pipelines via DownloadController
"""
title = scrape_result.get("title", "UnknownBook")
chapters = scrape_result.get("chapters", []) or []
total = len(chapters)
# ------------------------------------------------------------
# INIT BOOK STATE MODEL (required for Active Books dashboard)
# INIT REDIS STATE
# ------------------------------------------------------------
broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
parsed = urlparse(broker_url)
state = redis.Redis(
r = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/")),
decode_responses=True,
)
# Book metadata
state.set(f"book:{book_id}:title", title)
state.set(f"book:{book_id}:status", "starting")
base = f"book:{book_idx}:state"
# Download counters
state.set(f"book:{book_id}:download:total", total)
state.set(f"book:{book_id}:download:done", 0)
# Audio counters (start at zero)
state.set(f"book:{book_id}:audio:done", 0)
r.hset(base, "title", title)
r.hset(base, "status", "starting")
r.hset(base, "chapters_total", total)
r.hset(base, "chapters_download_done", 0)
r.hset(base, "chapters_download_skipped", 0)
r.hset(base, "chapters_parsed_done", 0)
r.hset(base, "audio_done", 0)
r.hset(base, "audio_skipped", 0)
r.hset(base, "last_update", int(time.time()))
# ------------------------------------------------------------
# INIT PROGRESS
# INIT SQLITE SNAPSHOT
# ------------------------------------------------------------
set_chapters_total(book_id, total)
try:
set_chapters_total(book_idx, total)
except Exception as e:
log(f"[CTRL] ERROR updating SQLite totals: {e}")
raise
log(f"[CTRL] Progress initialized for {book_id}: total={total}")
log(f"[CTRL] Initialized totals for {book_idx}: {total}")
# ------------------------------------------------------------
# BUILD CONTROLLER
# ABORT CHECK BEFORE LAUNCHING JOBS
# ------------------------------------------------------------
ctl = DownloadController(book_id, scrape_result)
if abort_requested(book_idx):
log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
r.hset(base, "status", "aborted")
return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}
# ------------------------------------------------------------
# START PIPELINES (ASYNC)
# Returns a celery group AsyncResult. We DO NOT iterate or get().
# Progress & failures are handled by the worker subtasks.
# BUILD + DISPATCH PER-CHAPTER PIPELINES
# ------------------------------------------------------------
try:
group_result = ctl.start()
log(
f"[CTRL] Pipelines dispatched for '{title}' "
f"(book_id={book_id}, group_id={group_result.id})"
)
controller = DownloadController(book_idx, scrape_result)
# Abort flag set BEFORE tasks start?
if abort_requested(book_id):
log(f"[CTRL] ABORT requested before tasks start")
return {"book_id": book_id, "aborted": True}
except Exception as exc:
log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
try:
group_result = controller.start()
gid = getattr(group_result, "id", None)
log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
except Exception as e:
log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}")
raise
# ------------------------------------------------------------
# CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
# (Download/parse/save tasks update progress themselves)
# ------------------------------------------------------------
log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
# Update UI state to "downloading"
r.hset(base, "status", "downloading")
r.hset(base, "last_update", int(time.time()))
return {
"book_id": book_id,
"book_idx": book_idx,
"total": total,
"started": True,
"group_id": group_result.id,
"group_id": gid,
}
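For reference, a minimal sketch (not part of this diff) of how a status reader could get the unified book:{idx}:state hash back out of Redis. The field names follow the hset calls above; the helper name and the default broker URL are assumptions.

import os
import redis

def read_book_state(book_idx: str) -> dict:
    # Hypothetical helper: mirrors the fields written in launch_downloads().
    r = redis.Redis.from_url(
        os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
        decode_responses=True,
    )
    state = r.hgetall(f"book:{book_idx}:state") or {}
    return {
        "status": state.get("status", "unknown"),
        "download_done": int(state.get("chapters_download_done", 0)),
        "download_total": int(state.get("chapters_total", 0)),
        "audio_done": int(state.get("audio_done", 0)),
    }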

@ -1,12 +1,15 @@
# ============================================================
# File: scraper/tasks/download_tasks.py
# Purpose:
# Download chapter HTML into payload["html"].
# Updated for book_idx unified ID model.
# ============================================================
from celery_app import celery_app
from scraper.utils.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started
# Repository façade — correct imports only
# Unified repository façade
from db.repository import (
set_status,
inc_download_done,
@ -30,9 +33,9 @@ print(">>> [IMPORT] download_tasks.py loaded")
# -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER
# -----------------------------------------------------------
def log_msg(book_id: str, message: str):
def log_msg(book_idx: str, message: str):
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
full = f"{ts} [{book_id}] {message}"
full = f"{ts} [{book_idx}] {message}"
log(full)
push_ui(full)
@ -85,19 +88,27 @@ def release_global_slot():
# ============================================================
# CELERY TASK — Unified payload v3
# CELERY TASK — Payload v3 (book_idx model)
# ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False)
@logcall
def download_chapter(self, payload: dict):
"""
Payload:
Payload format:
{
"book_id": str,
"chapter": { "num", "url", "title", "volume_path" },
"book_idx": str,
"chapter": {
"num": int,
"title": str,
"url": str,
"volume_path": str
},
"book_meta": dict,
# fields filled during pipeline:
"html": None | str,
"parsed": None | dict,
"parsed": None | str,
"skipped": bool,
"path": None | str
}
@ -106,7 +117,7 @@ def download_chapter(self, payload: dict):
if not payload:
raise ValueError("download_chapter received empty payload")
book_id = payload["book_id"]
book_idx = payload["book_idx"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
@ -115,44 +126,55 @@ def download_chapter(self, payload: dict):
chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
volume_path = chapter["volume_path"]
# STATUS UPDATE
set_status(book_id, "downloading")
# -----------------------------------------------------------
# STATUS UPDATE (book is now in 'downloading')
# -----------------------------------------------------------
set_status(book_idx, "downloading")
# ABORT CHECK
if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
log_msg(book_id, f"[ABORT] Skip chapter {chapter_num}")
# -----------------------------------------------------------
# ABORT CHECK (skip if not yet started)
# -----------------------------------------------------------
if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}")
inc_download_skipped(book_id)
inc_download_skipped(book_idx)
payload["html"] = None
payload["skipped"] = True
payload["path"] = None
return payload
mark_chapter_started(book_id, chapter_num)
mark_chapter_started(book_idx, chapter_num)
# SKIP IF FILE ALREADY SAVED
# -----------------------------------------------------------
# SKIP IF FILE ALREADY EXISTS
# -----------------------------------------------------------
save_path = get_save_path(chapter_num, volume_path)
if os.path.exists(save_path):
log_msg(book_id, f"[DL] SKIP {chapter_num}{save_path}")
log_msg(book_idx, f"[DL] SKIP {chapter_num}{save_path}")
inc_download_skipped(book_id)
inc_download_skipped(book_idx)
payload["html"] = None
payload["skipped"] = True
payload["path"] = save_path
return payload
# GLOBAL DELAY + SLOT
# -----------------------------------------------------------
# GLOBAL DELAY + CONCURRENCY
# -----------------------------------------------------------
if GLOBAL_DELAY > 0:
time.sleep(GLOBAL_DELAY)
wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY)
# -----------------------------------------------------------
# HTTP DOWNLOAD
# -----------------------------------------------------------
try:
log_msg(book_id, f"[DL] Downloading {chapter_num} ({chapter_title})")
log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})")
resp = requests.get(
chapter_url,
@ -164,11 +186,10 @@ def download_chapter(self, payload: dict):
resp.encoding = resp.apparent_encoding or "gb2312"
html = resp.text
log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes")
inc_download_done(book_id)
inc_download_done(book_idx)
# --- attach results ---
payload["html"] = html
payload["skipped"] = False
payload["path"] = save_path
@ -178,13 +199,15 @@ def download_chapter(self, payload: dict):
attempt = self.request.retries
delay = BASE_DELAY * (BACKOFF**attempt)
# Handle 429
if getattr(getattr(exc, "response", None), "status_code", None) == 429:
log_msg(book_id, f"[DL] 429 → WAIT {DELAY_429}s")
log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s")
time.sleep(DELAY_429)
set_global_delay()
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
log_msg(book_id, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s")
# General retry with backoff
log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s")
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
finally:

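For illustration, a payload in the v3 shape documented above, as download_chapter would receive it. All values below are placeholders, not data from the repository.

example_payload = {
    "book_idx": "example-book",                     # placeholder book_idx
    "chapter": {
        "num": 1,
        "title": "Chapter 1",
        "url": "https://example.com/book/1.html",   # placeholder URL
        "volume_path": "/data/books/example/vol1",  # placeholder path
    },
    "book_meta": {"book_url": "https://example.com/book/"},
    "html": None,      # filled by download_chapter
    "parsed": None,    # filled by parse_chapter
    "skipped": False,
    "path": None,      # filled by save_chapter
}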
@ -2,7 +2,7 @@
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3.
# Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================
from celery_app import celery_app
@ -14,11 +14,11 @@ from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done
print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)")
print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original)
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
h1 = soup.find("h1")
@ -44,39 +44,32 @@ def extract_piaotia_content(soup):
if hasattr(sib, "get_text"):
text = sib.get_text(strip=True)
# STOP CONDITIONS
# <!-- 翻页 -->
# Stop conditions
if isinstance(sib, Comment) and ("翻页" in sib):
break
# explicit footer blocks
if name == "div":
sid = sib.get("id", "")
cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break
# copyright block
if text and ("重要声明" in text or "Copyright" in text):
break
# navigation blocks
if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break
if name in ("script", "style"):
continue
if name == "center":
continue
# ACCUMULATE
# Accumulate
if isinstance(sib, NavigableString):
s = sib.strip()
if s:
parts.append(s)
elif hasattr(sib, "get_text"):
t = sib.get_text(separator="\n").strip()
if t:
@ -86,7 +79,7 @@ def extract_piaotia_content(soup):
# ============================================================
# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT)
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
@ -95,7 +88,8 @@ def parse_chapter(self, payload: dict):
if not payload:
return {"skipped": True, "reason": "empty_payload"}
book_id = payload["book_id"]
# NEW MODEL
book_idx = payload["book_idx"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
@ -103,24 +97,26 @@ def parse_chapter(self, payload: dict):
title = chapter.get("title") or f"Chapter {num}"
html = payload.get("html")
# SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------
# DOWNLOAD SKIPPED → PARSE SKIP
# ------------------------------------------------------------
if payload.get("skipped"):
log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)")
log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
return payload
if not html:
log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP")
log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
payload["parsed"] = None
payload["skipped"] = True
return payload
log_msg(book_id, f"[PARSE] Parsing chapter {num}")
log_msg(book_idx, f"[PARSE] Parsing chapter {num}")
soup = BeautifulSoup(html, "lxml")
# ============================================================
# ------------------------------------------------------------
# STRICT SELECTORS
# ============================================================
# ------------------------------------------------------------
selectors = [
"#content",
"div#content",
@ -142,7 +138,7 @@ def parse_chapter(self, payload: dict):
raw = None
# --- STRICT SELECTOR FAILED → Piaotia extractor ---
# strict selectors failed → piaotia extractor
if node is None:
raw = extract_piaotia_content(soup)
else:
@ -154,37 +150,38 @@ def parse_chapter(self, payload: dict):
tag.decompose()
raw = soup.get_text(separator="\n")
# ============================================================
# MULTIPASS CLEANING via replacement files
# ============================================================
# ------------------------------------------------------------
# MULTIPASS CLEANING VIA replacement-block files
# ------------------------------------------------------------
REPL = load_all_replacements()
text = raw
for _ in range(5):
text = clean_text(text, REPL)
# ============================================================
# ------------------------------------------------------------
# Collapse double blank lines
# ============================================================
# ------------------------------------------------------------
cleaned = []
prev_blank = False
for line in text.split("\n"):
stripped = line.rstrip()
if stripped == "":
s = line.rstrip()
if s == "":
if prev_blank:
continue
prev_blank = True
cleaned.append("")
else:
prev_blank = False
cleaned.append(stripped)
cleaned.append(s)
text = "\n".join(cleaned)
text = f"{title}\n{text}"
# ============================================================
# Add header to chapter 1
# ============================================================
# ------------------------------------------------------------
# Header on chapter 1
# ------------------------------------------------------------
if num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN"
header = (
@ -195,14 +192,14 @@ def parse_chapter(self, payload: dict):
)
text = header + text
log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars")
log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars")
# ============================================================
# PAYLOAD OUTPUT (v3)
# ============================================================
# ------------------------------------------------------------
# OUTPUT PAYLOAD
# ------------------------------------------------------------
payload["parsed"] = text
payload["skipped"] = False
inc_parsed_done(book_id)
inc_parsed_done(book_idx)
return payload
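A toy illustration (not project code) of why the parse task runs several cleaning passes: a replacement applied in one pass can expose a match that only the next pass catches. clean_text() and the replacement-block files are not reproduced here; a plain dict and str.replace stand in for them.

def apply_once(text: str, repl: dict) -> str:
    # Stand-in for clean_text(): apply every replacement once.
    for old, new in repl.items():
        text = text.replace(old, new)
    return text

repl = {"<<ad>>": "", "\n\n\n": "\n\n"}   # hypothetical replacement table
text = "line one\n<<ad>>\n\n\nline two"
for _ in range(5):                        # same bounded multipass as above
    text = apply_once(text, repl)
# Pass 1 removes the ad block; pass 2 collapses the blank run it left behind.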

@ -7,6 +7,10 @@
# download_chapter(payload)
# → parse_chapter(payload)
# → save_chapter(payload)
#
# NOTE:
# - book_idx is the single authoritative key for all tasks
# - payload travels unchanged through the entire pipeline
# =========================================================
from celery import chain
@ -19,18 +23,23 @@ from scraper.logger_decorators import logcall
@logcall
def build_chapter_pipeline(book_id: str, chapter_dict: dict, book_meta: dict):
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
"""
Payload model passed through entire pipeline.
Create a payload object passed through the pipeline.
Consistent with the chapter_dict-based task signature.
"""
payload = {
"book_id": book_id,
"book_idx": book_idx,
"chapter": chapter_dict,
"book_meta": book_meta,
# Will be filled by download_chapter
"html": None,
# Will be filled by parse_chapter
"parsed": None,
# Set by download or parse on skip/404/etc
"skipped": False,
# Final path written by save_chapter
"path": None,
}

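A rough sketch of the per-chapter chain this builder feeds. The actual dispatch lives in DownloadController, so the wiring below is an assumption based on the download → parse → save order named in the header; the function name is hypothetical.

from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter

def dispatch_chapter(payload: dict):
    # Each task returns the (mutated) payload, which Celery passes
    # as the first argument of the next task in the chain.
    return chain(
        download_chapter.s(payload),
        parse_chapter.s(),
        save_chapter.s(),
    ).apply_async()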
@ -1,5 +1,5 @@
# ============================================================
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC)
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx)
# ============================================================
print(">>> [IMPORT] save_tasks.py loaded")
@ -24,7 +24,9 @@ def save_chapter(self, payload: dict):
log("[SAVE] ERROR: payload is None")
return {"error": True}
book_id = payload["book_id"]
# NEW unified ID
book_idx = payload["book_idx"]
chapter = payload["chapter"]
parsed = payload.get("parsed")
path = payload.get("path")
@ -36,20 +38,19 @@ def save_chapter(self, payload: dict):
volume_name = os.path.basename(volume.rstrip("/"))
# ============================================================
# SKIPPED CASE (restore old behavior)
# SKIPPED CASE (old behavior restored)
# ============================================================
if skipped or not parsed:
log_msg(book_id, f"[SAVE] SKIP chapter {num}")
inc_download_skipped(book_id)
log_msg(book_idx, f"[SAVE] SKIP chapter {num}")
inc_download_skipped(book_idx)
# Restore old behavior:
# If file already exists, STILL trigger audio.
# OLD behavior: even skipped chapters still queue audio
if path and os.path.exists(path):
log_msg(book_id, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
try:
generate_audio.delay(book_id, volume_name, num, title, path)
generate_audio.delay(book_idx, volume_name, num, title, path)
except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing skipped audio: {exc}")
log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}")
return payload
@ -63,21 +64,21 @@ def save_chapter(self, payload: dict):
with open(save_path, "w", encoding="utf-8") as f:
f.write(parsed)
log_msg(book_id, f"[SAVE] Saved chapter {num}{save_path}")
log_msg(book_idx, f"[SAVE] Saved chapter {num}{save_path}")
inc_download_done(book_id)
inc_download_done(book_idx)
# Restore old behavior → ALWAYS queue audio
# OLD behavior: ALWAYS queue audio
try:
generate_audio.delay(book_id, volume_name, num, title, save_path)
log_msg(book_id, f"[AUDIO] Task queued for chapter {num}")
generate_audio.delay(book_idx, volume_name, num, title, save_path)
log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}")
except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
payload["path"] = save_path
payload["skipped"] = False
return payload
except Exception as exc:
log_msg(book_id, f"[SAVE] ERROR saving chapter {num}: {exc}")
log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}")
raise

@ -1,7 +1,9 @@
# ============================================================
# File: scraper/tasks/scraping.py
# Purpose: Scrape metadata + chapter list and initialise
# Redis progress tracking + launch download controller
# Purpose:
# Scrape ONLY metadata + chapter list.
# Does NOT launch download controller anymore.
# Controller decides when pipelines start.
# ============================================================
from celery_app import celery_app
@ -12,86 +14,88 @@ import redis
from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
from scraper.abort import clear_abort
from scraper.ui_log import reset_ui_logs
from scraper.services.init_service import InitService
print(">>> [IMPORT] scraping.py loaded")
# Redis connection (same as Celery broker)
# Redis connection (same DB as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task(bind=True, queue="scraping", ignore_result=False)
@celery_app.task(
bind=True,
queue="scraping",
ignore_result=False,
name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str):
"""Scrapes metadata + chapters and prepares download tracking."""
"""
Scrapes metadata + chapters.
DOES NOT START download / pipeline controller.
The controller_tasks.start_full_scrape() task will call this one.
"""
# ------------------------------------------------------------
# NEW: clear UI log buffer at start of new run
# CLEAR UI LOG BUFFER
# ------------------------------------------------------------
reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------
# Book scrape
# SCRAPE (old engine)
# ------------------------------------------------------------
site = BookSite()
scraper = BookScraper(site, url)
result = scraper.execute() # returns dict with metadata + chapters
result = scraper.execute() # → { title, author, chapters, cover_url, ... }
chapters = result.get("chapters", [])
full_count = len(chapters)
# ------------------------------------------------------------
# DRY RUN
# Compute unified book_idx
# ------------------------------------------------------------
book_idx = InitService.derive_book_id(url)
result["book_idx"] = book_idx
log(f"[SCRAPING] Assigned book_idx = {book_idx}")
# ------------------------------------------------------------
# DRY RUN TEST LIMIT
# ------------------------------------------------------------
DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
if DRY_RUN:
log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
chapters = chapters[:TEST_LIMIT]
result["chapters"] = chapters
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}")
result["chapters"] = chapters[:TEST_LIMIT]
# ------------------------------------------------------------
# BOOK RUN ID (using title as ID)
# LOG RESULTS
# ------------------------------------------------------------
title = result.get("title") or "UnknownBook"
book_id = title # user requirement
result["book_id"] = book_id
log(f"[SCRAPING] Assigned book_id = '{book_id}'")
log(
f"[SCRAPING] Completed scrape: "
f"{len(result['chapters'])}/{full_count} chapters"
)
# ------------------------------------------------------------
# RESET ABORT + INITIALISE PROGRESS
# RESET ABORT + INITIALIZE LEGACY PROGRESS
# ------------------------------------------------------------
clear_abort(book_id)
clear_abort(book_idx)
r.set(f"progress:{book_id}:total", len(chapters))
r.set(f"progress:{book_id}:done", 0)
r.delete(f"logs:{book_id}") # clear old logs if any
r.set(f"progress:{book_idx}:total", len(result["chapters"]))
r.set(f"progress:{book_idx}:done", 0)
r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")
r.delete(f"logs:{book_idx}")
r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}")
r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")
# ------------------------------------------------------------
# DISPATCH DOWNLOAD CONTROLLER
# IMPORTANT: DO NOT DISPATCH any pipelines here
# Controller will receive scrape_result and continue.
# ------------------------------------------------------------
celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],
queue="controller",
)
log(f"[SCRAPING] Dispatched download controller for '{book_id}'")
return {
"book_id": book_id,
"title": result.get("title"),
"author": result.get("author"),
"chapters": len(chapters),
}
return result
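A sketch of how a caller such as the /start route could hand the work off to the controller by its registered task name, without importing worker code. The helper name is hypothetical; the task name and queue come from the decorator on start_full_scrape above.

from celery_app import celery_app

def dispatch_full_scrape(book_idx: str):
    # Fire-and-forget: returns an AsyncResult, nothing waits on it.
    return celery_app.send_task(
        "scraper.tasks.controller_tasks.start_full_scrape",
        args=[book_idx],
        queue="controller",
    )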

@ -1,10 +1,8 @@
# ============================================================
# File: scraper/utils/state_sync.py
# Purpose:
# State inspection + optional sync logic for book progress.
# This version provides:
# • inspect_books_state() → NO writes, just a dry-run
# • sync_books_from_redis() → NOT USED YET (kept commented)
# State inspection + optional sync logic for unified book_idx model.
# Generates full book-card compatible dicts for debug UI.
# ============================================================
import os
@ -12,17 +10,53 @@ import redis
from db.db import get_db
def inspect_books_state():
def _build_card(sqlite_row, redis_state, merged):
"""
Creates a dict that matches the fields required by components/bookcard.html:
b.book_idx
b.title
b.author
b.cover_path
b.status
b.created_at
b.download_done
b.download_total
b.audio_done
b.audio_total
"""
Reads all books from SQLite and fetches Redis progress,
but performs NO writes. Only shows:
- sqlite row
- redis state
- merged result (dry-run)
Returns a list of inspection dicts.
return {
"book_idx": sqlite_row.get("book_idx"),
"title": sqlite_row.get("title") or "Unknown",
"author": sqlite_row.get("author"),
"cover_path": sqlite_row.get("cover_path"),
# Use merged status (Redis > SQLite)
"status": merged.get("status") or sqlite_row.get("status") or "unknown",
# Meta
"created_at": sqlite_row.get("created_at"),
# Download counters
"download_done": merged.get("downloaded", 0),
"download_total": merged.get("chapters_total", 0),
# Audio counters
"audio_done": merged.get("audio_done", 0),
"audio_total": merged.get("chapters_total", 0),
}
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state():
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
Reads all books from SQLite and fetches Redis progress.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
@ -32,110 +66,125 @@ def inspect_books_state():
results = []
for row in rows:
book_id = row["book_id"]
sqlite_row = dict(row)
book_idx = sqlite_row["book_idx"]
# Read redis state
redis_key = f"book:{book_id}:state"
progress = r.hgetall(redis_key)
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key) or {}
if progress:
decoded = {k.decode(): v.decode() for k, v in progress.items()}
else:
decoded = {}
# Determine dry-run merged result
# ================================
# DRY-RUN MERGE LOGIC
# ================================
merged = sqlite_row.copy()
if decoded:
if redis_state:
merged["downloaded"] = int(
decoded.get("download_done", merged.get("downloaded", 0))
redis_state.get("chapters_download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(decoded.get("parsed_done", merged.get("parsed", 0)))
merged["parsed"] = int(
redis_state.get("chapters_parsed_done", merged.get("parsed", 0))
)
merged["audio_done"] = int(
decoded.get("audio_done", merged.get("audio_done", 0))
redis_state.get("audio_done", merged.get("audio_done", 0))
)
merged["chapters_total"] = int(
decoded.get("chapters_total", merged.get("chapters_total", 0))
redis_state.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = redis_state.get(
"status", merged.get("status", "unknown")
)
merged["status"] = decoded.get("status", merged.get("status", "unknown"))
# ================================
# Build book-card data
# ================================
card = _build_card(sqlite_row, redis_state, merged)
# ================================
# Append final result entry
# ================================
results.append(
{
"book_id": book_id,
"book_idx": book_idx,
"title": sqlite_row.get("title"),
"sqlite": sqlite_row,
"redis": decoded,
"redis": redis_state,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# SYNC REDIS → SQLITE (writes)
# ============================================================
def sync_books_from_redis():
"""
Reads all books from SQLite, fetches Redis progress,
and updates SQLite rows accordingly.
Returns a list of {
"book_id": ...,
"before": ...,
"redis": ...,
"after": ...
}
Writes Redis progress values back into SQLite.
Uses unified book_idx as identifier.
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
# Fetch all books
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_id = row["book_id"]
before = dict(row)
book_idx = before["book_idx"]
redis_key = f"book:{book_id}:state"
progress = r.hgetall(redis_key)
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key)
if not progress:
if not redis_state:
results.append(
{"book_id": book_id, "before": before, "redis": {}, "after": before}
{
"book_idx": book_idx,
"before": before,
"redis": {},
"after": before,
}
)
continue
# Decode Redis bytes → string dictionary
decoded = {k.decode(): v.decode() for k, v in progress.items()}
# Extract counters
downloaded = int(decoded.get("download_done", 0))
parsed = int(decoded.get("parsed_done", 0))
audio_done = int(decoded.get("audio_done", 0))
chapters_total = int(decoded.get("chapters_total", 0))
# Extract progress from Redis
downloaded = int(redis_state.get("chapters_download_done", 0))
parsed = int(redis_state.get("chapters_parsed_done", 0))
audio_done = int(redis_state.get("audio_done", 0))
total = int(redis_state.get("chapters_total", 0))
status = redis_state.get("status", before.get("status"))
# Redis status wins
status = decoded.get("status", before["status"])
# Write back to SQLite
# Update SQLite
cur.execute(
"""
UPDATE books
SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
WHERE book_id = ?
WHERE book_idx = ?
""",
(downloaded, parsed, audio_done, chapters_total, status, book_id),
(downloaded, parsed, audio_done, total, status, book_idx),
)
db.commit()
# Fetch updated row
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,))
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
after = dict(cur.fetchone())
results.append(
{"book_id": book_id, "before": before, "redis": decoded, "after": after}
{
"book_idx": book_idx,
"before": before,
"redis": redis_state,
"after": after,
}
)
return results
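A minimal sketch of how the inspection data could be wired to the debug page. The route path is an assumption and `app` is taken to be the Flask app from the web layer; inspect_books_state() and the results variable match the inspect_state.html template further down.

from flask import render_template
from scraper.utils.state_sync import inspect_books_state

@app.route("/debug/state")   # hypothetical route path
def debug_state():
    # Dry run only: inspect_books_state() performs no writes.
    return render_template("debug/inspect_state.html", results=inspect_books_state())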

@ -2,7 +2,7 @@
File: static/js/dashboard.js
Purpose:
Dashboard interactions:
- Select active book
- Select active book_idx
- Live logs & progress
- Bookcard AJAX start/abort
NOTE:
@ -26,7 +26,7 @@ async function apiGet(url) {
/* ---------------------------------------------------------
Dashboard state
--------------------------------------------------------- */
let ACTIVE_BOOK = null;
let ACTIVE_BOOK_IDX = null;
let REFRESH_INTERVAL = null;
console.log(">>> dashboard.js LOADED");
@ -37,51 +37,51 @@ console.log(">>> dashboard.js LOADED");
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> dashboard.js DOMContentLoaded");
// Fallback: fetch global logs if no active book
// Fallback: global logs when no active book_idx
setInterval(() => {
if (!ACTIVE_BOOK) refreshBook(null);
if (!ACTIVE_BOOK_IDX) refreshBook(null);
}, 2000);
// Sidebar items
const items = $$(".book-list-item");
items.forEach((item) => {
item.addEventListener("click", () => {
selectBook(item.dataset.bookId);
selectBook(item.dataset.bookIdx);
});
});
// Auto-select
if (!ACTIVE_BOOK && items[0]) {
selectBook(items[0].dataset.bookId);
// Auto-select first book
if (!ACTIVE_BOOK_IDX && items[0]) {
selectBook(items[0].dataset.bookIdx);
}
// Initial binding of book-card buttons
// Bind start/abort buttons inside cards
bindBookCardButtons();
// Refresh sidebar every 2 seconds
// Refresh sidebar every few seconds
setInterval(refreshActiveBooks, 2800);
});
/* ---------------------------------------------------------
Select a book
Select a book_idx
--------------------------------------------------------- */
function selectBook(bookId) {
ACTIVE_BOOK = bookId;
console.log(">>> Selecting book", bookId);
function selectBook(bookIdx) {
ACTIVE_BOOK_IDX = bookIdx;
console.log(">>> Selecting book_idx", bookIdx);
// Highlight sidebar
$$(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookId === bookId);
el.classList.toggle("active", el.dataset.bookIdx === bookIdx);
});
// Reset polling
if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL);
REFRESH_INTERVAL = setInterval(() => {
refreshBook(ACTIVE_BOOK);
refreshBook(ACTIVE_BOOK_IDX);
}, 2000);
refreshBook(ACTIVE_BOOK);
refreshBook(ACTIVE_BOOK_IDX);
}
/* ---------------------------------------------------------
@ -99,7 +99,7 @@ async function refreshActiveBooks() {
books.forEach((b) => {
const div = document.createElement("div");
div.className = "book-list-item";
div.dataset.bookId = b.book_id;
div.dataset.bookIdx = b.book_idx;
div.innerHTML = `
<div class="book-title">${b.title}</div>
@ -110,27 +110,27 @@ async function refreshActiveBooks() {
</div>
`;
div.addEventListener("click", () => selectBook(b.book_id));
div.addEventListener("click", () => selectBook(b.book_idx));
container.appendChild(div);
});
if (!ACTIVE_BOOK && books.length > 0) {
selectBook(books[0].book_id);
if (!ACTIVE_BOOK_IDX && books.length > 0) {
selectBook(books[0].book_idx);
}
}
/* ---------------------------------------------------------
Fetch logs + progress
--------------------------------------------------------- */
async function refreshBook(bookId) {
if (!bookId) {
async function refreshBook(bookIdx) {
if (!bookIdx) {
const data = await apiGet("/logs");
if (data) updateLogs(data);
return;
}
const state = await apiGet(`/api/book/${bookId}/status`);
const logs = await apiGet(`/api/book/${bookId}/logs`);
const state = await apiGet(`/api/book/${bookIdx}/status`);
const logs = await apiGet(`/api/book/${bookIdx}/logs`);
if (state) {
updateProgressBars(state);
@ -140,24 +140,30 @@ async function refreshBook(bookId) {
}
/* ---------------------------------------------------------
BOOKCARD BUTTON BINDING idempotent
BOOKCARD BUTTON BINDING (idempotent)
--------------------------------------------------------- */
function bindBookCardButtons() {
console.log(">>> bindBookCardButtons() scanning…");
// START BUTTONS
document.querySelectorAll(".book-card .icon-start").forEach((btn) => {
if (btn.dataset.bound === "1") return; // prevent double-binding
if (btn.dataset.bound === "1") return;
btn.dataset.bound = "1";
btn.addEventListener("click", (ev) => {
ev.preventDefault();
if (btn.disabled) return;
const bookId = btn.closest(".book-card").dataset.bookId;
console.log(">>> START clicked:", bookId);
const card = btn.closest(".book-card");
const bookIdx = card?.dataset.bookIdx;
console.log(">>> START clicked:", bookIdx);
if (!bookIdx) {
console.error(">>> ERROR: bookIdx missing on .book-card dataset");
return;
}
startBook(bookId);
startBook(bookIdx);
});
});
@ -170,10 +176,16 @@ function bindBookCardButtons() {
ev.preventDefault();
if (btn.disabled) return;
const bookId = btn.closest(".book-card").dataset.bookId;
console.log(">>> ABORT clicked:", bookId);
const card = btn.closest(".book-card");
const bookIdx = card?.dataset.bookIdx;
abortBookAjax(bookId);
console.log(">>> ABORT clicked:", bookIdx);
if (!bookIdx) {
console.error(">>> ERROR: bookIdx missing on .book-card dataset");
return;
}
abortBookAjax(bookIdx);
});
});
}
@ -181,13 +193,13 @@ function bindBookCardButtons() {
/* ---------------------------------------------------------
AJAX START
--------------------------------------------------------- */
function startBook(bookId) {
console.log(">>> startBook():", bookId);
function startBook(bookIdx) {
console.log(">>> startBook():", bookIdx);
fetch("/start", {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_id=${bookId}`,
body: `book_idx=${bookIdx}`, // backend expects field name book_idx
})
.then(async (r) => {
console.log(">>> /start status:", r.status);
@ -199,7 +211,7 @@ function startBook(bookId) {
console.log(">>> /start response:", data);
refreshBookCards();
refreshBook(bookId);
refreshBook(bookIdx);
})
.catch((err) => console.error("Start failed:", err));
}
@ -207,12 +219,12 @@ function startBook(bookId) {
/* ---------------------------------------------------------
AJAX ABORT
--------------------------------------------------------- */
function abortBookAjax(bookId) {
if (!confirm(`Abort tasks for book ${bookId}?`)) return;
function abortBookAjax(bookIdx) {
if (!confirm(`Abort tasks for book ${bookIdx}?`)) return;
console.log(">>> abortBookAjax():", bookId);
console.log(">>> abortBookAjax():", bookIdx);
fetch(`/abort/${bookId}`, { method: "POST" })
fetch(`/abort/${bookIdx}`, { method: "POST" })
.then(async (r) => {
let data = null;
try {
@ -221,7 +233,7 @@ function abortBookAjax(bookId) {
console.log(">>> /abort response:", data);
refreshBookCards();
refreshBook(bookId);
refreshBook(bookIdx);
})
.catch((err) => console.error("Abort failed:", err));
}
@ -234,8 +246,8 @@ async function refreshBookCards() {
if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => {
const id = card.dataset.bookId;
const info = books.find((b) => b.book_id === id);
const idx = card.dataset.bookIdx;
const info = books.find((b) => b.book_idx === idx);
if (!info) return;
// Status CSS
@ -255,5 +267,5 @@ async function refreshBookCards() {
].includes(info.status);
});
bindBookCardButtons(); // rebind new DOM
bindBookCardButtons();
}

@ -15,7 +15,7 @@ console.log(">>> log_view.js LOADING…");
--------------------------------------------------------- */
let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // delta offset
const MAX_LOG_LINES = 600; // safe rolling window
const MAX_LOG_LINES = 600;
/* ---------------------------------------------------------
Apply filter on existing log lines
@ -35,38 +35,25 @@ function applyLogFilter() {
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded");
const filterSel = $("#log-filter");
const clearBtn = $("#log-clear");
const output = $("#log-output");
if (!output) {
console.log(
">>> log_view.js: No #log-output on this page → viewer disabled"
);
console.log(">>> log_view.js: No #log-output → viewer disabled");
return;
}
console.log(">>> log_view.js: log viewer detected.");
// Filter dropdown (currently disabled in your UI)
// if (filterSel) {
// filterSel.addEventListener("change", () => {
// LOG_FILTER = filterSel.value;
// applyLogFilter();
// });
// }
if (clearBtn) {
clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js: Clear log viewer");
output.innerHTML = "";
LAST_LOG_INDEX = -1; // reset delta polling
LAST_LOG_INDEX = -1;
});
}
});
/* ---------------------------------------------------------
Append ONE line (smart class assignment)
Append ONE line
--------------------------------------------------------- */
function rollingAppend(lineText) {
const output = $("#log-output");
@ -86,7 +73,6 @@ function rollingAppend(lineText) {
else div.classList.add("default");
div.textContent = lineText;
output.appendChild(div);
// Rolling limit
@ -96,31 +82,24 @@ function rollingAppend(lineText) {
}
/* ---------------------------------------------------------
Primary API entry: updateLogs()
Used by dashboard.js AND delta polling
Primary entry: updateLogs()
Accepts:
{ logs: [...], last_index: N }
{ logs:[...], last:N }
OR legacy:
{ lines: [...], total: N }
{ lines:[...], last:N }
--------------------------------------------------------- */
function updateLogs(packet) {
const output = $("#log-output");
if (!output) return;
if (!output || !packet) return;
if (!packet) return;
// Normalized log arrays
let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return;
// Append only new lines
lines.forEach((line) => rollingAppend(line));
// Update delta index
if (packet.last_index !== undefined) {
LAST_LOG_INDEX = packet.last_index;
} else if (packet.total !== undefined) {
LAST_LOG_INDEX = packet.total - 1;
// Correct unified delta index handling
if (packet.last !== undefined) {
LAST_LOG_INDEX = packet.last;
}
applyLogFilter();
@ -128,18 +107,17 @@ function updateLogs(packet) {
}
/* ---------------------------------------------------------
Delta polling: ONLY global logs use this
Dashboard overrides logs per book.
Delta polling global logs ONLY
(dashboard.js overrides logs per-book)
--------------------------------------------------------- */
function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
.then((r) => r.json())
.then((data) => {
const lines = data.lines || [];
if (lines.length > 0) {
lines.forEach((line) => logAppend(line));
LAST_LOG_INDEX = data.last; // <-- THE CORRECT INDEX!
lines.forEach((line) => rollingAppend(line));
LAST_LOG_INDEX = data.last;
}
})
.catch((err) => {

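For context, a minimal sketch of the /logs contract that pollLogs() above relies on: the client sends last_index and expects {lines, last} back. The slicing below is illustrative, not the project's actual implementation, and `app` is assumed to be the Flask app from the web layer.

from flask import request, jsonify
from scraper.ui_log import get_ui_logs   # existing in-memory UI log buffer

@app.route("/logs")   # sketch of the endpoint's response shape only
def logs_delta():
    last = int(request.args.get("last_index", -1))
    buffer = get_ui_logs() or []
    return jsonify({
        "lines": buffer[last + 1:],   # only lines the client has not seen yet
        "last": len(buffer) - 1,      # index of the newest buffered line
    })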
@ -2,7 +2,7 @@
File: static/js/progress.js
Purpose:
Update progress bars dynamically for the current book.
Expects data from API endpoints via dashboard.js or start.js.
Only updates the main progress box (book_detail page).
======================================================================= */
console.log(">>> progress.js LOADED");
@ -15,19 +15,20 @@ function updateProgressBars(data) {
return;
}
// Data format expected:
// {
// download_done,
// download_total,
// audio_done,
// audio_total
// }
const barDL = $(".progress-bar-fill");
const barAU = $(".progress-bar-fill.audio-fill");
// We always update inside the main progress box:
const container = document.querySelector("#progressSection");
if (!container) {
console.warn(">>> progress.js: #progressSection NOT FOUND");
return;
}
console.log(">>> progress.js barDL =", barDL);
console.log(">>> progress.js barAU =", barAU);
// Select bars ONLY inside the correct section
const barDL = container.querySelector(
".progress-bar:not(.audio) .progress-bar-fill"
);
const barAU = container.querySelector(
".progress-bar.audio .progress-bar-fill"
);
const pctDL =
data.download_total > 0
@ -39,23 +40,22 @@ function updateProgressBars(data) {
if (barDL) {
barDL.style.width = pctDL.toFixed(1) + "%";
console.log(">>> progress.js updated DL bar to", pctDL.toFixed(1) + "%");
console.log(">>> progress.js DL bar =", pctDL.toFixed(1) + "%");
} else {
console.warn(">>> progress.js: barDL NOT FOUND");
console.warn(">>> progress.js: barDL NOT FOUND INSIDE #progressSection");
}
if (barAU) {
barAU.style.width = pctAU.toFixed(1) + "%";
console.log(">>> progress.js updated AU bar to", pctAU.toFixed(1) + "%");
console.log(">>> progress.js AU bar =", pctAU.toFixed(1) + "%");
} else {
console.warn(">>> progress.js: barAU NOT FOUND");
console.warn(">>> progress.js: barAU NOT FOUND INSIDE #progressSection");
}
// Update textual stats
const stats = $$(".progress-stats span");
console.log(">>> progress.js stats elements found:", stats.length);
// Textual stats — only update inside progress box
const stats = container.querySelectorAll(".progress-stats span");
// Expected structure: [DL "x/y", DL "pct", AU "x/y", AU "pct"]
// Expected: [DL x/y, DL %, AU x/y, AU %]
if (stats.length >= 4) {
stats[0].innerText = `${data.download_done} / ${data.download_total}`;
stats[1].innerText = pctDL.toFixed(1) + "%";
@ -65,7 +65,7 @@ function updateProgressBars(data) {
console.log(">>> progress.js stats updated");
} else {
console.warn(
">>> progress.js: not enough stats spans, found",
">>> progress.js: not enough stats spans in the container, found",
stats.length
);
}

@ -3,17 +3,17 @@
Purpose:
Dashboard view of a single book in the list.
Variables come in via:
So all fields must be accessed via "book.<field>".
book.<field>
The book now uses book_idx exclusively as its primary key
======================================================================= -->
<div class="book-list-item" data-book-id="{{ book.book_id }}">
<div class="book-list-item" data-book-idx="{{ book.book_idx }}">
<!-- Left area: title + metadata -->
<div class="book-info">
<div class="book-title">{{ book.title }}</div>
<div class="book-meta">
<span class="meta-label">ID:</span> {{ book.book_id }} {% if
<span class="meta-label">IDX:</span> {{ book.book_idx }} {% if
book.last_update %}
<span class="meta-separator"></span>
<span class="meta-label">Updated:</span> {{ book.last_update }} {% endif
@ -56,8 +56,10 @@
<span class="mini-value">{{ pct_au }}%</span>
</div>
</div>
<!-- Abort button -->
<div class="book-abort-area">
<button class="abort-btn" onclick="abortBook('{{ book.book_id }}')">
<button class="abort-btn" onclick="abortBookAjax('{{ book.book_idx }}')">
Abort
</button>
</div>

@ -12,13 +12,13 @@
variable "b" in context
============================================================ #}
<div class="book-card {{ b.status }}" data-book-id="{{ b.book_id }}">
<div class="book-card {{ b.status }}" data-book-idx="{{ b.book_idx }}">
<!-- ======================================================
HIDE BUTTON (icon-only)
====================================================== -->
<form
action="/hide/{{ b.book_id }}"
action="/hide/{{ b.book_idx }}"
method="POST"
onsubmit="return confirm('Dit boek verbergen?')"
class="hide-form"
@ -50,7 +50,7 @@
<div class="book-actions">
<!-- START -->
<form action="/start" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" />
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button
class="icon-btn icon-start"
title="Start scraping"
@ -63,8 +63,8 @@
</form>
<!-- ABORT -->
<form action="/abort/{{ b.book_id }}" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" />
<form action="/abort/{{ b.book_idx }}" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button
class="icon-btn icon-abort"
title="Stoppen (abort)"

@ -3,7 +3,7 @@
Purpose: Reusable progress overview (download + audio) for any book.
Notes:
- Expects the following variables from Flask:
book_id: str
book_idx: str
title: str
download_total: int
download_done: int
@ -18,8 +18,8 @@
<h2>Progress</h2>
{% if title %}
<div class="progress-subtitle">{{ title }}</div>
{% endif %} {% if book_id %}
<div class="progress-bookid">Book ID: <span>{{ book_id }}</span></div>
{% endif %} {% if book_idx %}
<div class="progress-bookid">Book IDX: <span>{{ book_idx }}</span></div>
{% endif %}
</div>
@ -57,5 +57,6 @@
<span>{{ pct2 }}%</span>
</div>
</div>
<script src="/static/js/progress.js"></script>
</div>

@ -1,7 +1,7 @@
<!-- =======================================================================
File: templates/dashboard/book_detail.html
Purpose:
Detail page for a single book_id.
Detail page for a single book_idx.
Shows progress (download/audio) + filters + live logs.
======================================================================= -->
@ -15,7 +15,9 @@
<!-- Progress box -->
<section id="progressSection">
{% include "components/progress_box.html" %}
{% include "components/progress_box.html" with book_idx=book_idx,
title=title, download_total=download_total, download_done=download_done,
audio_total=audio_total, audio_done=audio_done %}
</section>
<!-- Log view -->
@ -27,13 +29,10 @@
<!-- PAGE-SPECIFIC JS -->
<script>
const BOOK_ID = "{{ book_id }}";
const BOOK_IDX = "{{ book_idx }}";
</script>
<!-- Shared log viewer -->
<script src="/static/js/log_view.js"></script>
<!-- Dashboard behaviour (only does something if dashboard HTML is present) -->
<script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script>
<!-- Existing global app logic -->
{% endblock %}

@ -1,26 +1,27 @@
{% extends "layout.html" %} {% block content %}
{# ============================================================ File:
templates/debug/inspect_state.html Purpose: Inspect SQLite vs Redis state per
book_idx. Left side: full book-card UI (same component as dashboard) Right side:
SQL / Redis / merged comparison table.
============================================================ #} {% extends
"layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1>
<style>
.state-card {
border: 1px solid #444;
.state-block {
display: grid;
grid-template-columns: 380px 1fr;
gap: 20px;
margin-bottom: 35px;
padding: 18px;
margin-bottom: 30px;
border: 1px solid #444;
background: #222;
border-radius: 8px;
}
.state-title {
font-size: 1.4em;
margin-bottom: 14px;
color: #9cf;
}
table.state-table {
.state-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 12px;
}
.state-table th,
@ -34,17 +35,20 @@
color: #fff;
}
.same {
color: #9f9;
.state-table td {
background: #2a2a2a;
color: #ddd;
}
.same {
color: #9f9 !important;
}
.diff {
color: #ff7b7b;
color: #ff7b7b !important;
font-weight: bold;
}
.empty {
color: #aaa;
color: #aaa !important;
font-style: italic;
}
</style>
@ -56,12 +60,15 @@
<td class="diff">{{ sqlval }}</td>
<td class="diff">{{ redisval }}</td>
{% endif %} {% endmacro %} {% for entry in results %}
<div class="state-card">
<div class="state-title">📘 {{ entry.book_id }}</div>
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged =
entry.would_merge_to %}
<div class="state-block">
<!-- LEFT COLUMN: book-card preview -->
<div>
{% with b = entry.card %} {% include "components/bookcard.html" %} {%
endwith %}
</div>
<!-- RIGHT COLUMN: SQL vs Redis comparison -->
<div>
<table class="state-table">
<tr>
<th>Field</th>
@ -70,19 +77,19 @@
<th>Merged Result</th>
</tr>
{% for field in [ "status", "chapters_total", "downloaded",
"chapters_download_done", "chapters_download_skipped", "parsed",
"chapters_parsed_done", "audio_done", "audio_skipped", "last_update" ] %}
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged =
entry.would_merge_to %} {% for field in [ "status", "chapters_total",
"downloaded", "chapters_download_done", "chapters_download_skipped",
"parsed", "chapters_parsed_done", "audio_done", "audio_skipped",
"last_update" ] %}
<tr>
<th>{{ field }}</th>
<td>{{ sql.get(field, '') }}</td>
<td>{{ redis.get(field, '') }}</td>
<td>{{ merged.get(field, '') }}</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endfor %} {% endblock %}

@ -32,7 +32,6 @@
font-size: 13px;
}
/* NEW: Clear button */
#clearLogBtn {
margin-bottom: 10px;
padding: 8px 16px;
@ -70,7 +69,7 @@
<body>
<a href="/">&larr; Terug</a>
<h1>Scrape Resultaat--</h1>
<h1>Scrape Resultaat</h1>
{% if error %}
<div
@ -79,7 +78,9 @@
>
<strong>Fout:</strong> {{ error }}
</div>
{% endif %} {% if message %}
{% endif %}
{% if message %}
<div class="box">{{ message }}</div>
{% endif %}
@ -113,127 +114,29 @@
class="box hidden"
style="background: #ffefef; border-left: 5px solid #cc0000"
>
<strong>Failed chapters:</strong>
<strong>Mislukte hoofdstukken:</strong>
<ul id="failedList" style="margin-top: 10px"></ul>
</div>
<div class="box">
<strong>Live log:</strong><br />
<!-- NEW BUTTON -->
<button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
<div id="logbox" class="logbox"></div>
</div>
<script>
// comes back from the Celery scraping task
const scrapingTaskId = "{{ scraping_task_id or '' }}";
let bookId = null;
let bookIdx = null;
let polling = true;
if (scrapingTaskId) pollForBookId();
function pollForBookId() {
fetch(`/celery-result/${scrapingTaskId}`)
.then((r) => r.json())
.then((data) => {
if (data.ready && data.result && data.result.book_id) {
bookId = data.result.book_id;
startLiveUI();
} else setTimeout(pollForBookId, 800);
})
.catch(() => setTimeout(pollForBookId, 1200));
}
function startLiveUI() {
document.getElementById("statusBox").classList.remove("hidden");
document.getElementById("abortBtn").classList.remove("hidden");
document.getElementById("abortBtn").onclick = () => {
fetch(`/abort/${bookId}`, { method: "POST" });
};
pollProgress();
pollLogs();
}
function pollProgress() {
if (!bookId) return;
fetch(`/progress/${bookId}`)
.then((r) => r.json())
.then((p) => {
const done = p.completed || 0;
const total = p.total || 0;
document.getElementById(
"progressText"
).innerText = `Completed: ${done} / ${total} | Skipped: ${
p.skipped || 0
} | Failed: ${p.failed || 0}`;
const failedBox = document.getElementById("failedBox");
const failedList = document.getElementById("failedList");
if (p.failed_list && p.failed_list.length > 0) {
failedBox.classList.remove("hidden");
failedList.innerHTML = "";
p.failed_list.forEach((entry) => {
const li = document.createElement("li");
li.textContent = entry;
failedList.appendChild(li);
});
}
if (p.abort) {
document.getElementById("statusLine").innerText = "ABORTED";
polling = false;
} else if (done >= total && total > 0) {
document.getElementById("statusLine").innerText = "KLAAR ✔";
polling = false;
} else {
document.getElementById("statusLine").innerText = "Bezig…";
}
if (polling) setTimeout(pollProgress, 1000);
})
.catch(() => {
if (polling) setTimeout(pollProgress, 1500);
});
}
function pollLogs() {
if (!polling) return;
fetch(`/logs`)
.then((r) => r.json())
.then((data) => {
const logbox = document.getElementById("logbox");
logbox.innerHTML = "";
data.logs.forEach((line) => {
const div = document.createElement("div");
div.textContent = line;
logbox.appendChild(div);
});
logbox.scrollTop = logbox.scrollHeight;
setTimeout(pollLogs, 1000);
})
.catch(() => setTimeout(pollLogs, 1500));
}
if (scrapingTaskId) pollForBookIdx();
// =========================================================
// NEW: Clear logs button handler
// =========================================================
function clearLogs() {
fetch("/clear-logs", { method: "POST" })
.then(() => {
document.getElementById("logbox").innerHTML = "";
})
.catch((e) => console.error("Clear logs failed:", e));
}
</script>
</body>
</html>
// -----------------------------------------------------
// Fetch the Celery result and wait until the scraper returns a book_idx
// -----------------------------------------------------
function pollForBookIdx() {
fetch(`/celery-result/${scrapingTask
