Compare commits


No commits in common. 'main' and 'feat/dashboard-upgrade' have entirely different histories.

.gitignore

@@ -12,5 +12,3 @@
 .env
 **/.env
 log.txt
-**/static/covers/

@@ -125,8 +125,7 @@ docker run \
 ```
-docker compose down --remove-orphans
-docker image prune -f
+docker compose down
 docker builder prune -af
 docker volume prune -f
 docker compose build --no-cache
@@ -136,20 +135,6 @@ docker compose down
 docker compose build
 docker compose up
-docker compose up -d
-docker compose build --no-cache web && docker compose up web
-docker compose build worker_download && docker compose up worker_download
-docker compose down --remove-orphans
-docker compose build --no-cache worker_m4b
-docker compose up -d worker_m4b
-docker compose up web
-docker compose build web
-docker compose restart web
 tar \
 --exclude="**pycache**" \
 --exclude="_/**pycache**/_" \
@@ -157,9 +142,3 @@ tar \
 --exclude=".venv" \
 --exclude="venv" \
 -czvf project.tar.gz .
-docker compose down
-docker image rm bookscraper-worker_m4b || true
-docker builder prune -af
-docker compose build --no-cache worker_m4b
-docker compose up -d worker_m4b

@@ -1,319 +1,126 @@
 # ============================================
 # File: bookscraper/app.py (ASYNC SCRAPING)
 # ============================================
 from dotenv import load_dotenv
 load_dotenv()
+import os
-from flask import (
-    Flask,
-    render_template,
-    request,
-    jsonify,
-    send_from_directory,
-    redirect,
-    url_for,
-)
 print(">>> [WEB] Importing celery_app …")
 from celery_app import celery_app
+from celery.result import AsyncResult
 from db.db import init_db
-from db.repository import (
-    get_registered_books,
-    fetch_book,
-    fetch_all_books,
-    get_progress,
-)
-from logbus.publisher import log
+init_db()  # ensure DB schema exists before Flask starts
+from flask import Flask, render_template, request, jsonify
 from scraper.logger import log_debug
-# Abort + Progress (per book_id)
 from scraper.abort import set_abort
+from scraper.progress import get_progress
-# UI LOGS (GLOBAL — no book_id)
 from scraper.ui_log import get_ui_logs, reset_ui_logs
-from celery.result import AsyncResult
 from scraper.state import state as r
-from scraper.logger_decorators import logcall
-from scraper.utils.state_sync import sync_books_from_redis
-from scraper.services.init_service import InitService
-# Cover serving
-from flask import send_from_directory
-import os
-# INIT DB
-init_db()
-# Flask
+from flask import send_from_directory
+import redis
 app = Flask(__name__)
 # =====================================================
-# STATIC FILE SERVING
+# STATIC FILE SERVING FOR OUTPUT
 # =====================================================
 OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
 @app.route("/output/<path:filename>")
-@logcall
 def serve_output(filename):
     return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
 # =====================================================
-# SECTION 1 — NAVIGATION / HTML PAGES
+# HOME PAGE
 # =====================================================
 @app.route("/", methods=["GET"])
-@logcall
 def index():
-    return redirect(url_for("dashboard"))
+    return render_template("index.html")
-@app.route("/dashboard", methods=["GET"])
-@logcall
-def dashboard():
-    logs_list = get_ui_logs() or []
-    registered_books = get_registered_books()
-    log(f"[WEB] Registered books: {registered_books}")
-    from db.repository import fetch_all_books
-    from pprint import pprint
-    pprint(fetch_all_books())
-    pprint(get_registered_books())
-    # reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
-    return render_template(
-        "dashboard/dashboard.html",
-        books=list_active_books(),
-        registered=registered_books,
-        logs=logs_list,
-    )
-@app.route("/book/<book_idx>")
-@logcall
-def book_detail(book_idx):
-    title = r.get(f"book:{book_idx}:title") or book_idx
-    return render_template(
-        "dashboard/book_detail.html",
-        book_id=book_idx,
-        title=title,
-        logs=get_ui_logs(),
-    )
 # =====================================================
-# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
+# START SCRAPING (async via Celery)
 # =====================================================
+@app.route("/start", methods=["POST"])
+def start_scraping():
+    url = request.form.get("url", "").strip()
+    if not url:
+        # ★ FIX: dashboard must always receive books + logs
-@app.route("/init", methods=["POST"])
-@logcall
-def init_book():
-    # -------------------------------------------------
-    # Accept single URL (legacy) OR multi-line URLs
-    # -------------------------------------------------
-    raw_urls = request.form.get("urls") or request.form.get("url") or ""
-    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]
-    if not urls:
-        return render_template(
-            "dashboard/dashboard.html",
-            error="Geen URL(s) opgegeven.",
-            books=list_active_books(),
-            registered=get_registered_books(),
-            logs=get_ui_logs(),
-        )
-    # -------------------------------------------------
-    # Duplicate check: existing book_ids
-    # -------------------------------------------------
-    existing_books = {b["book_idx"] for b in fetch_all_books()}
-    results = []
-    # -------------------------------------------------
-    # Process each URL independently
-    # -------------------------------------------------
-    for url in urls:
-        try:
-            book_id = InitService.derive_book_id(url)
-            if book_id in existing_books:
-                results.append(
-                    {
-                        "url": url,
-                        "status": "skipped",
-                        "book_id": book_id,
-                        "message": "Al geregistreerd",
-                    }
-                )
-                continue
-            result = InitService.execute(url)
-            results.append(
-                {
-                    "url": url,
-                    "status": "registered",
-                    "book_id": result.get("book_id"),
-                    "title": result.get("title"),
-                }
-            )
-        except Exception as e:
-            log_debug(f"[INIT] ERROR for url={url}: {e}")
-            results.append(
-                {
-                    "url": url,
-                    "status": "error",
-                    "error": str(e),
-                }
-            )
-    # -------------------------------------------------
-    # Summary message
-    # -------------------------------------------------
-    ok = sum(1 for r in results if r["status"] == "registered")
-    skipped = sum(1 for r in results if r["status"] == "skipped")
-    failed = sum(1 for r in results if r["status"] == "error")
-    message = f"Geregistreerd: {ok}, overgeslagen: {skipped}, fouten: {failed}"
-    reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
     return render_template(
         "dashboard/dashboard.html",
-        message=message,
+        error="Geen URL opgegeven.",
-        init_results=results,  # optional for UI display
         books=list_active_books(),
-        registered=reg,
         logs=get_ui_logs(),
     )
-@app.route("/hide/<book_idx>", methods=["POST"])
-@logcall
-def hide_registered_book(book_idx):
-    # intentionally left disabled
-    pass
-@app.route("/start", methods=["POST"])
-@logcall
-def start_scraping():
-    # 1) Form field: book_idx
-    book_idx = request.form.get("book_idx")
-    log(f"[WEB][START] Received start request for book_idx={book_idx}")
-    if not book_idx:
-        msg = "book_idx ontbreekt in formulier"
-        log(f"[WEB][START] ERROR: {msg}")
-        return jsonify({"status": "error", "message": msg}), 400
-    # 2) Fetch book from SQLite
-    try:
-        book = fetch_book(book_idx)
-        log(f"[WEB][START] Fetched book from DB: {book}")
-    except Exception as e:
-        log(f"[WEB][START] DB ERROR: {e}")
-        return jsonify({"status": "error", "message": "DB fout"}), 500
-    if not book:
-        msg = f"Boek '{book_idx}' niet gevonden in DB"
-        log(f"[WEB][START] ERROR: {msg}")
-        return jsonify({"status": "error", "message": msg}), 404
-    # 3) Book must have a URL
-    url = book.get("book_url")
-    if not url:
-        msg = f"Boek '{book_idx}' heeft geen book_url in DB"
-        log(f"[WEB][START] ERROR: {msg}")
-        return jsonify({"status": "error", "message": msg}), 500
-    # 4) Reset UI logs
     reset_ui_logs()
+    log_debug(f"[WEB] Scraping via Celery: {url}")
-    # 5) Logging
-    log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
-    log_debug(f"[WEB][START] DEBUG: book data = {book}")
-    # 6) Start the Celery controller task
-    try:
     async_result = celery_app.send_task(
-        "scraper.tasks.controller_tasks.start_full_scrape",
+        "scraper.tasks.scraping.start_scrape_book",
-        args=[book_idx],
+        args=[url],
-        queue="controller",
+        queue="scraping",
     )
-    except Exception as e:
-        log(f"[WEB][START] Celery ERROR: {e}")
-        return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500
-    # 7) Successfully dispatched task
-    log(f"[WEB][START] Task dispatched: {async_result.id}")
-    reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
-    # ★ FIX: show the dashboard immediately with current data
     return render_template(
         "dashboard/dashboard.html",
         scraping_task_id=async_result.id,
         books=list_active_books(),
-        registered=reg,
         logs=get_ui_logs(),
     )
-@app.route("/abort/<book_idx>", methods=["POST"])
-@logcall
-def abort_download(book_idx):
-    log_debug(f"[WEB] Abort requested for book: {book_idx}")
-    set_abort(book_idx)
-    return jsonify({"status": "ok", "aborted": book_idx})
 # =====================================================
-# SECTION 3 — API ROUTES (JSON)
+# CLEAR UI LOGS
 # =====================================================
+@app.route("/clear-logs", methods=["POST"])
+def clear_logs():
+    reset_ui_logs()
+    return jsonify({"status": "ok", "message": "UI logs cleared"})
+# =====================================================
+# ABORT (per book_id)
+# =====================================================
+@app.route("/abort/<book_id>", methods=["POST"])
+def abort_download(book_id):
+    log_debug(f"[WEB] Abort requested for book: {book_id}")
+    set_abort(book_id)
+    return jsonify({"status": "ok", "aborted": book_id})
-@app.route("/api/state/all", methods=["GET"])
-@logcall
-def api_state_all():
-    """
-    Returns the merged SQL + Redis state for all books
-    (same logic as /debug/inspect_state but JSON-only).
-    """
-    from scraper.utils.state_sync import inspect_books_state
-    return jsonify(inspect_books_state())
-@app.route("/api/books")
-@logcall
-def api_books():
-    return jsonify(list_active_books())
-@app.route("/api/book/<book_idx>/status")
-@logcall
-def api_book_status(book_idx):
-    return jsonify(getStatus(book_idx))
-@app.route("/api/book/<book_idx>/logs")
-@logcall
-def api_book_logs(book_idx):
-    logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
-    return jsonify(logs)
-@app.route("/progress/<book_idx>")
-@logcall
-def progress(book_idx):
-    return jsonify(get_progress(book_idx))
+# =====================================================
+# PROGRESS (per book_id)
+# =====================================================
+@app.route("/progress/<book_id>", methods=["GET"])
+def progress(book_id):
+    return jsonify(get_progress(book_id))
-@app.route("/celery-result/<task_id>")
-@logcall
+# =====================================================
+# CELERY RESULT → return book_id
+# =====================================================
+@app.route("/celery-result/<task_id>", methods=["GET"])
 def celery_result(task_id):
     result = AsyncResult(task_id, app=celery_app)
     if result.successful():
         return jsonify({"ready": True, "result": result.get()})
     if result.failed():
@@ -321,78 +128,119 @@ def celery_result(task_id):
     return jsonify({"ready": False})
@app.route("/clear-logs", methods=["POST"]) # =====================================================
@logcall # API: book status new model
def clear_logs(): # =====================================================
reset_ui_logs() def getStatus(book_id):
return jsonify({"status": "ok"})
@app.route("/logs", methods=["GET"]) state = r.hgetall(f"book:{book_id}:state")
@logcall status = state.get("status") or "unknown"
def logs(): dl_done = int(state.get("chapters_download_done", 0))
try: dl_skipped = int(state.get("chapters_download_skipped", 0))
last_index = int(request.args.get("last_index", -1)) dl_total = int(state.get("chapters_total", 0))
except: au_done = int(state.get("audio_done") or 0)
last_index = -1 title = state.get("title") or book_id
all_logs = get_ui_logs() or [] au_total = dl_total
new_lines = [] return {
new_last = last_index "book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
for idx, line in enumerate(all_logs):
if idx > last_index:
new_lines.append(line)
new_last = idx
return jsonify({"lines": new_lines, "last": new_last}) # =====================================================
# REDIS BACKEND — BOOK STATE MODEL
# =====================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
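
For reference, a minimal sketch of the `book:<id>:state` hash that `getStatus()` reads above. The field names come from the code; the book id and values are made up, and a local Redis is assumed:

```python
import redis

r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)
r.hset("book:12345:state", mapping={          # hypothetical example data
    "status": "downloading",
    "title": "Example Book",
    "chapters_total": 100,
    "chapters_download_done": 40,
    "chapters_download_skipped": 2,
    "audio_done": 10,
})
state = r.hgetall("book:12345:state")         # values come back as strings
print(int(state.get("chapters_download_done", 0)))  # -> 40
```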
-from flask import render_template
-from scraper.services.status_check_service import StatusCheckService
-from logbus.publisher import log
-from db.repository import get_book_state
-@app.route("/inspect/statuscheck/<book_idx>", methods=["POST"])
-@logcall
-def inspect_statuscheck(book_idx):
-    try:
-        StatusCheckService.run(book_idx)
-        return ("", 204)  # background action, no UI
-    except Exception as e:
-        log(f"[STATUSCHECK] ERROR book_idx={book_idx}: {e}")
-        return jsonify({"error": str(e)}), 500
+def list_active_booksold():
+    """Return list of active books from Redis Book State Model."""
+    keys = r.keys("book:*:state")
+    books = []
+    for key in keys:
+        book_id = key.split(":")[1]
+        print(book_id)
+        books.append(getStatus(book_id))
+    return books
+def list_active_books():
+    books = []
+    for key in r.scan_iter(match="book:*:state", count=1000):
+        first = key.find(":")
+        second = key.find(":", first + 1)
+        book_id = key[first + 1 : second]
+        books.append(getStatus(book_id))
+    return books
 # =====================================================
-# SECTION 4 — DEBUG ROUTES
+# API: list all active books
 # =====================================================
+@app.route("/api/books")
+def api_books():
+    return jsonify(list_active_books())
-@app.route("/debug/sync_state", methods=["GET"])
-def debug_sync_state():
-    results = sync_books_from_redis()
-    return {"status": "ok", "synced": results}
+@app.route("/api/book/<book_id>/status")
+def api_book_status(book_id):
+    return jsonify(getStatus(book_id))
-from scraper.utils.state_sync import inspect_books_state
+# =====================================================
+# API: book logs
+# =====================================================
+@app.route("/api/book/<book_id>/logs")
+def api_book_logs(book_id):
+    logs = r.lrange(f"logs:{book_id}", 0, -1) or []
+    return jsonify(logs)
-@app.route("/debug/inspect_state", methods=["GET"])
-def debug_inspect_state():
-    results = inspect_books_state()
-    return render_template("debug/inspect_state.html", results=results)
+# =====================================================
+# VIEW: DASHBOARD
+# =====================================================
+@app.route("/dashboard")
+def dashboard():
+    logs_list = get_ui_logs() or []
+    # ★ FIX: dashboard must always receive books + logs
+    return render_template(
+        "dashboard/dashboard.html",
+        books=list_active_books(),
+        logs=logs_list,  # dashboard gets a LIST, not a dict
+    )
+# =====================================================
+# VIEW: BOOK DETAIL PAGE
+# =====================================================
+@app.route("/book/<book_id>")
+def book_detail(book_id):
+    title = r.get(f"book:{book_id}:title") or book_id
+    return render_template(
+        "dashboard/book_detail.html",
+        book_id=book_id,
+        title=title,
+        logs=get_ui_logs(),
+    )
@app.route("/debug/redis-keys") @app.route("/debug/redis-keys")
@logcall
def debug_redis_keys(): def debug_redis_keys():
cursor = 0 cursor = 0
results = {} results = {}
while True: while True:
cursor, keys = r.scan(cursor, match="*", count=200) cursor, keys = r.scan(cursor, match="*", count=200)
for k in keys: for k in keys:
@ -402,127 +250,41 @@ def debug_redis_keys():
results[k] = "<non-string value>" results[k] = "<non-string value>"
if cursor == 0: if cursor == 0:
break break
return jsonify(results)
# ===================================================== return jsonify(results)
# DB DEBUG
# =====================================================
@app.route("/api/db/books")
@logcall
def api_db_books():
try:
books = fetch_all_books()
return jsonify({"status": "ok", "books": books})
except Exception as e:
return jsonify({"status": "error", "message": str(e)}), 500
# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
from flask import render_template
from urllib.parse import urlparse
import redis
from celery_app import celery_app
@app.route("/debug/queues")
def debug_queues():
insp = celery_app.control.inspect()
workers_active = insp.active() or {}
workers_scheduled = insp.scheduled() or {}
workers_reserved = insp.reserved() or {}
redis_url = os.getenv("REDIS_BROKER")
parsed = urlparse(redis_url)
r2 = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/") or 0),
decode_responses=True,
)
queue_names = ["scraping", "controller", "download", "parse", "save", "audio"] # ============================================================
# Rolling log endpoint (no new file)
queues = [] # ============================================================
for q in queue_names:
key = f"celery:{q}"
try:
queues.append(
{
"name": q,
"redis_key": key,
"length": r2.llen(key),
"items": r2.lrange(key, 0, 30),
}
)
except Exception as e:
queues.append(
{
"name": q,
"redis_key": key,
"length": "ERR",
"items": [str(e)],
}
)
return render_template(
"debug/queues.html",
queues=queues,
workers_active=workers_active,
workers_reserved=workers_reserved,
workers_scheduled=workers_scheduled,
)
from flask import jsonify, request
# ===================================================== # =====================================================
# SECTION 5 — INTERNAL HELPERS # ROLLING LOG ENDPOINT — DELTA POLLING VIA ui_log
# ===================================================== # =====================================================
from scraper.ui_log import get_ui_logs_delta
@logcall @app.route("/logs", methods=["GET"])
def getStatus(book_idx): def logs():
state = r.hgetall(f"book:{book_idx}:state") """
status = state.get("status") or "unknown" Delta log delivery for WebGUI.
dl_done = int(state.get("chapters_download_done", 0)) Browser sends ?last_index=N, we return only new lines.
dl_skipped = int(state.get("chapters_download_skipped", 0)) """
dl_total = int(state.get("chapters_total", 0)) try:
au_done = int(state.get("audio_done") or 0) last_index = int(request.args.get("last_index", -1))
title = state.get("title") or book_idx except:
last_index = -1
return {
"book_id": book_idx,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": dl_total,
}
new_lines, total = get_ui_logs_delta(last_index)
@logcall return jsonify({"lines": new_lines, "total": total})
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_idx = key[first + 1 : second]
books.append(getStatus(book_idx))
return books
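
The new /logs endpoint above defines a small delta-polling protocol: the browser remembers how much it has seen and asks only for the rest. A minimal client-side sketch of that loop — the host, the poll interval, and the exact meaning of `total` (assumed here to be the count of lines delivered so far, since `get_ui_logs_delta` is not shown) are all assumptions:

```python
import time
import requests  # any HTTP client would do

BASE = "http://localhost:5000"  # assumed dev server address
last_index = -1                 # -1 means "send everything you have"
while True:
    payload = requests.get(
        f"{BASE}/logs", params={"last_index": last_index}, timeout=5
    ).json()
    for line in payload["lines"]:  # only lines the client has not seen yet
        print(line)
    last_index = payload["total"]  # remember how far we have read
    time.sleep(2)                  # assumed poll interval
```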
 # =====================================================
-# SECTION 6 — FLASK RUNNER
+# RUN FLASK
 # =====================================================
 if __name__ == "__main__":
     debug = os.getenv("FLASK_DEBUG", "0") == "1"
     host = os.getenv("HOST", "0.0.0.0")

@@ -54,7 +54,6 @@ def main():
     "-l",
     "INFO",
     "--pool=prefork",
-    "--concurrency=2",
 ]
 print("[AUDIO-LOCAL] Launching Celery via subprocess…")

@@ -32,17 +32,6 @@ celery_app = Celery(
     ],
 )
-# >>>>> PLACE THIS HERE <<<<<
-celery_app.conf.update(
-    worker_redirect_stdouts_level="WARNING",
-    task_send_sent_event=False,
-    resultrepr_maxsize=0,
-    worker_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
-    worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
-)
-# >>>>> UP TO HERE <<<<<
 celery_app.conf.task_routes = {
     "scraper.tasks.scraping.*": {"queue": "scraping"},
     "scraper.tasks.controller_tasks.*": {"queue": "controller"},

@@ -1,11 +1,14 @@
 # ============================================================
-# File: db/db.py (UPDATED for book_idx-only architecture)
+# File: db/db.py
 # Purpose:
 #   Raw SQLite engine for BookScraper.
-#   - Connection management
-#   - init_db() schema creation + safe schema upgrade
-#   - upsert_book() atomic write (now uses book_idx)
-#   - raw fetch helpers
+#   Provides ONLY low-level DB primitives.
+#   - Connection management (WAL mode)
+#   - init_db() schema creation
+#   - upsert_book() atomic write
+#   - raw fetch helpers (private)
+#
+# All business logic belongs in repository.py.
 # ============================================================
 import os
@@ -45,24 +48,19 @@ def enable_wal_mode(conn):
 # ------------------------------------------------------------
-# Schema creation + SAFE schema upgrades
+# Schema creation
 # ------------------------------------------------------------
 def init_db():
     conn = get_db()
-    # --------------------------------------------------------
-    # BASE SCHEMA — book_idx is now PRIMARY KEY
-    # --------------------------------------------------------
     conn.execute(
         """
         CREATE TABLE IF NOT EXISTS books (
-            book_idx INTEGER PRIMARY KEY,
+            book_id TEXT PRIMARY KEY,
             title TEXT,
             author TEXT,
-            description TEXT,
             cover_url TEXT,
             cover_path TEXT,
-            book_url TEXT,
             chapters_total INTEGER,
@@ -72,61 +70,24 @@ def init_db():
             audio_done INTEGER DEFAULT 0,
             created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-            processdate DATETIME,
             last_update DATETIME
         );
         """
     )
     conn.commit()
-    # --------------------------------------------------------
-    # SCHEMA UPGRADE UTILITY
-    # --------------------------------------------------------
-    def add_column(name, type_):
-        try:
-            conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};")
-        except:
-            pass  # column already exists
-    cols = conn.execute("PRAGMA table_info(books);").fetchall()
-    colnames = [c[1] for c in cols]
-    # --------------------------------------------------------
-    # UPGRADE NEW FIELDS — future-proof, matched with Redis state model
-    # --------------------------------------------------------
-    # (book_idx already exists as PRIMARY KEY — no need to add)
-    add_column("description", "TEXT")
-    add_column("cover_path", "TEXT")
-    add_column("book_url", "TEXT")
-    # Download counters
-    add_column("chapters_download_done", "INTEGER DEFAULT 0")
-    add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
-    # Audio counters
-    add_column("audio_skipped", "INTEGER DEFAULT 0")
-    # Optional future fields
-    add_column("audio_total", "INTEGER DEFAULT 0")
-    conn.commit()
 # ------------------------------------------------------------
-# WRITE OPERATIONS (book_idx-based UPSERT)
+# WRITE OPERATIONS
 # ------------------------------------------------------------
-def upsert_book(book_idx, **fields):
+def upsert_book(book_id, **fields):
     """
-    UPSERT by book_idx.
-    Replaces old upsert that used book_id.
+    Raw upsert primitive. Repository layer should call this.
     """
     conn = get_db()
-    keys = ["book_idx"] + list(fields.keys())
-    values = [book_idx] + list(fields.values())
+    keys = ["book_id"] + list(fields.keys())
+    values = [book_id] + list(fields.values())
     placeholders = ",".join(["?"] * len(values))
     updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()])
@@ -134,7 +95,7 @@ def upsert_book(book_id, **fields):
     sql = f"""
         INSERT INTO books ({','.join(keys)})
         VALUES ({placeholders})
-        ON CONFLICT(book_idx)
+        ON CONFLICT(book_id)
         DO UPDATE SET {updates},
             last_update = CURRENT_TIMESTAMP;
     """
@@ -144,13 +105,11 @@ def upsert_book(book_id, **fields):
 # ------------------------------------------------------------
-# RAW READ OPERATIONS
+# RAW READ OPERATIONS (PRIVATE)
 # ------------------------------------------------------------
-def _raw_get_book(book_idx):
+def _raw_get_book(book_id):
     conn = get_db()
-    row = conn.execute(
-        "SELECT * FROM books WHERE book_idx = ?;", (book_idx,)
-    ).fetchone()
+    row = conn.execute("SELECT * FROM books WHERE book_id = ?;", (book_id,)).fetchone()
     return dict(row) if row else None
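
A minimal sketch of the ON CONFLICT upsert that `upsert_book` builds dynamically, here with fixed columns against an in-memory database (the table is reduced to what the example needs):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE books (book_id TEXT PRIMARY KEY, title TEXT)")

sql = """
INSERT INTO books (book_id, title)
VALUES (?, ?)
ON CONFLICT(book_id)
DO UPDATE SET title = excluded.title;
"""
conn.execute(sql, ("b1", "First Title"))   # inserts a new row
conn.execute(sql, ("b1", "Second Title"))  # same key: updates in place
print(conn.execute("SELECT title FROM books WHERE book_id='b1'").fetchone())
# -> ('Second Title',)
```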

@@ -1,320 +1,97 @@
 # ============================================================
 # File: db/repository.py
 # Purpose:
-#   Unified façade for BookScraper database state.
-#   This is the ONLY module Celery tasks and Flask should use.
+#   High-level BookScraper database interface.
 #
-# Responsibilities:
-#   - Route metadata → SQLite
-#   - Route counters → Redis (live) + SQLite (snapshot)
-#   - Provide a clean API for tasks and Flask UI
-# ============================================================
-# ============================================================
-# UPDATED — canonical read model via get_book_state
-# ============================================================
+# Uses low-level primitives from db.db, but exposes
+# domain-level operations:
+#   - fetch_book / fetch_all_books
+#   - create_or_update_book
+#   - set_status
+#   - incrementing counters
 # ============================================================
-from scraper.logger_decorators import logcall
-from logbus.publisher import log
-import redis
-import os
-# ============================================================
-# SQL low-level engines (snapshot storage)
-# ============================================================
-from db.state_sql import (
-    sql_fetch_book,
-    sql_fetch_all_books,
-    sql_set_status,
-    sql_set_chapters_total,
-    sql_register_book,
-    sql_update_book,
-)
-# ============================================================
-# REDIS low-level engines (live counters)
-# ============================================================
-from db.state_redis import (
-    redis_set_status,
-    redis_set_chapters_total,
-    redis_inc_download_done,
-    redis_inc_download_skipped,
-    redis_inc_parsed_done,
-    redis_inc_audio_done,
-    redis_inc_audio_skipped,
-)
+from db.db import (
+    upsert_book,
+    _raw_get_book,
+    _raw_get_all_books,
+)
-# ============================================================
-# Redis client (read-only for legacy + guards)
-# ============================================================
-REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
-_r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
-# ============================================================
-# LEGACY PROGRESS (UI only, unchanged)
-# ============================================================
-def _legacy_get_progress(book_idx):
-    return {
-        "book_idx": book_idx,
-        "total": int(_r.get(f"progress:{book_idx}:total") or 0),
-        "completed": int(_r.get(f"progress:{book_idx}:completed") or 0),
-        "skipped": int(_r.get(f"progress:{book_idx}:skipped") or 0),
-        "failed": int(_r.get(f"progress:{book_idx}:failed") or 0),
-        "abort": _r.exists(f"abort:{book_idx}") == 1,
-        "failed_list": _r.lrange(f"progress:{book_idx}:failed_list", 0, -1),
-    }
-@logcall
-def get_progress(book_idx):
-    return _legacy_get_progress(book_idx)
-# ============================================================
-# FETCH (SQLite snapshot)
-# ============================================================
-@logcall
-def fetch_book(book_idx):
-    return sql_fetch_book(book_idx)
-@logcall
+# ------------------------------------------------------------
+# FETCH OPERATIONS
+# ------------------------------------------------------------
+def fetch_book(book_id):
+    """Return a single book dict or None."""
+    return _raw_get_book(book_id)
 def fetch_all_books():
-    return sql_fetch_all_books()
+    """Return all books ordered newest → oldest."""
+    return _raw_get_all_books()
-# ============================================================
-# INIT / UPDATE METADATA
-# ============================================================
-@logcall
-def register_book(
-    book_idx,
-    title,
-    author=None,
-    description=None,
-    cover_url=None,
-    cover_path=None,
-    book_url=None,
-):
-    sql_register_book(
-        book_idx,
-        {
-            "book_idx": book_idx,
-            "title": title,
-            "author": author,
-            "description": description,
-            "cover_url": cover_url,
-            "cover_path": cover_path,
-            "book_url": book_url,
-            "chapters_total": 0,
-            "status": "registered",
-        },
-    )
-@logcall
-def update_book_after_full_scrape(
-    book_idx,
+# ------------------------------------------------------------
+# BOOK CREATION / METADATA
+# ------------------------------------------------------------
+def create_or_update_book(
+    book_id,
     title=None,
     author=None,
-    description=None,
-    cover_url=None,
     chapters_total=None,
+    cover_url=None,
+    cover_path=None,
+    status=None,
 ):
     fields = {}
     if title is not None:
         fields["title"] = title
     if author is not None:
         fields["author"] = author
-    if description is not None:
-        fields["description"] = description
-    if cover_url is not None:
-        fields["cover_url"] = cover_url
     if chapters_total is not None:
         fields["chapters_total"] = chapters_total
-    fields["status"] = "active"
-    sql_update_book(book_idx, fields)
+    if cover_url is not None:
+        fields["cover_url"] = cover_url
+    if cover_path is not None:
+        fields["cover_path"] = cover_path
+    if status is not None:
+        fields["status"] = status
+    if fields:
+        upsert_book(book_id, **fields)
-# ============================================================
-# STATUS
-# ============================================================
-@logcall
-def set_status(book_idx, status):
-    redis_set_status(book_idx, status)
-    sql_set_status(book_idx, status)
+# ------------------------------------------------------------
+# STATUS MANAGEMENT
+# ------------------------------------------------------------
+def set_status(book_id, status):
+    upsert_book(book_id, status=status)
-# ============================================================
-# TOTALS
-# ============================================================
-@logcall
-def set_chapters_total(book_idx, total):
-    redis_set_chapters_total(book_idx, total)
-    sql_set_chapters_total(book_idx, total)
+# ------------------------------------------------------------
+# INCREMENTING COUNTERS (atomic)
+# ------------------------------------------------------------
+def inc_downloaded(book_id, amount=1):
+    book = _raw_get_book(book_id)
+    if not book:
+        return
+    cur = book.get("downloaded", 0) or 0
+    upsert_book(book_id, downloaded=cur + amount)
-# ============================================================
-# COUNTERS — WRITE ONLY
-# ============================================================
-@logcall
-def inc_download_done(book_idx, amount=1):
-    redis_inc_download_done(book_idx, amount)
-@logcall
-def inc_download_skipped(book_idx, amount=1):
-    redis_inc_download_skipped(book_idx, amount)
-@logcall
-def inc_parsed_done(book_idx, amount=1):
-    redis_inc_parsed_done(book_idx, amount)
+def inc_parsed(book_id, amount=1):
+    book = _raw_get_book(book_id)
+    if not book:
+        return
+    cur = book.get("parsed", 0) or 0
+    upsert_book(book_id, parsed=cur + amount)
-@logcall
-def inc_audio_done(book_idx, amount=1):
-    redis_inc_audio_done(book_idx, amount)
-@logcall
-def inc_audio_skipped(book_idx, amount=1):
-    redis_inc_audio_skipped(book_idx, amount)
+def inc_audio_done(book_id, amount=1):
+    book = _raw_get_book(book_id)
+    if not book:
+        return
+    cur = book.get("audio_done", 0) or 0
+    upsert_book(book_id, audio_done=cur + amount)
-# ============================================================
-# CANONICAL READ MODEL
-# ============================================================
-@logcall
-def get_book_state(book_idx):
-    """
-    Canonical merged read model.
-    Rules:
-    - SQL = snapshot baseline
-    - Redis = live counters
-    - merged = max(sql, redis)
-    - capped at chapters_total
-    """
-    sqlite_row = sql_fetch_book(book_idx) or {}
-    redis_state = _r.hgetall(f"book:{book_idx}:state") or {}
-    def _int(v):
-        try:
-            return int(v)
-        except Exception:
-            return 0
-    chapters_total = _int(sqlite_row.get("chapters_total"))
-    # SQL snapshot
-    sql_downloaded = _int(sqlite_row.get("downloaded"))
-    sql_audio_done = _int(sqlite_row.get("audio_done"))
-    sql_audio_skipped = _int(sqlite_row.get("audio_skipped"))
-    # Redis live
-    redis_downloaded = _int(redis_state.get("chapters_download_done")) + _int(
-        redis_state.get("chapters_download_skipped")
-    )
-    redis_audio_done = _int(redis_state.get("audio_done"))
-    redis_audio_skipped = _int(redis_state.get("audio_skipped"))
-    # Merge
-    merged_downloaded = max(sql_downloaded, redis_downloaded)
-    merged_audio_done = max(sql_audio_done, redis_audio_done)
-    merged_audio_skipped = max(sql_audio_skipped, redis_audio_skipped)
-    if chapters_total > 0:
-        merged_downloaded = min(merged_downloaded, chapters_total)
-        merged_audio_done = min(merged_audio_done, chapters_total)
-        merged_audio_skipped = min(merged_audio_skipped, chapters_total)
-    audio_completed = merged_audio_done + merged_audio_skipped
-    # Build state
-    state = dict(sqlite_row)
-    state.update(
-        {
-            "downloaded": merged_downloaded,
-            "audio_done": merged_audio_done,
-            "audio_skipped": merged_audio_skipped,
-            "chapters_total": chapters_total,
-        }
-    )
-    # Derived status
-    status = sqlite_row.get("status") or "unknown"
-    if chapters_total > 0:
-        if merged_downloaded < chapters_total:
-            status = "downloading"
-        elif merged_downloaded == chapters_total and audio_completed < chapters_total:
-            status = "audio"
-        elif audio_completed >= chapters_total:
-            status = "done"
-    state["status"] = status
-    return state
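
The merge rule in the removed `get_book_state` deserves a worked example. Suppose `chapters_total = 100`, the SQLite snapshot says `downloaded = 40`, and the Redis live counters show 55 done plus 3 skipped:

```python
# Worked example of the merge rule above (values are hypothetical)
chapters_total = 100
sql_downloaded = 40              # stale SQLite snapshot
redis_downloaded = 55 + 3        # live done + skipped counters
merged = min(max(sql_downloaded, redis_downloaded), chapters_total)
print(merged)                    # -> 58, and 58 < 100 derives status "downloading"
```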
-# ============================================================
-# READ HELPERS (VIA get_book_state ONLY)
-# ============================================================
-@logcall
-def get_chapters_total(book_idx):
-    return int(get_book_state(book_idx).get("chapters_total", 0))
-@logcall
-def get_audio_done(book_idx):
-    return int(get_book_state(book_idx).get("audio_done", 0))
-@logcall
-def get_audio_completed_total(book_idx):
-    state = get_book_state(book_idx)
-    return int(state.get("audio_done", 0)) + int(state.get("audio_skipped", 0))
-# ============================================================
-# STATUSCHECK GUARD (INTENTIONAL DIRECT REDIS)
-# ============================================================
-@logcall
-def try_trigger_statuscheck(book_idx):
-    return bool(_r.set(f"book:{book_idx}:statuscheck:triggered", "1", nx=True))
-# ============================================================
-# ACTIVE / REGISTERED BOOK LISTS (UI API)
-# ============================================================
-@logcall
-def get_registered_books():
-    """
-    Books visible in the 'registered' list in the UI.
-    """
-    all_books = sql_fetch_all_books()
-    HIDDEN_STATES = {"hidden"}
-    return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
-@logcall
-def get_active_books():
-    """
-    Books currently active in the dashboard.
-    """
-    all_books = sql_fetch_all_books()
-    HIDDEN_STATES = {"hidden", "done"}
-    return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
-@logcall
-def store_m4b_error(book_idx: str, volume: str, error_text: str):
-    """
-    Passive storage of m4b errors.
-    No logic, no retries, no state transitions.
-    """
-    key = f"book:{book_idx}:m4b:errors"
-    entry = f"{volume}: {error_text}"
-    _r.rpush(key, entry)
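
One behavioural difference between the two sides is worth flagging: the removed code incremented counters with Redis HINCRBY, which is atomic, while the new `inc_*` helpers read the row and write the sum back, so two concurrent workers can lose an update. A sketch of an atomic SQLite alternative, mirroring the `UPDATE ... SET x = COALESCE(x,0) + ?` pattern in the removed state_sql.py below (the function name is illustrative, not from the source):

```python
import sqlite3

def inc_downloaded_atomic(conn: sqlite3.Connection, book_id: str, amount: int = 1) -> None:
    # The increment happens inside a single UPDATE statement, so concurrent
    # callers cannot overwrite each other's additions.
    conn.execute(
        "UPDATE books SET downloaded = COALESCE(downloaded, 0) + ? WHERE book_id = ?",
        (amount, book_id),
    )
    conn.commit()
```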

@@ -1,130 +0,0 @@
# ============================================================
# File: db/state_redis.py (UPDATED for book_idx-only architecture)
# Purpose:
#   Low-level Redis counters/state for BookScraper.
#   Used ONLY by db.repository façade.
# ============================================================
import os
import time
import redis
from logbus.publisher import log
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# INTERNAL KEY BUILDER
# ------------------------------------------------------------
def _key(book_idx: str) -> str:
    return f"book:{book_idx}:state"
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def redis_set_status(book_idx: str, status: str):
    log(f"[DB-REDIS] Setting status for {book_idx} to {status}")
    key = _key(book_idx)
    r.hset(key, "status", status)
    r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# TOTAL CHAPTERS
# ------------------------------------------------------------
def redis_set_chapters_total(book_idx: str, total: int):
    key = _key(book_idx)
    r.hset(key, "chapters_total", total)
    r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# DOWNLOAD COUNTERS
# ------------------------------------------------------------
def redis_inc_download_done(book_idx: str, amount: int = 1):
    log(f"[DB-REDIS] Incrementing download done for {book_idx} by {amount}")
    key = _key(book_idx)
    r.hincrby(key, "chapters_download_done", amount)
    r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_idx: str, amount: int = 1):
    log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}")
    key = _key(book_idx)
    r.hincrby(key, "chapters_download_skipped", amount)
    r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# PARSE COUNTERS
# ------------------------------------------------------------
def redis_inc_parsed_done(book_idx: str, amount: int = 1):
    log(f"[DB-REDIS] Incrementing parsed done for {book_idx} by {amount}")
    key = _key(book_idx)
    r.hincrby(key, "chapters_parsed_done", amount)
    r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# AUDIO COUNTERS
# ------------------------------------------------------------
def redis_inc_audio_done(book_idx: str, amount: int = 1):
    log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}")
    key = _key(book_idx)
    r.hincrby(key, "audio_done", amount)
    r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_idx: str, amount: int = 1):
    log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}")
    key = _key(book_idx)
    r.hincrby(key, "audio_skipped", amount)
    r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# INITIALISE BOOK STATE
# ------------------------------------------------------------
def init_book_state(book_idx: str, title: str, url: str, chapters_total: int):
    """
    Initialises the complete Redis state for a new book.
    NOTE:
    - If a key already exists, do NOT reset it (preserve progress).
    - Only missing fields are added.
    """
    key = f"book:{book_idx}:state"
    # Already exists? Then we only fill in the missing fields.
    exists = r.exists(key)
    pipeline = r.pipeline()
    # Base metadata
    pipeline.hsetnx(key, "book_id", book_idx)
    pipeline.hsetnx(key, "title", title or "")
    pipeline.hsetnx(key, "url", url or "")
    # State
    pipeline.hsetnx(key, "status", "registered")
    # Counters
    pipeline.hsetnx(key, "chapters_total", chapters_total)
    pipeline.hsetnx(key, "chapters_download_done", 0)
    pipeline.hsetnx(key, "chapters_download_skipped", 0)
    pipeline.hsetnx(key, "chapters_parsed_done", 0)
    pipeline.hsetnx(key, "audio_done", 0)
    pipeline.hsetnx(key, "audio_skipped", 0)
    # Timestamp
    pipeline.hset(key, "last_update", int(time.time()))
    pipeline.execute()
    if exists:
        log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_idx}")
    else:
        log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_idx}")

@@ -1,178 +0,0 @@
# ============================================================
# File: db/state_sql.py (UPDATED for book_idx-only architecture)
# Purpose:
#   Low-level SQLite snapshot layer for BookScraper metadata.
#   Used ONLY through db.repository façade.
# ============================================================
import sqlite3
import os
from logbus.publisher import log
# Must match db/db.py
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db")
# ------------------------------------------------------------
# INTERNAL HELPERS
# ------------------------------------------------------------
def _connect():
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    return conn
# ------------------------------------------------------------
# FETCH
# ------------------------------------------------------------
def sql_fetch_book(book_idx):
    conn = _connect()
    cur = conn.cursor()
    cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
    row = cur.fetchone()
    conn.close()
    return dict(row) if row else None
def sql_fetch_all_books():
    conn = _connect()
    cur = conn.cursor()
    cur.execute("SELECT * FROM books ORDER BY created_at DESC")
    rows = cur.fetchall()
    conn.close()
    return [dict(r) for r in rows]
# ------------------------------------------------------------
# REGISTER / UPDATE
# ------------------------------------------------------------
def sql_register_book(book_idx, fields: dict):
    """
    Insert or replace entire book record.
    book_idx is the PRIMARY KEY.
    """
    conn = _connect()
    cur = conn.cursor()
    cols = ", ".join(["book_idx"] + list(fields.keys()))
    placeholders = ", ".join(["?"] * (1 + len(fields)))
    values = [book_idx] + list(fields.values())
    cur.execute(
        f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})",
        values,
    )
    conn.commit()
    conn.close()
def sql_update_book(book_idx, fields: dict):
    if not fields:
        return
    conn = _connect()
    cur = conn.cursor()
    set_clause = ", ".join([f"{k} = ?" for k in fields])
    params = list(fields.values()) + [book_idx]
    cur.execute(
        f"UPDATE books SET {set_clause} WHERE book_idx = ?",
        params,
    )
    conn.commit()
    conn.close()
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def sql_set_status(book_idx, status: str):
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        "UPDATE books SET status = ? WHERE book_idx = ?",
        (status, book_idx),
    )
    conn.commit()
    conn.close()
# ------------------------------------------------------------
# CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------
def sql_set_chapters_total(book_idx, total: int):
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        "UPDATE books SET chapters_total = ? WHERE book_idx = ?",
        (total, book_idx),
    )
    conn.commit()
    conn.close()
# ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------
def sql_inc_downloaded(book_idx, amount=1):
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        """
        UPDATE books
        SET downloaded = COALESCE(downloaded,0) + ?
        WHERE book_idx = ?
        """,
        (amount, book_idx),
    )
    conn.commit()
    conn.close()
def sql_inc_parsed(book_idx, amount=1):
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        """
        UPDATE books
        SET parsed = COALESCE(parsed,0) + ?
        WHERE book_idx = ?
        """,
        (amount, book_idx),
    )
    conn.commit()
    conn.close()
def sql_inc_audio_done(book_idx, amount=1):
    log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}")
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        """
        UPDATE books
        SET audio_done = COALESCE(audio_done,0) + ?
        WHERE book_idx = ?
        """,
        (amount, book_idx),
    )
    conn.commit()
    conn.close()
def sql_inc_audio_skipped(book_idx, amount=1):
    log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}")
    conn = _connect()
    cur = conn.cursor()
    cur.execute(
        """
        UPDATE books
        SET audio_skipped = COALESCE(audio_skipped,0) + ?
        WHERE book_idx = ?
        """,
        (amount, book_idx),
    )
    conn.commit()
    conn.close()

@@ -149,22 +149,3 @@ services:
       - .env
     command: celery -A celery_app worker -Q scraping -n scraping@%h -l INFO
     restart: "no"
-  # ----------------------------------------------------------
-  # M4B Worker (Finalization)
-  # ----------------------------------------------------------
-  worker_m4b:
-    build:
-      context: .
-      dockerfile: docker/Dockerfile.m4b
-    container_name: worker_m4b
-    command: celery -A celery_app worker -Q m4b -n m4b@%h -l INFO
-    depends_on:
-      redis:
-        condition: service_healthy
-    env_file:
-      - .env
-    volumes:
-      - .:/app
-      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
-      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
-    restart: "no"

@@ -1,70 +0,0 @@
FROM debian:12
ENV DEBIAN_FRONTEND=noninteractive
# ----------------------------------------------------------
# System + PHP (PHP 8.2 native)
# ----------------------------------------------------------
RUN apt-get update && apt-get install -y \
ffmpeg \
curl \
ca-certificates \
bash \
php-cli \
php-intl \
php-json \
php-mbstring \
php-xml \
php-curl \
php-zip \
python3 \
python3-pip \
python3-venv \
\
# build deps for mp4v2
git \
build-essential \
autoconf \
automake \
libtool \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# ----------------------------------------------------------
# Python venv (PEP 668 compliant)
# ----------------------------------------------------------
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:/usr/local/bin:$PATH"
# ----------------------------------------------------------
# Build & install mp4v2 (mp4info)
# ----------------------------------------------------------
WORKDIR /tmp
RUN git clone https://github.com/sandreas/mp4v2 \
&& cd mp4v2 \
&& ./configure \
&& make -j$(nproc) \
&& make install \
&& echo "/usr/local/lib" > /etc/ld.so.conf.d/mp4v2.conf \
&& ldconfig \
&& cd / \
&& rm -rf /tmp/mp4v2
# ----------------------------------------------------------
# Install m4b-tool
# ----------------------------------------------------------
RUN curl -L https://github.com/sandreas/m4b-tool/releases/latest/download/m4b-tool.phar \
-o /usr/local/bin/m4b-tool \
&& chmod +x /usr/local/bin/m4b-tool
# ----------------------------------------------------------
# App
# ----------------------------------------------------------
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
CMD ["bash"]

@@ -1,31 +1,9 @@
 # logbus/publisher.py
 import logging
-import os
 logger = logging.getLogger("logbus")
+logger.setLevel(logging.WARNING)
-# ============================================================
-# FILE LOGGER — log.txt in BOOKSCRAPER_OUTPUT_DIR
-# ============================================================
-try:
-    root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
-    os.makedirs(root, exist_ok=True)
-    file_path = os.path.join(root, "log.txt")
-    file_handler = logging.FileHandler(file_path, mode="a", encoding="utf-8")
-    file_formatter = logging.Formatter("%(message)s")  # exactly as the input
-    file_handler.setFormatter(file_formatter)
-    logger.addHandler(file_handler)
-except Exception:
-    # Logging to a file must never crash the app
-    pass
 def log(message: str):
     """
@@ -1 +0,0 @@
-Subproject commit 480a73324f53d0d24bea4931c3902097f8e2a663


@@ -1,7 +1,7 @@
 import os
 import redis
-from scraper.logger_decorators import logcall
+# GUI log (non-breaking)
 from scraper.ui_log import push_ui
 # ---------------------------------------------------------
@@ -13,58 +13,55 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 # Debug mode (optional)
 ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1"
-# Avoid duplicate spam
+# Internal flag to avoid spamming the same message
 _seen_debug_keys = set()
 # =========================================================
-# INTERNAL DEBUGGING
+# ABORT FLAG
 # =========================================================
 def _debug(msg: str):
-    """Print + GUI log (non-breaking, minimal noise)."""
     print(msg)
     push_ui(msg)
-# =========================================================
-# ABORT FLAG — unified book_idx
-# =========================================================
-def set_abort(book_idx: str):
-    """Enable abort mode for book_idx."""
-    key = f"abort:{book_idx}"
+def set_abort(book_id: str):
+    """Enable abort mode for this book."""
+    key = f"abort:{book_id}"
     r.set(key, "1")
     if ABORT_DEBUG:
         _debug(f"[ABORT] SET {key}")
-def clear_abort(book_idx: str):
+def clear_abort(book_id: str):
     """Clear abort flag."""
-    key = f"abort:{book_idx}"
+    key = f"abort:{book_id}"
     r.delete(key)
     if ABORT_DEBUG:
         _debug(f"[ABORT] CLEAR {key}")
-def abort_requested(book_idx: str, redis_client=None) -> bool:
+def abort_requested(book_id: str, redis_client=None) -> bool:
     """
-    Check whether abort flag is active for book_idx.
+    Return True if abort flag is set.
     redis_client:
     - Docker workers → None → use default Redis (r)
-    - Local macOS audio worker → passes Redis(host=127.0.0.1)
+    - Local macOS audio → passes Redis(host=127.0.0.1)
     """
     client = redis_client or r
-    key = f"abort:{book_idx}"
+    key = f"abort:{book_id}"
     try:
         exists = client.exists(key)
         if ABORT_DEBUG:
-            # Log once per key
+            # Log only once per book
             if key not in _seen_debug_keys:
                 try:
                     conn = client.connection_pool.connection_kwargs
@@ -72,53 +69,54 @@ def abort_requested(book_id: str, redis_client=None) -> bool:
                     port = conn.get("port")
                     db = conn.get("db")
                     _debug(
-                        # f"[ABORT_DEBUG] first check book_idx={book_idx} "
+                        f"[ABORT_DEBUG] first check book_id={book_id} "
                         f"redis={host}:{port} db={db}"
                     )
                 except Exception:
-                    _debug(f"[ABORT_DEBUG] first check book_idx={book_idx}")
+                    _debug(f"[ABORT_DEBUG] first check book_id={book_id}")
                 _seen_debug_keys.add(key)
-            # Log ACTIVE state
+            # Only log abort ACTIVE
             if exists == 1:
-                _debug(f"[ABORT] ACTIVE for {book_idx}")
+                _debug(f"[ABORT] ACTIVE for {book_id}")
         return exists == 1
     except Exception as e:
         if ABORT_DEBUG:
             _debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}")
     return False
 # =========================================================
-# PER-CHAPTER STATE — unified book_idx
+# PER-CHAPTER STATE
 # =========================================================
-def mark_chapter_started(book_idx: str, chapter_num: int):
-    key = f"started:{book_idx}:{chapter_num}"
+def mark_chapter_started(book_id: str, chapter_num: int):
+    key = f"started:{book_id}:{chapter_num}"
     r.set(key, "1")
-def chapter_started(book_idx: str, chapter_num: int) -> bool:
-    key = f"started:{book_idx}:{chapter_num}"
+def chapter_started(book_id: str, chapter_num: int) -> bool:
+    key = f"started:{book_id}:{chapter_num}"
     return r.exists(key) == 1
 # =========================================================
-# RESET STATE FOR BOOK_IDX
+# UTILITY: RESET FOR A BOOK
 # =========================================================
-def reset_book_state(book_idx: str):
+def reset_book_state(book_id: str):
     """
-    Remove abort flag and all per-chapter started markers.
+    Remove abort flag and all chapter-start markers.
     """
-    # abort flag
-    r.delete(f"abort:{book_idx}")
+    key = f"abort:{book_id}"
+    r.delete(key)
-    # chapter markers
-    pattern = f"started:{book_idx}:*"
+    pattern = f"started:{book_id}:*"
     for k in r.scan_iter(pattern):
         r.delete(k)
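
A sketch of how the pieces above fit together: the web process sets the flag via the /abort/<book_id> route, a worker polls it inside its chapter loop, and reset clears everything for a fresh run. The worker-side function below is illustrative, not from the source:

```python
# Hypothetical worker loop illustrating the abort protocol above.
from scraper.abort import abort_requested, clear_abort, mark_chapter_started

def download_book(book_id: str, chapters: list) -> None:
    for ch in chapters:
        if abort_requested(book_id):   # flag set by the /abort/<book_id> route
            break                      # stop cleanly between chapters
        mark_chapter_started(book_id, ch)
        # ... download chapter ch ...
    clear_abort(book_id)               # leave no stale flag behind
```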

@@ -1,55 +1,202 @@
-# ============================================================
-# File: scraper/book_scraper.py
-# Purpose:
-#   Backwards-compatible wrapper giving the SAME public API
-#   as the old BookScraper, but internally uses ScrapeEngine.
-#
-#   execute() → full metadata + chapterlist (NO book_idx creation)
-#
-#   ID management is now handled exclusively by InitService.
-# ============================================================
+# scraper/book_scraper.py
-from scraper.logger_decorators import logcall
-from scraper.services.scrape_engine import ScrapeEngine
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+import re
+from scraper.logger import log_debug
+from scraper.utils import clean_text, load_replacements
+from scraper.models.book_state import Chapter
 class BookScraper:
     """
-    Backwards-compatible BookScraper façade.
-    Old responsibilities (metadata, chapters, covers, downloads)
-    are now split:
-    ScrapeEngine → metadata + chapterlist
-    Download tasks handle download/parse/save
-    InitService determines book_idx (single source of truth)
-    This wrapper intentionally does NOT generate a book_idx or book_id.
-    It only returns metadata/chapters in legacy-compatible dict format.
-    """
-    @logcall
-    def __init__(self, site_scraper, url: str):
-        self.site = site_scraper
-        self.url = url
-    @logcall
+    Minimal scraper: only metadata + chapter list.
+    The DownloadController handles Celery pipelines for:
+    - download
+    - parse
+    - save
+    """
+    def __init__(self, site, url):
+        self.site = site
+        self.url = url
+        self.book_title = ""
+        self.book_author = ""
+        self.book_description = ""
+        self.cover_url = ""
+        self.chapter_base = None
+        self.chapters = []
+        # Load custom replacements
+        extra = load_replacements("replacements.txt")
+        self.site.replacements.update(extra)
+    # ------------------------------------------------------------
     def execute(self):
-        """
-        Legacy public API:
-        Return metadata + chapter list EXACTLY as before,
-        but without generating any book_id.
-        """
-        data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)
+        """Main entry point. Returns metadata + chapter URLs."""
+        soup = self._fetch(self.url)
+        self._parse_title(soup)
+        self._parse_author(soup)
+        self._parse_description(soup)
+        self._parse_cover(soup)
+        chapter_page = self.get_chapter_page(soup)
+        self.parse_chapter_links(chapter_page)
+        log_debug(f"[BookScraper] Completed metadata parse")
-        # Legacy structure preserved, unchanged:
         return {
-            "title": data.get("title"),
-            "author": data.get("author"),
-            "description": data.get("description"),
-            "cover_url": data.get("cover_url"),
-            "chapters": data.get("chapters", []),
-            "chapters_total": data.get("chapters_total", 0),
-            "book_url": data.get("book_url"),  # used later by parse/save tasks
+            "title": self.book_title,
+            "author": self.book_author,
+            "description": self.book_description,
+            "cover_url": self.cover_url,  # ← used by DownloadController
+            "book_url": self.url,
+            "chapters": [
+                {"num": ch.number, "title": ch.title, "url": ch.url}
+                for ch in self.chapters
+            ],
         }
# ------------------------------------------------------------
def _fetch(self, url):
log_debug(f"[BookScraper] Fetch: {url}")
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.encoding = self.site.encoding
return BeautifulSoup(resp.text, "lxml")
# ------------------------------------------------------------
def _parse_title(self, soup):
h1 = soup.find("h1")
self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
log_debug(f"[BookScraper] Title = {self.book_title}")
def _parse_author(self, soup):
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
self.book_author = raw.split("：")[1] if "：" in raw else "UnknownAuthor"
log_debug(f"[BookScraper] Author = {self.book_author}")
def _parse_description(self, soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
self.book_description = ""
log_debug("[BookScraper] Description not found")
return
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
self.book_description = clean_text("\n".join(parts), self.site.replacements)
log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
# ------------------------------------------------------------
def _parse_cover(self, soup):
"""
Extract correct cover based on book_id path logic.
1. primary: match "/files/article/image/{vol}/{book_id}/"
2. fallback: endswith "/{book_id}s.jpg"
"""
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", self.url)
if not m:
log_debug("[BookScraper] No book_id found in URL → cannot match cover")
return
book_id = m.group(1)
# Extract vol folder from URL (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", self.url)
volume = m2.group(1) if m2 else None
log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
imgs = soup.find_all("img", src=True)
chosen = None
# --------------------------------------------------------
# PRIORITY 1: Path-match
# /files/article/image/{vol}/{book_id}/
# --------------------------------------------------------
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
log_debug(f"[BookScraper] Cover matched by PATH: {src}")
break
# --------------------------------------------------------
# PRIORITY 2: endswith "/{book_id}s.jpg"
# --------------------------------------------------------
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
break
# --------------------------------------------------------
# No match
# --------------------------------------------------------
if not chosen:
log_debug("[BookScraper] No matching cover found")
return
self.cover_url = urljoin(self.site.root, chosen)
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------
def get_chapter_page(self, soup):
"""Return BeautifulSoup of the main chapter list page."""
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
chapter_url = urljoin(self.site.root, href)
# base for chapter links
parts = chapter_url.rsplit("/", 1)
self.chapter_base = parts[0] + "/"
return self._fetch(chapter_url)
# ------------------------------------------------------------
def parse_chapter_links(self, soup):
cont = soup.select_one(self.site.chapter_list_selector)
items = cont.select("ul li a[href]")
self.chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(self.chapter_base, href)
self.chapters.append(Chapter(idx, title, full))
idx += 1
log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")

@ -1,54 +1,63 @@
# ========================================================= # =========================================================
# File: scraper/download_controller.py # File: scraper/download_controller.py
# Purpose: # Purpose:
# Build Celery pipelines for all chapters using book_idx # Build Celery pipelines for all chapters
# Handles: # and pass book_id for abort/progress/log functionality.
# • volume assignment # + Download and replicate cover image to all volume folders
# • cover download + replication # + Generate scripts (allinone.txt, makebook, say)
# • script generation # + Initialize Redis Book State Model (status + counters)
# • Redis Book State Model init
# • abort tracking
# ========================================================= # =========================================================
from celery import group from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline from scraper.tasks.pipeline import build_chapter_pipeline
from scraper.scriptgen import generate_all_scripts
# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen
from logbus.publisher import log from logbus.publisher import log
import os import os
import requests import requests
import shutil import shutil
from scraper.abort import abort_requested # DEBUG allowed
from db.repository import create_or_update_book
from scraper.abort import abort_requested # NEW: Redis State Model (C&U)
from db.state_redis import init_book_state from scraper.progress import (
from db.repository import set_status, set_chapters_total init_book_state,
set_status,
set_chapter_total,
)
class DownloadController: class DownloadController:
""" """
Coordinates all chapter pipelines (download → parse → save). Coordinates all chapter pipelines (download → parse → save),
including:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
- cover download + volume replication
- script generation (allinone.txt, makebook, say)
- Redis book state initialisation and status updates
""" """
def __init__(self, book_idx: str, scrape_result: dict): def __init__(self, book_id: str, scrape_result: dict):
self.book_idx = str(book_idx) self.book_id = book_id
self.scrape_result = scrape_result self.scrape_result = scrape_result
# Metadata # Core metadata
self.title = scrape_result.get("title", "UnknownBook") self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or [] self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url") self.cover_url = scrape_result.get("cover_url")
# Output folder # Output base dir
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
# Volume size
self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200")) self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
# Base folder for the whole book
self.book_base = os.path.join(root, self.title) self.book_base = os.path.join(root, self.title)
os.makedirs(self.book_base, exist_ok=True) os.makedirs(self.book_base, exist_ok=True)
# Meta passed downstream # Meta passed to parse/save stage
self.meta = { self.meta = {
"title": self.title, "title": self.title,
"author": scrape_result.get("author"), "author": scrape_result.get("author"),
@ -56,120 +65,200 @@ class DownloadController:
"book_url": scrape_result.get("book_url"), "book_url": scrape_result.get("book_url"),
} }
log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}") # -------------------------------------------------
# DEBUG — confirm the controller sees the correct book_id
# -------------------------------------------------
log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
try:
abort_state = abort_requested(book_id)
log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")
# Init Redis Book State Model # -------------------------------------------------
# NEW: Initialize Redis Book State Model
# -------------------------------------------------
try: try:
init_book_state( init_book_state(
book_id=self.book_idx, book_id=self.book_id,
title=self.title, title=self.title,
url=self.meta["book_url"], url=self.scrape_result.get("book_url"),
chapters_total=len(self.chapters), chapters_total=len(self.chapters),
) )
log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
except Exception as e: except Exception as e:
log(f"[CTRL_STATE] init_book_state FAILED: {e}") log(f"[CTRL_STATE] init_book_state FAILED: {e}")
# ---------------------------------------------------------
# Cover Download
# --------------------------------------------------------- # ---------------------------------------------------------
def download_cover(self): def download_cover(self):
"""Download one cover image into the root of the book folder."""
if not self.cover_url: if not self.cover_url:
return log(f"[CTRL] No cover URL for '{self.title}'") log(f"[CTRL] No cover URL found for '{self.title}'")
return
cover_path = os.path.join(self.book_base, "cover.jpg") cover_path = os.path.join(self.book_base, "cover.jpg")
headers = { headers = {
"User-Agent": "Mozilla/5.0", "User-Agent": (
"Referer": self.scrape_result.get("book_url") or "", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
),
"Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
} }
try: try:
log(f"[CTRL] Downloading cover: {self.cover_url}") log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers) resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status() resp.raise_for_status()
with open(cover_path, "wb") as f: with open(cover_path, "wb") as f:
f.write(resp.content) f.write(resp.content)
log(f"[CTRL] Cover saved: {cover_path}") log(f"[CTRL] Cover saved to: {cover_path}")
except Exception as e: except Exception as e:
log(f"[CTRL] Cover download failed: {e}") log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
# ---------------------------------------------------------
# Cover Replication to Volumes
# --------------------------------------------------------- # ---------------------------------------------------------
def replicate_cover_to_volumes(self): def replicate_cover_to_volumes(self):
"""Copy cover.jpg into each existing Volume_xxx directory."""
src = os.path.join(self.book_base, "cover.jpg") src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src): if not os.path.exists(src):
log("[CTRL] No cover.jpg found, replication skipped")
return return
try:
for entry in os.listdir(self.book_base): for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"): if entry.lower().startswith("volume_"):
dst = os.path.join(self.book_base, entry, "cover.jpg") vol_dir = os.path.join(self.book_base, entry)
try: dst = os.path.join(vol_dir, "cover.jpg")
shutil.copyfile(src, dst) shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated → {dst}") log(f"[CTRL] Cover replicated into: {dst}")
except Exception as e: except Exception as e:
log(f"[CTRL] Cover replication failed: {e}") log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
def store_cover_in_static(self): def store_cover_in_static(self):
"""
Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
This allows the Flask web UI to serve the cover directly.
"""
src = os.path.join(self.book_base, "cover.jpg") src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src): if not os.path.exists(src):
log("[CTRL] No cover.jpg found, cannot store in static/covers")
return return
os.makedirs("static/covers", exist_ok=True) # static/covers/<book_id>.jpg
dst = os.path.join("static/covers", f"{self.book_idx}.jpg") static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst = os.path.join(static_dir, f"{self.book_id}.jpg")
try: try:
shutil.copyfile(src, dst) shutil.copyfile(src, dst)
log(f"[CTRL] Cover stored for UI: {dst}") log(f"[CTRL] Cover stored for UI: {dst}")
except Exception as e: except Exception as e:
log(f"[CTRL] Failed storing cover: {e}") log(f"[CTRL] Failed to store cover in static: {e}")
# ---------------------------------------------------------
# Volume isolation
# --------------------------------------------------------- # ---------------------------------------------------------
def get_volume_path(self, chapter_num: int) -> str: def get_volume_path(self, chapter_num: int) -> str:
"""Returns the correct volume directory for a chapter."""
vol_index = (chapter_num - 1) // self.max_vol + 1 vol_index = (chapter_num - 1) // self.max_vol + 1
vol_name = f"Volume_{vol_index:03d}" vol_name = f"Volume_{vol_index:03d}"
vol_path = os.path.join(self.book_base, vol_name) vol_path = os.path.join(self.book_base, vol_name)
os.makedirs(vol_path, exist_ok=True) os.makedirs(vol_path, exist_ok=True)
return vol_path return vol_path
# ---------------------------------------------------------
# Pipeline launcher
# --------------------------------------------------------- # ---------------------------------------------------------
def start(self): def start(self):
total = len(self.chapters) total = len(self.chapters)
log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")
# Update Redis/SQLite state log(
f"[CTRL] Initialising pipeline for '{self.title}' "
f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
)
log(f"[CTRL] Output root: {self.book_base}")
# -------------------------------------
# NEW: Redis state update
# -------------------------------------
try: try:
set_status(self.book_idx, "downloading") set_status(self.book_id, "downloading")
set_chapters_total(self.book_idx, total) set_chapter_total(self.book_id, total)
log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
except Exception as e: except Exception as e:
log(f"[CTRL_STATE] Unable to set state: {e}") log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")
# Download cover # -------------------------------------
# 1) Download cover
# -------------------------------------
self.download_cover() self.download_cover()
# Build pipeline tasks
tasks = [] tasks = []
for ch in self.chapters: for ch in self.chapters:
num = ch["num"]
chapter_info = { # Build chapter_dict (NEW)
"num": num, chapter_num = ch["num"]
"url": ch["url"], chapter_url = ch["url"]
"title": ch.get("title"), chapter_title = ch.get("title")
"volume_path": self.get_volume_path(num),
volume_path = self.get_volume_path(chapter_num)
chapter_dict = {
"num": chapter_num,
"url": chapter_url,
"title": chapter_title,
"volume_path": volume_path,
} }
tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))
# Dispatch pipeline with chapter_dict
tasks.append(
build_chapter_pipeline(
self.book_id,
chapter_dict,
self.meta,
)
)
async_result = group(tasks).apply_async() async_result = group(tasks).apply_async()
# Replicate cover + place in static log(
f"[CTRL] Pipelines dispatched for '{self.title}' "
f"(book_id={self.book_id}, group_id={async_result.id})"
)
# Debug abort state
try:
abort_state = abort_requested(self.book_id)
log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
except Exception as e:
log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")
# -------------------------------------------------------
self.replicate_cover_to_volumes() self.replicate_cover_to_volumes()
self.store_cover_in_static() self.store_cover_in_static()
# -------------------------------------------------------
# Generate scripts (LATE IMPORT to avoid circular)
try: try:
scriptgen.generate_all_scripts( generate_all_scripts(
self.book_base, self.title, self.meta["author"] self.book_base,
self.title,
self.meta.get("author"),
) )
log("[CTRL] Scripts generated") log(f"[CTRL] Scripts generated for '{self.title}'")
except Exception as e: except Exception as e:
log(f"[CTRL] Script generation failed: {e}") log(f"[CTRL] Script generation failed: {e}")

@ -1,27 +0,0 @@
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
# Low-level HTML fetch utility shared by all site scrapers.
# Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
)
}
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
"""
Fetch HTML with a consistent user-agent and encoding.
Returns BeautifulSoup(lxml).
"""
resp = requests.get(url, headers=HEADERS, timeout=timeout)
resp.encoding = encoding
return BeautifulSoup(resp.text, "lxml")
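Usage sketch: every site scraper goes through this one fetch path, passing the encoding its pages need (GB18030 for Piaotian); the URL is illustrative:

```
from scraper.engine.fetcher import fetch_html

soup = fetch_html(
    "https://www.ptwxz.com/bookinfo/10/10001.html",  # illustrative URL
    encoding="GB18030",
)
h1 = soup.find("h1")
print(h1.get_text(strip=True) if h1 else "no title found")
```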

@ -1,65 +0,0 @@
# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
"""
Extract ONLY lightweight metadata:
- title
- author
- description
- cover_url
- chapters_total = 0
"""
soup = fetch_html(url, site_scraper.encoding)
title = site_scraper.parse_title(soup)
author = site_scraper.parse_author(soup)
description = site_scraper.parse_description(soup)
cover_url = site_scraper.parse_cover(soup, url)
return {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"book_url": url,
}
def extract_metadata_full(url: str, site_scraper):
"""
Full scraping (metadata + chapterlist).
Used by the scraping Celery pipeline.
"""
soup = fetch_html(url, site_scraper.encoding)
# metadata
meta = extract_metadata_only(url, site_scraper)
# chapter list
chapter_page_url = site_scraper.extract_chapter_page_url(soup)
chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
chapters = site_scraper.parse_chapter_list(chapter_page_soup)
meta["chapters"] = chapters
return meta
def build_book_id(title: str) -> str:
"""
Canonical book_id generator.
SCRAPE currently uses title as ID → preserve that behavior.
"""
return title
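A sketch of how the two entry points divide the work, assuming the PiaotianScraper site class defined later in this diff; the URL is illustrative:

```
from scraper.engine.parser import extract_metadata_only, extract_metadata_full
from scraper.sites.piaotian import PiaotianScraper

site = PiaotianScraper()
url = "https://www.ptwxz.com/bookinfo/10/10001.html"   # illustrative URL

meta = extract_metadata_only(url, site)        # cheap: one page fetch, no chapters
print(meta["title"], meta["chapters_total"])   # chapters_total is always 0 here

full = extract_metadata_full(url, site)        # also resolves the chapter page
print(len(full["chapters"]))
```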

@ -1,33 +0,0 @@
# ============================================================
# File: scraper/logger_decorators.py
# Purpose: Function-call logging decorator
# ============================================================
from functools import wraps
from scraper.logger import log_debug
def logcall(func):
"""
Decorator: log function name + arguments every time it's called.
Usage: @logcall above any function.
"""
@wraps(func)
def wrapper(*args, **kwargs):
# Function name
name = func.__qualname__
# First log line before execution
# log_debug(f"[CALL] {name} args={args} kwargs={kwargs}")
log_debug(f"[CALL] {name} args={args}")
# log_debug(f"[CALL] {name}")
result = func(*args, **kwargs)
# Log after execution
# log_debug(f"[RETURN] {name} → {result}")
return result
return wrapper
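Usage sketch for the decorator; the function is a throwaway example:

```
from scraper.logger_decorators import logcall

@logcall
def add(a, b):           # throwaway example function
    return a + b

add(2, 3)                # emits "[CALL] add args=(2, 3)" via log_debug, returns 5
```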

@ -54,8 +54,6 @@ Copyright=
章节出错= 章节出错=
点此举报= 点此举报=
举报原因= 举报原因=
求收藏=
推荐票=
www.piaotia.com= www.piaotia.com=
www.piaotian.com= www.piaotian.com=
www.= www.=

@ -5,7 +5,6 @@
import os import os
import stat import stat
from logbus.publisher import log from logbus.publisher import log
from scraper.logger_decorators import logcall
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates") TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
@ -44,40 +43,13 @@ def detect_volumes(book_base: str):
# ------------------------------------------------------------ # ------------------------------------------------------------
def build_merge_block(title: str, author: str, volumes): def build_merge_block(title: str, author: str, volumes):
lines = [] lines = []
# --------------------------------------------------------
# Normalize input (defensive)
# --------------------------------------------------------
title = (title or "").strip()
author = (author or "").strip()
total_vols = len(volumes)
# Padding rule:
# - always at least 2 digits (01, 02)
# - 3 digits at >= 100 volumes
if total_vols >= 100:
pad = 3
else:
pad = 2
for num, dirname in volumes: for num, dirname in volumes:
vol_num = f"{num:0{pad}d}" # voor filename
series_part = f"{num:0{pad}d}" # voor series-part (string!)
line = ( line = (
f"m4b-tool merge --jobs=4 " f'm4b-tool merge --jobs=4 --writer="{author}" '
f'--writer="{author}" ' f'--albumartist="{author}" --album="{title}" '
f'--sortalbum="{title}" ' f'--name="{title}" --output-file="{title}-{num}.m4b" '
f'--albumartist="{author}" '
f'--album="{title}" '
f'--name="{title}" '
f'--series="{title}" '
f'--series-part="{series_part}" '
f'--output-file="{title}-{vol_num}.m4b" '
f'"{dirname}" -vvv' f'"{dirname}" -vvv'
) )
lines.append(line) lines.append(line)
if not lines: if not lines:
@ -89,14 +61,7 @@ def build_merge_block(title: str, author: str, volumes):
# ------------------------------------------------------------ # ------------------------------------------------------------
# Main generator # Main generator
# ------------------------------------------------------------ # ------------------------------------------------------------
@logcall
def generate_all_scripts(book_base: str, title: str, author: str): def generate_all_scripts(book_base: str, title: str, author: str):
# --------------------------------------------------------
# Defensive normalize
# --------------------------------------------------------
title = (title or "").strip()
author = (author or "").strip()
log(f"[SCRIPTGEN] Generating scripts in {book_base}") log(f"[SCRIPTGEN] Generating scripts in {book_base}")
# Load templates # Load templates

@ -1,94 +0,0 @@
# ============================================================
# File: scraper/services/audio_completion.py
# Purpose:
# Orchestration hook after audio completion.
#
# Rules (STRICT):
# - ALWAYS read via get_book_state()
# - Use ONLY merged counters from repository
# - NO usage of derived status field
# - Completion rule:
# audio_completed < chapters_total → NOT DONE
# ============================================================
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_book_state,
try_trigger_statuscheck,
)
from scraper.services.status_check_service import StatusCheckService
from scraper.tasks.m4b_tasks import queue_m4b_for_book
@logcall
def trigger_audio_completion_check(book_idx: str):
"""
Called after inc_audio_done() OR inc_audio_skipped().
Flow:
1. Fetch canonical merged state from repository
2. Evaluate completion via merged counters ONLY
3. Run filesystem validation (authoritative)
4. Apply idempotency guard
5. Queue m4b exactly once
"""
try:
# ----------------------------------------------------
# STEP 1 — CANONICAL MERGED STATE
# ----------------------------------------------------
state = get_book_state(book_idx)
chapters_total = int(state.get("chapters_total", 0))
audio_done = int(state.get("audio_done", 0))
audio_skipped = int(state.get("audio_skipped", 0))
audio_completed = audio_done + audio_skipped
log(
f"[AUDIO-COMPLETION] book={book_idx} "
f"audio_completed={audio_completed} chapters_total={chapters_total}"
)
# ----------------------------------------------------
# STEP 2 — FAST REJECT (MERGED COUNTERS ONLY)
# ----------------------------------------------------
if chapters_total <= 0 or audio_completed < chapters_total:
log(f"[AUDIO-COMPLETION] not yet complete for book={book_idx}")
return
# ----------------------------------------------------
# STEP 3 — FILESYSTEM VALIDATION (AUTHORITATIVE)
# ----------------------------------------------------
result = StatusCheckService.run(book_idx)
fs = result.get("filesystem", {})
audio_files = fs.get("audio_files", 0)
chapters_txt = fs.get("chapters_txt", 0)
effective_audio = audio_files + audio_skipped
if effective_audio < chapters_txt:
log(
f"[AUDIO-COMPLETION] FS validation failed "
f"(audio_files={audio_files}, skipped={audio_skipped}, txt={chapters_txt})"
)
return
# ----------------------------------------------------
# STEP 4 — IDEMPOTENCY GUARD (AFTER FS CONFIRMATION)
# ----------------------------------------------------
if not try_trigger_statuscheck(book_idx):
log(f"[AUDIO-COMPLETION] statuscheck already triggered for {book_idx}")
return
# ----------------------------------------------------
# STEP 5 — FINAL ACTION
# ----------------------------------------------------
log(f"[AUDIO-COMPLETION] DONE → queue m4b for book={book_idx}")
queue_m4b_for_book(book_idx)
except Exception as exc:
# MUST NEVER break audio workers
log(f"[AUDIO-COMPLETION][ERROR] book={book_idx} error={exc}")

@ -1,45 +0,0 @@
# ============================================================
# File: scraper/services/cover_service.py
# ============================================================
import os
import requests
from logbus.publisher import log
from typing import Optional
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> Optional[str]:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.
"""
if not cover_url:
log(f"[COVER] No cover URL for book={book_id}")
return None
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst_path = os.path.join(static_dir, f"{book_id}.jpg")
try:
log(f"[COVER] Downloading: {cover_url}")
resp = requests.get(
cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
)
resp.raise_for_status()
with open(dst_path, "wb") as f:
f.write(resp.content)
log(f"[COVER] Stored: {dst_path}")
return dst_path
except Exception as e:
log(f"[COVER] FAILED ({cover_url}) → {e}")
return None

@ -1,95 +0,0 @@
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_idx
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
from scraper.logger_decorators import logcall
class InitService:
# ------------------------------------------------------------
# BOOK IDX DERIVATION
# ------------------------------------------------------------
@staticmethod
@logcall
def derive_book_id(url: str) -> str:
"""
PTWXZ URL format ends with /{id}.html.
If no match → fallback to sanitized URL.
Returns:
book_idx (string)
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
# Fallback — ensures deterministic ID for unknown formats
return url.replace("/", "_").replace(":", "_")
# ------------------------------------------------------------
# MAIN INIT FLOW
# ------------------------------------------------------------
@staticmethod
@logcall
def execute(url: str) -> dict:
"""
INIT entry point.
Returns complete metadata + registration result.
"""
# 1) Resolve site handler
site = SiteResolver.resolve(url)
# 2) Create unified book_idx
book_idx = InitService.derive_book_id(url)
# Some site objects historically expect .book_id — we support it but DO NOT rely on it.
site.book_id = book_idx
# 3) Fetch initial metadata (title/author/description/cover)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
author = meta.get("author")
description = meta.get("description")
cover_url = meta.get("cover_url")
# 4) Download & store main cover for UI
cover_path = CoverService.download_main_cover(cover_url, book_idx)
# 5) Register in SQLite (book_idx is the SOLE primary ID)
register_book(
book_idx=book_idx,
title=title,
author=author,
description=description,
cover_url=cover_url,
cover_path=cover_path,
book_url=url,
)
# 6) Return metadata for UI / API
return {
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"status": "registered",
}
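derive_book_id is self-contained; a behaviour sketch covering both URL shapes (the example URLs are illustrative):

```
import re

def derive_book_id(url: str) -> str:
    m = re.search(r"/(\d+)\.html$", url)
    if m:
        return m.group(1)
    return url.replace("/", "_").replace(":", "_")

assert derive_book_id("https://www.ptwxz.com/bookinfo/10/10001.html") == "10001"
# Unknown format → deterministic sanitized fallback:
assert derive_book_id("https://example.com/book") == "https___example.com_book"
```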

@ -1,286 +0,0 @@
# ============================================================
# File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose:
# Unified scraping engine for INIT-flow and Celery tasks.
# ScrapeEngine does NOT determine book_idx itself.
# ============================================================
import os
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from logbus.publisher import log
from scraper.logger import log_debug
from scraper.logger_decorators import logcall
from scraper.utils.utils import load_replacements
class ScrapeEngine:
"""
Central scraping engine.
Metadata + chapterlist scraping.
All methods logged with @logcall.
IMPORTANT:
- ScrapeEngine NEVER decides book_idx.
- No dependency on InitService (prevents circular import).
"""
# ------------------------------------------------------------
# REPLACEMENTS LOADER
# ------------------------------------------------------------
@staticmethod
@logcall
def _apply_replacements(site):
fp = os.path.join(os.getcwd(), "replacements.txt")
extra = load_replacements(fp)
if not hasattr(site, "replacements"):
site.replacements = {}
site.replacements.update(extra)
return True
# ------------------------------------------------------------
# RATE LIMITER
# ------------------------------------------------------------
MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
@staticmethod
@logcall
def _throttle(last_time=[0]):
now = time.time()
elapsed = now - last_time[0]
if elapsed < ScrapeEngine.MIN_DELAY:
time.sleep(ScrapeEngine.MIN_DELAY - elapsed)
last_time[0] = time.time()
return True
# ------------------------------------------------------------
# HTTP GET
# ------------------------------------------------------------
@staticmethod
@logcall
def _get_doc(url: str, site):
attempt = 1
while True:
ScrapeEngine._throttle()
log_debug(f"[SCRAPER] GET {url} (attempt {attempt})")
try:
resp = requests.get(
url,
headers={"User-Agent": "Mozilla/5.0"},
timeout=10,
)
except Exception as e:
log_debug(f"Network error {e} → retry {attempt + 1}s")
time.sleep(attempt + 1)
attempt += 1
continue
code = resp.status_code
if code == 200:
resp.encoding = getattr(site, "encoding", "utf-8")
return BeautifulSoup(resp.text, "lxml")
if code == 429:
cooldown = 60
log_debug("429 detected — cooldown 60s")
for i in range(cooldown, 0, -1):
log_debug(f" cooldown {i}s…")
time.sleep(1)
attempt += 1
continue
if code in (403, 500):
wait = min(5 * attempt, 30)
log_debug(f"HTTP {code} → retry in {wait}s")
time.sleep(wait)
attempt += 1
continue
wait = attempt + 1
log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
time.sleep(wait)
attempt += 1
# ------------------------------------------------------------
# PARSER HELPERS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_title(soup):
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownTitle"
@staticmethod
@logcall
def _parse_author(soup):
td = soup.find("td", string=lambda t: t and "" in t)
if td and "" in td.get_text():
return td.get_text(strip=True).split("")[1]
return "UnknownAuthor"
@staticmethod
@logcall
def _parse_description(soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
txt = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if txt:
parts.append(txt)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSER (NO InitService dependency)
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_cover(soup, site):
"""
Extract book index from URL heuristically instead of InitService
(prevents circular import).
"""
# Typical Chinese novel sites embed numeric ID in URL path
try:
parsed = urlparse(site.url)
digits = re.findall(r"\d+", parsed.path)
book_idx = digits[-1] if digits else None
except Exception:
book_idx = None
imgs = soup.find_all("img", src=True)
candidates = []
for img in imgs:
src = img["src"].strip()
filename = os.path.basename(src)
if book_idx and book_idx in filename:
candidates.append((filename, src))
if not candidates:
return None
candidates.sort(key=lambda t: len(t[0])) # smallest filename
return urljoin(site.root, candidates[0][1])
# ------------------------------------------------------------
# RESOLVE CHAPTER PAGE
# ------------------------------------------------------------
@staticmethod
@logcall
def _resolve_chapter_page(soup, site):
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
if not node:
raise ValueError("Could not locate chapter list base node")
href = node.select_one("a").get("href")
url = urljoin(site.root, href)
parsed = urlparse(url)
basepath = parsed.path.rsplit("/", 1)[0] + "/"
chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}"
return url, chapter_base
# ------------------------------------------------------------
# PARSE CHAPTER LINKS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_chapter_links(soup, chapter_base, selector):
cont = soup.select_one(selector)
if not cont:
return []
items = cont.select("ul li a[href]")
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(chapter_base, href)
chapters.append({"num": idx, "title": title, "url": full})
idx += 1
return chapters
# ============================================================
# PUBLIC APIS
# ============================================================
@staticmethod
@logcall
def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url # needed for cover parsing
return {
"title": ScrapeEngine._parse_title(soup),
"author": ScrapeEngine._parse_author(soup),
"description": ScrapeEngine._parse_description(soup),
"cover_url": ScrapeEngine._parse_cover(soup, site),
"book_url": url,
}
@staticmethod
@logcall
def fetch_metadata_and_chapters(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url
title = ScrapeEngine._parse_title(soup)
author = ScrapeEngine._parse_author(soup)
desc = ScrapeEngine._parse_description(soup)
cover = ScrapeEngine._parse_cover(soup, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
chapters = ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
return {
"title": title,
"author": author,
"description": desc,
"cover_url": cover,
"chapters": chapters,
"chapters_total": len(chapters),
"book_url": url,
}
@staticmethod
@logcall
def fetch_chapterlist(site, url: str):
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
return ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
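The throttle is a module-wide rate limiter built on a mutable default argument; a standalone sketch with an assumed MAX_DOWNLOADS_PER_SEC of 2:

```
import time

MAX_DOWNLOADS_PER_SEC = 2.0            # assumed setting; the env default is 1
MIN_DELAY = 1.0 / MAX_DOWNLOADS_PER_SEC

def throttle(last_time=[0.0]):
    # the mutable default persists across calls, like _throttle above
    elapsed = time.time() - last_time[0]
    if elapsed < MIN_DELAY:
        time.sleep(MIN_DELAY - elapsed)
    last_time[0] = time.time()

start = time.time()
for _ in range(3):
    throttle()
print(f"3 throttled calls took {time.time() - start:.2f}s")   # at least ~1.0s
```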

@ -1,20 +0,0 @@
# ============================================================
# File: scraper/services/site_resolver.py
# Purpose:
# Determine which BookSite implementation applies for a given URL.
# This keeps INIT-flow and SCRAPE-flow site-agnostic.
# ============================================================
from scraper.sites import BookSite # current PTWXZ implementation
class SiteResolver:
"""
Resolves the correct BookSite class based on URL.
Currently only PTWXZ/Piaotian is supported.
"""
@staticmethod
def resolve(url: str):
# Later: add more domain rules for other sources
return BookSite()

@ -1,135 +0,0 @@
# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
# Manual, idempotent status check per book.
#
# Determines, based on the filesystem:
# - the number of downloaded chapters (.txt)
# - the number of generated audio files (.m4b)
#
# and writes this validated reality to SQL.
#
# NOTE:
# - No Redis
# - No Celery
# - No status transitions
# - No pipeline logic
# ============================================================
import os
from datetime import datetime
from typing import Dict, Any
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.state_sql import sql_fetch_book, sql_update_book
class StatusCheckService:
"""
Filesystem-based status check.
Single source of truth = disk.
"""
@staticmethod
@logcall
def run(book_idx: str) -> Dict[str, Any]:
"""
Run the status check for a single book.
Returns an inspectable dict with:
- filesystem counts
- SQL before / after snapshot
"""
# ----------------------------------------------------
# 1. SQL fetch (does the book exist?)
# ----------------------------------------------------
sql_before = sql_fetch_book(book_idx)
if not sql_before:
raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")
# ----------------------------------------------------
# 2. Determine the filesystem root
# ----------------------------------------------------
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
title = sql_before.get("title")
book_dir = os.path.join(output_root, title)
if not os.path.isdir(book_dir):
log(
f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}')"
)
chapters_txt = 0
audio_files = 0
volumes = 0
else:
chapters_txt = 0
audio_files = 0
volumes = 0
# ------------------------------------------------
# 3. Scan volumes
# ------------------------------------------------
for entry in os.listdir(book_dir):
if not entry.lower().startswith("volume_"):
continue
volumes += 1
volume_path = os.path.join(book_dir, entry)
if not os.path.isdir(volume_path):
continue
# ---- TXT chapters ----
for fname in os.listdir(volume_path):
if fname.lower().endswith(".txt"):
chapters_txt += 1
# ---- Audio ----
audio_dir = os.path.join(volume_path, "Audio")
if os.path.isdir(audio_dir):
for fname in os.listdir(audio_dir):
if fname.lower().endswith(".m4b"):
audio_files += 1
# ----------------------------------------------------
# 4. SQL update (snapshot)
# ----------------------------------------------------
now = datetime.utcnow().isoformat(timespec="seconds")
update_fields = {
"downloaded": chapters_txt,
"audio_done": audio_files,
"last_update": now,
}
sql_update_book(book_idx, update_fields)
sql_after = sql_fetch_book(book_idx)
# ----------------------------------------------------
# 5. Result for inspect/debug
# ----------------------------------------------------
result = {
"book_idx": book_idx,
"filesystem": {
"book_dir": book_dir,
"exists": os.path.isdir(book_dir),
"volumes": volumes,
"chapters_txt": chapters_txt,
"audio_files": audio_files,
},
"sql_before": sql_before,
"sql_after": sql_after,
"notes": [],
}
log(
f"[STATUSCHECK] book_idx={book_idx} "
f"chapters={chapters_txt} audio={audio_files}"
)
return result
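The disk-as-truth counting rule reduces to one directory walk; a sketch assuming the layout output/<title>/Volume_NNN/*.txt with audio under Volume_NNN/Audio/*.m4b:

```
import os

def count_book_files(book_dir: str):
    volumes = chapters_txt = audio_files = 0
    for entry in os.listdir(book_dir):
        vol = os.path.join(book_dir, entry)
        if not entry.lower().startswith("volume_") or not os.path.isdir(vol):
            continue
        volumes += 1
        chapters_txt += sum(f.lower().endswith(".txt") for f in os.listdir(vol))
        audio = os.path.join(vol, "Audio")
        if os.path.isdir(audio):
            audio_files += sum(f.lower().endswith(".m4b") for f in os.listdir(audio))
    return volumes, chapters_txt, audio_files
```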

@ -1,28 +0,0 @@
# ============================================================
# File: scraper/sites/__init__.py
# Purpose:
# Site autodetection based on URL.
# ============================================================
from scraper.sites.piaotian import PiaotianScraper
def get_scraper_for_url(url: str):
"""
Return the correct scraper instance for a given URL.
Later: add more site implementations.
"""
if "ptwxz" in url or "piaotian" in url:
return PiaotianScraper()
raise ValueError(f"No scraper available for URL: {url}")
# ============================================================
# Backwards-compatibility export for legacy BookScraper
# ============================================================
# Old code expects:
# from scraper.sites import BookSite
# We map that to our new PiaotianScraper implementation.
BookSite = PiaotianScraper

@ -1,52 +0,0 @@
# ============================================================
# File: scraper/sites/base.py
# Purpose:
# Abstract interface that every site-specific scraper must implement.
# ============================================================
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from typing import Optional
class SiteScraper(ABC):
"""
Defines the interface for site-specific scrapers.
Each concrete scraper (Piaotian, Biquge, etc.) must implement these.
"""
@property
@abstractmethod
def root(self) -> str: ...
@property
@abstractmethod
def encoding(self) -> str: ...
@property
@abstractmethod
def chapter_list_selector(self) -> str: ...
# --------------------------
# Metadata extraction
# --------------------------
@abstractmethod
def parse_title(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_author(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: ...
# --------------------------
# Chapter extraction
# --------------------------
@abstractmethod
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_chapter_list(self, soup: BeautifulSoup) -> list: ...

@ -1,121 +0,0 @@
# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import Optional
class PiaotianScraper(SiteScraper):
root = "https://www.ptwxz.com"
encoding = "GB18030"
chapter_list_selector = "div.centent"
# ------------------------------------------------------------
# METADATA PARSING
# ------------------------------------------------------------
def parse_title(self, soup: BeautifulSoup) -> str:
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownBook"
def parse_author(self, soup: BeautifulSoup) -> str:
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
return raw.split("")[1] if "" in raw else "UnknownAuthor"
def parse_description(self, soup: BeautifulSoup) -> str:
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
# stop when next <span> reappears
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSING
# (exactly your BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:
return None
book_id = m.group(1)
# Extract vol (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", url)
volume = m2.group(1) if m2 else None
imgs = soup.find_all("img", src=True)
chosen = None
# Priority 1: match "/files/article/image/{vol}/{book_id}/"
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
break
# Priority 2: endswith "/{book_id}s.jpg"
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
break
if not chosen:
return None
return urljoin(self.root, chosen)
# ------------------------------------------------------------
# CHAPTER EXTRACTION
# ------------------------------------------------------------
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
return urljoin(self.root, href)
def parse_chapter_list(self, soup: BeautifulSoup) -> list:
cont = soup.select_one(self.chapter_list_selector)
items = cont.select("ul li a[href]") if cont else []
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full_url = urljoin(self.root, href)
chapters.append({"num": idx, "title": title, "url": full_url})
idx += 1
return chapters

@ -1,8 +1,5 @@
# ============================================================ # ============================================================
# File: scraper/tasks/audio_tasks.py # File: scraper/tasks/audio_tasks.py
# Purpose: Convert chapter text files into audio using macOS
# “say”, with Redis-based slot control.
# Updated: now uses db.repository for audio counters.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
@ -10,81 +7,61 @@ from logbus.publisher import log
import os import os
import subprocess import subprocess
import time import time
import socket
import os
from scraper.progress import inc_audio_done, inc_audio_skipped
# from db.repository import inc_audio_done
from scraper.abort import abort_requested from scraper.abort import abort_requested
from scraper.logger_decorators import logcall
from redis import Redis from redis import Redis
from urllib.parse import urlparse from urllib.parse import urlparse
from scraper.services.audio_completion import trigger_audio_completion_check
# NEW — unified repository façade # Choose local redis if present, otherwise the default backend
from db.repository import ( redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
inc_audio_done,
inc_audio_skipped,
)
HOST = socket.gethostname() parsed = urlparse(redis_url)
# ------------------------------------------------------------ # ------------------------------------------------------------
# REDIS CLIENT SETUP # REGULAR REDIS CLIENT (slots, file checks, state)
# ------------------------------------------------------------ # ------------------------------------------------------------
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# Slot locking Redis client
redis_client = Redis( redis_client = Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=parsed.path.strip("/"), db=parsed.path.strip("/"),
) )
# Abort + global progress flags always live in DB 0 # ------------------------------------------------------------
# BACKEND CLIENT (abort flags, progress counters) - always DB 0
# ------------------------------------------------------------
backend_client = Redis( backend_client = Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=0, db=0,
) )
# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300")) AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300"))
AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi") AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi")
AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200")) AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output") HOST_PATH = os.getenv("HOST_PATH", "/app/output")
CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1")) AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")
# ============================================================
# CELERY TASK
# ============================================================
@celery_app.task(bind=True, queue="audio", ignore_result=True) @celery_app.task(bind=True, queue="audio", ignore_result=True)
@logcall
def generate_audio( def generate_audio(
self, book_id, volume_name, chapter_number, chapter_title, chapter_path self, book_id, volume_name, chapter_number, chapter_title, chapter_text
): ):
""" log(f"[AUDIO] CH{chapter_number}: START task → raw_input={chapter_text}")
chapter_path: absolute container path to chapter text file.
"""
log(f"[AUDIO]({HOST}) CH{chapter_number}: START → {chapter_title}")
# ------------------------------------------------------------ # Abort early
# ABORT CHECK
# ------------------------------------------------------------
if abort_requested(book_id, backend_client): if abort_requested(book_id, backend_client):
inc_audio_skipped(book_id) inc_audio_skipped(book_id)
log(f"[AUDIO]({HOST}) ABORT detected → skip CH{chapter_number}") log(f"[AUDIO] ABORT detected → skip CH{chapter_number}")
return return
# ------------------------------------------------------------ # ============================================================
# ACQUIRE SLOT # ACQUIRE AUDIO SLOT
# ------------------------------------------------------------ # ============================================================
slot_key = None slot_key = None
ttl = AUDIO_TIMEOUT + 15 ttl = AUDIO_TIMEOUT + 15
@ -95,13 +72,11 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}") log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}")
break break
# Need to wait
if slot_key is None: if slot_key is None:
log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting") log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting...")
start_wait = time.time() start_wait = time.time()
while slot_key is None: while slot_key is None:
# Try all slots again
for i in range(1, AUDIO_SLOTS + 1): for i in range(1, AUDIO_SLOTS + 1):
key = f"audio_slot:{i}" key = f"audio_slot:{i}"
if redis_client.set(key, "1", nx=True, ex=ttl): if redis_client.set(key, "1", nx=True, ex=ttl):
@ -109,32 +84,32 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait") log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait")
break break
# If still no slot if slot_key:
if not slot_key: break
if abort_requested(book_id, backend_client): if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}") log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
inc_audio_skipped(book_id)
return return
if time.time() - start_wait > ttl: if time.time() - start_wait > ttl:
log(f"[AUDIO] CH{chapter_number}: Wait timeout → abort audio") log(f"[AUDIO] CH{chapter_number}: Slot wait timeout → aborting audio")
inc_audio_skipped(book_id)
return return
time.sleep(0.25) time.sleep(0.25)
# ------------------------------------------------------------ # ============================================================
# PATH NORMALISATION # PATH NORMALISATION
# ------------------------------------------------------------ # ============================================================
container_path = chapter_path
container_path = chapter_text
# Fix 1 — container_path may be None → abort without crashing
if not container_path: if not container_path:
log(f"[AUDIO] CH{chapter_number}: ERROR — no input file path provided") log(f"[AUDIO] CH{chapter_number}: FATAL — no input path provided")
redis_client.delete(slot_key) redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return return
# Strip container prefix so that host path is resolvable # Fix 2 — safe startswith
if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX): if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX):
relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/") relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/")
else: else:
@ -143,36 +118,35 @@ def generate_audio(
parts = relative_path.split("/") parts = relative_path.split("/")
if len(parts) < 3: if len(parts) < 3:
log( log(
f"[AUDIO] CH{chapter_number}: ERROR — cannot parse book/volume from {relative_path}" f"[AUDIO] CH{chapter_number}: FATAL — cannot parse book/volume from {relative_path}"
) )
redis_client.delete(slot_key) redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return return
# book_from_path = parts[0] # volume_name passed explicitly anyway book_from_path = parts[0]
# volume_from_path = parts[1] volume_from_path = parts[1]
host_path = os.path.join(HOST_PATH, relative_path) host_path = os.path.join(HOST_PATH, relative_path)
# ------------------------------------------------------------ # ============================================================
# OUTPUT DIRECTORY # OUTPUT PREP
# ------------------------------------------------------------ # ============================================================
base_dir = os.path.join(HOST_PATH, parts[0], parts[1], "Audio")
base_dir = os.path.join(HOST_PATH, book_from_path, volume_from_path, "Audio")
os.makedirs(base_dir, exist_ok=True) os.makedirs(base_dir, exist_ok=True)
safe_num = f"{chapter_number:04d}" safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4b") audio_file = os.path.join(base_dir, f"{safe_num}.m4b")
# Skip if audio already exists
if os.path.exists(audio_file): if os.path.exists(audio_file):
log(f"[AUDIO] CH{chapter_number}: Already exists → skip") log(f"[AUDIO] Skip CH{chapter_number} → already exists")
redis_client.delete(slot_key) redis_client.delete(slot_key)
inc_audio_skipped(book_id)
trigger_audio_completion_check(book_id)
return return
# ------------------------------------------------------------ # ============================================================
# BUILD TTS COMMAND # BUILD CMD
# ------------------------------------------------------------ # ============================================================
cmd = ( cmd = (
f"say --voice={AUDIO_VOICE} " f"say --voice={AUDIO_VOICE} "
f"--input-file='{host_path}' " f"--input-file='{host_path}' "
@ -183,36 +157,30 @@ def generate_audio(
f"--data-format=aac" f"--data-format=aac"
) )
log(f"[AUDIO]({HOST}) CH{chapter_number} → output: {audio_file}") log(f"[AUDIO] CH{chapter_number}: CMD = {cmd}")
# ------------------------------------------------------------ # ============================================================
# EXECUTE # RUN TTS
# ------------------------------------------------------------ # ============================================================
try: try:
subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT) subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
# NEW — repository façade
inc_audio_done(book_id) inc_audio_done(book_id)
trigger_audio_completion_check(book_id) log(f"[AUDIO] CH{chapter_number}: Completed")
log(f"trigger_audio_completion_check ")
log(f"[AUDIO]({HOST}) CH{chapter_number}: Completed")
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
log(f"[AUDIO]({HOST}) CH{chapter_number}: TIMEOUT → removing file") log(f"[AUDIO] CH{chapter_number}: TIMEOUT → remove incomplete file")
if os.path.exists(audio_file): if os.path.exists(audio_file):
try: try:
os.remove(audio_file) os.remove(audio_file)
except Exception: except Exception:
pass pass
inc_audio_skipped(book_id)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}") log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}")
inc_audio_skipped(book_id)
except Exception as e: except Exception as e:
log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}") log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}")
inc_audio_skipped(book_id)
finally: finally:
if slot_key: if slot_key:

@ -1,167 +1,106 @@
# ============================================================ # ============================================================
# File: scraper/tasks/controller_tasks.py # File: scraper/tasks/controller_tasks.py
# Purpose: # Purpose:
# FULL scrape entrypoint + launching download/parse/save pipelines. # Start the download → parse → save pipeline for a scraped book,
# NO result.get() anywhere. Scraping is done inline. # including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from logbus.publisher import log from logbus.publisher import log
import os from scraper.download_controller import DownloadController
import time from scraper.progress import (
import redis set_total,
)
from urllib.parse import urlparse from urllib.parse import urlparse
import redis
from scraper.logger_decorators import logcall import os
from scraper.abort import abort_requested from scraper.abort import abort_requested
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.site_resolver import SiteResolver
from db.repository import fetch_book, set_chapters_total
from scraper.download_controller import DownloadController
print(">>> [IMPORT] controller_tasks.py loaded") print(">>> [IMPORT] controller_tasks.py loaded")
# ============================================================= @celery_app.task(bind=True, queue="controller", ignore_result=False)
# 1) PUBLIC ENTRYPOINT — CALLED FROM /start def launch_downloads(self, book_id: str, scrape_result: dict):
# =============================================================
@celery_app.task(
bind=True,
queue="controller",
ignore_result=False,
name="scraper.tasks.controller_tasks.start_full_scrape",
)
@logcall
def start_full_scrape(self, book_idx: str):
"""
FULL SCRAPE ENTRYPOINT.
Scraping is done inline; no Celery .get() needed.
""" """
Launch the entire pipeline (download → parse → save),
AND initialize progress counters.
log(f"[CTRL] start_full_scrape(book_idx={book_idx})") Chapter-level progress is updated INSIDE the download/parse/save tasks.
This task MUST NOT call .get() on async subtasks (Celery restriction).
# Abort before doing anything
if abort_requested(book_idx):
log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
# --------------------------------------------------------
# 1) Load book metadata from SQLite
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
msg = f"[CTRL] Book '{book_idx}' not found in DB"
log(msg)
raise ValueError(msg)
url = book.get("book_url")
if not url:
msg = f"[CTRL] No book_url stored for {book_idx}"
log(msg)
raise ValueError(msg)
# --------------------------------------------------------
# 2) INLINE SCRAPE (fast, no Celery wait)
# --------------------------------------------------------
site = SiteResolver.resolve(url)
try:
scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
except Exception as e:
log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
raise
# --------------------------------------------------------
# 3) Continue → dispatch pipelines
# --------------------------------------------------------
return launch_downloads(book_idx, scrape_result)
# =============================================================
# 2) PIPELINE DISPATCH (NOT a Celery task)
# =============================================================
@logcall
def launch_downloads(book_idx: str, scrape_result: dict):
"""
Launches the entire processing pipeline:
- initialize Redis UI state
- initialize SQLite totals
- dispatch per-chapter pipelines via DownloadController
""" """
title = scrape_result.get("title", "UnknownBook") title = scrape_result.get("title", "UnknownBook")
chapters = scrape_result.get("chapters", []) or [] chapters = scrape_result.get("chapters", []) or []
total = len(chapters) total = len(chapters)
# ------------------------------------------------------------ # ------------------------------------------------------------
# INIT REDIS STATE # INIT BOOK STATE MODEL (required for Active Books dashboard)
# ------------------------------------------------------------ # ------------------------------------------------------------
broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0") broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
parsed = urlparse(broker_url) parsed = urlparse(broker_url)
r = redis.Redis( state = redis.Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=int(parsed.path.strip("/")), db=int(parsed.path.strip("/")),
decode_responses=True, decode_responses=True,
) )
base = f"book:{book_idx}:state" # Book metadata
state.set(f"book:{book_id}:title", title)
state.set(f"book:{book_id}:status", "starting")
# Download counters
state.set(f"book:{book_id}:download:total", total)
state.set(f"book:{book_id}:download:done", 0)
r.hset(base, "title", title) # Audio counters (start at zero)
r.hset(base, "status", "starting") state.set(f"book:{book_id}:audio:done", 0)
r.hset(base, "chapters_total", total)
r.hset(base, "chapters_download_done", 0)
r.hset(base, "chapters_download_skipped", 0)
r.hset(base, "chapters_parsed_done", 0)
r.hset(base, "audio_done", 0)
r.hset(base, "audio_skipped", 0)
r.hset(base, "last_update", int(time.time()))
# ------------------------------------------------------------ # ------------------------------------------------------------
# INIT SQLITE SNAPSHOT # INIT PROGRESS
# ------------------------------------------------------------ # ------------------------------------------------------------
try: set_total(book_id, total)
set_chapters_total(book_idx, total) log(f"[CTRL] Progress initialized for {book_id}: total={total}")
except Exception as e:
log(f"[CTRL] ERROR updating SQLite totals: {e}")
raise
log(f"[CTRL] Initialized totals for {book_idx}: {total}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# ABORT CHECK BEFORE LAUNCHING JOBS # BUILD CONTROLLER
# ------------------------------------------------------------ # ------------------------------------------------------------
if abort_requested(book_idx): ctl = DownloadController(book_id, scrape_result)
log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
r.hset(base, "status", "aborted")
return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}
# ------------------------------------------------------------ # ------------------------------------------------------------
# BUILD + DISPATCH PER-CHAPTER PIPELINES # START PIPELINES (ASYNC)
# Returns a celery group AsyncResult. We DO NOT iterate or get().
# Progress & failures are handled by the worker subtasks.
# ------------------------------------------------------------ # ------------------------------------------------------------
controller = DownloadController(book_idx, scrape_result)
try: try:
group_result = controller.start() group_result = ctl.start()
gid = getattr(group_result, "id", None)
log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})") log(
except Exception as e: f"[CTRL] Pipelines dispatched for '{title}' "
log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}") f"(book_id={book_id}, group_id={group_result.id})"
)
# Abort flag set BEFORE tasks start?
if abort_requested(book_id):
log(f"[CTRL] ABORT requested before tasks start")
return {"book_id": book_id, "aborted": True}
except Exception as exc:
log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
raise raise
# Update UI state to "downloading" # ------------------------------------------------------------
r.hset(base, "status", "downloading") # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
r.hset(base, "last_update", int(time.time())) # (Download/parse/save tasks update progress themselves)
# ------------------------------------------------------------
log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
return { return {
"book_idx": book_idx, "book_id": book_id,
"total": total, "total": total,
"started": True, "started": True,
"group_id": gid, "group_id": group_result.id,
} }
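# ------------------------------------------------------------
# For reference, a sketch of the UI state hash initialized above
# (left-hand version of this diff; field values are hypothetical):
#
#   HGETALL book:<book_idx>:state
#     title                      "Some Book"
#     status                     "downloading"
#     chapters_total             "1200"
#     chapters_download_done     "0"
#     chapters_download_skipped  "0"
#     chapters_parsed_done       "0"
#     audio_done                 "0"
#     audio_skipped              "0"
#     last_update                "<unix timestamp>"
# ------------------------------------------------------------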

@ -1,24 +1,26 @@
# ============================================================ # ============================================================
# File: scraper/tasks/download_tasks.py # File: scraper/tasks/download_tasks.py
# Purpose: # Purpose: Download chapter HTML with global concurrency,
# Download chapter HTML into payload["html"]. # retry/backoff logic, 429 support, and abort-awareness.
# Updated for book_idx unified ID model. #
# Logging:
# - timestamp + book_id in message
# - logbus.publisher → console
# - ui_log.push_ui → Redis GUI
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from scraper.utils.utils import get_save_path from scraper.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started from scraper.abort import abort_requested, chapter_started, mark_chapter_started
# Unified repository façade from scraper.progress import (
from db.repository import ( inc_completed,
set_status, inc_chapter_done,
inc_download_done, inc_chapter_download_skipped,
inc_download_skipped,
) )
from db.repository import inc_downloaded, set_status
from logbus.publisher import log from logbus.publisher import log
from scraper.ui_log import push_ui from scraper.ui_log import push_ui
from scraper.logger_decorators import logcall
import requests import requests
import redis import redis
@ -33,9 +35,9 @@ print(">>> [IMPORT] download_tasks.py loaded")
# ----------------------------------------------------------- # -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER # TIMESTAMPED LOG WRAPPER
# ----------------------------------------------------------- # -----------------------------------------------------------
def log_msg(book_idx: str, message: str): def log_msg(book_id: str, message: str):
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
full = f"{ts} [{book_idx}] {message}" full = f"{ts} [{book_id}] {message}"
log(full) log(full)
push_ui(full) push_ui(full)
@ -88,63 +90,46 @@ def release_global_slot():
# ============================================================ # ============================================================
# CELERY TASK — Payload v3 (book_idx model) # CELERY TASK — NEW SIGNATURE WITH chapter_dict + book_meta
# ============================================================ # ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False) @celery_app.task(bind=True, queue="download", ignore_result=False)
@logcall def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
def download_chapter(self, payload: dict):
""" """
Payload format: New unified chapter model:
chapter_dict = {
{
"book_idx": str,
"chapter": {
"num": int, "num": int,
"title": str,
"url": str, "url": str,
"title": str,
"volume_path": str "volume_path": str
},
"book_meta": dict,
# fields filled during pipeline:
"html": None | str,
"parsed": None | str,
"skipped": bool,
"path": None | str
} }
"""
if not payload: book_meta is propagated through the pipeline for parse/save.
raise ValueError("download_chapter received empty payload") """
book_idx = payload["book_idx"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
chapter_num = chapter["num"]
chapter_url = chapter["url"]
chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
volume_path = chapter["volume_path"]
# ----------------------------------------------------------- chapter_num = chapter_dict.get("num")
# STATUS UPDATE (book is now in 'downloading') chapter_url = chapter_dict.get("url")
# ----------------------------------------------------------- chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
set_status(book_idx, "downloading") volume_path = chapter_dict.get("volume_path")
# ----------------------------------------------------------- # -----------------------------------------------------------
# ABORT CHECK (skip if not yet started) # ABORT BEFORE START
# ----------------------------------------------------------- # -----------------------------------------------------------
if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num): if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}") msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
log_msg(book_id, msg)
inc_download_skipped(book_idx) inc_chapter_download_skipped(book_id)
return {
payload["html"] = None "book_id": book_id,
payload["skipped"] = True "chapter": chapter_dict,
payload["path"] = None "html": None,
return payload "skipped": True,
"path": None,
"abort": True,
"book_meta": book_meta,
}
mark_chapter_started(book_idx, chapter_num) # Mark chapter as started
mark_chapter_started(book_id, chapter_num)
# ----------------------------------------------------------- # -----------------------------------------------------------
# SKIP IF FILE ALREADY EXISTS # SKIP IF FILE ALREADY EXISTS
@ -152,28 +137,34 @@ def download_chapter(self, payload: dict):
save_path = get_save_path(chapter_num, volume_path) save_path = get_save_path(chapter_num, volume_path)
if os.path.exists(save_path): if os.path.exists(save_path):
log_msg(book_idx, f"[DL] SKIP {chapter_num}{save_path}") log_msg(book_id, f"[DL] SKIP {chapter_num} ({chapter_title}) → {save_path}")
inc_download_skipped(book_idx) return {
"book_id": book_id,
payload["html"] = None "chapter": chapter_dict,
payload["skipped"] = True "html": None,
payload["path"] = save_path "skipped": True,
return payload "path": save_path,
"book_meta": book_meta,
}
# ----------------------------------------------------------- # -----------------------------------------------------------
# GLOBAL DELAY + CONCURRENCY # GLOBAL + SYNC DELAY
# ----------------------------------------------------------- # -----------------------------------------------------------
if GLOBAL_DELAY > 0: if GLOBAL_DELAY > 0:
time.sleep(GLOBAL_DELAY) time.sleep(GLOBAL_DELAY)
wait_for_global_delay() wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY) acquire_global_slot(MAX_CONCURRENCY)
# log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# ----------------------------------------------------------- # -----------------------------------------------------------
# HTTP DOWNLOAD # HTTP DOWNLOAD
# ----------------------------------------------------------- # -----------------------------------------------------------
try: try:
log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})") log_msg(
book_id,
f"[DL] Downloading {chapter_num} ({chapter_title}): {chapter_url}",
)
resp = requests.get( resp = requests.get(
chapter_url, chapter_url,
@ -185,28 +176,41 @@ def download_chapter(self, payload: dict):
resp.encoding = resp.apparent_encoding or "gb2312" resp.encoding = resp.apparent_encoding or "gb2312"
html = resp.text html = resp.text
log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes") log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
payload["html"] = html return {
payload["skipped"] = False "book_id": book_id,
payload["path"] = save_path "chapter": chapter_dict,
return payload "html": html,
"skipped": False,
"path": save_path,
"book_meta": book_meta,
}
except Exception as exc: except Exception as exc:
attempt = self.request.retries attempt = self.request.retries
delay = BASE_DELAY * (BACKOFF**attempt) delay = BASE_DELAY * (BACKOFF**attempt)
# Handle 429 # Specific 429 handler
if getattr(getattr(exc, "response", None), "status_code", None) == 429: if getattr(getattr(exc, "response", None), "status_code", None) == 429:
log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s") log_msg(
book_id,
f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
time.sleep(DELAY_429) time.sleep(DELAY_429)
set_global_delay() set_global_delay()
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
# General retry with backoff # Normal retry
log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") log_msg(
book_id,
f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
finally: finally:
set_global_delay() set_global_delay()
release_global_slot() release_global_slot()
# log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")
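# ------------------------------------------------------------
# Retry schedule sketch: delay = BASE_DELAY * (BACKOFF ** attempt).
# With assumed values BASE_DELAY = 5 and BACKOFF = 2 (the real
# values come from env/config lines elided from this hunk):
#   attempt 0 -> 5s, attempt 1 -> 10s, attempt 2 -> 20s, ...
# A 429 response bypasses this schedule: it waits DELAY_429
# seconds, refreshes the global delay, then retries immediately
# (countdown=0).
# ------------------------------------------------------------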

@ -1,132 +0,0 @@
# ============================================================
# File: scraper/tasks/m4b_tasks.py
# ============================================================
import os
import subprocess
from typing import List
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import fetch_book, store_m4b_error
from scraper.scriptgen import build_merge_block
# ------------------------------------------------------------
# Helper: detect volumes (UNCHANGED)
# ------------------------------------------------------------
def detect_volumes(book_base: str) -> List[str]:
volumes = []
for name in os.listdir(book_base):
if name.lower().startswith("volume_"):
full = os.path.join(book_base, name)
if os.path.isdir(full):
volumes.append(name)
volumes.sort()
return volumes
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="m4b", ignore_result=True)
@logcall
def run_m4btool(self, book_idx: str):
log(f"[M4B] START book_idx={book_idx}")
book = fetch_book(book_idx)
if not book:
log(f"[M4B] Book not found in SQL: book_idx={book_idx}")
return
title = book.get("title", book_idx)
author = book.get("author", "Unknown")
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
book_base = os.path.join(output_root, title)
log(f"[M4B] Book base directory: {book_base}")
if not os.path.isdir(book_base):
log(f"[M4B] Book directory missing: {book_base}")
return
volumes = detect_volumes(book_base)
if not volumes:
log(f"[M4B] No volumes found for book_idx={book_idx}")
return
log(f"[M4B] Volumes detected: {volumes}")
# --------------------------------------------------------
# Build canonical commands via scriptgen
# --------------------------------------------------------
merge_block = build_merge_block(
title, author, [(i + 1, v) for i, v in enumerate(volumes)]
)
commands = [c.strip() for c in merge_block.split("&&") if c.strip()]
for volume, cmd in zip(volumes, commands):
audio_dir = os.path.join(book_base, volume, "Audio")
if not os.path.isdir(audio_dir):
log(f"[M4B] SKIP {volume}: no Audio directory")
continue
log(f"[M4B] Running for volume={volume}")
log(f"[M4B] CMD: {cmd}")
try:
result = subprocess.run(
cmd,
cwd=book_base,
shell=True,
capture_output=True,
text=True,
check=True,
)
if result.stdout:
log(f"[M4B][STDOUT] {result.stdout}")
except subprocess.CalledProcessError as exc:
log(f"[M4B][FAILED] volume={volume}")
if exc.stdout:
log(f"[M4B][STDOUT] {exc.stdout}")
if exc.stderr:
log(f"[M4B][STDERR] {exc.stderr}")
store_m4b_error(
book_idx=book_idx,
volume=volume,
error_text=exc.stderr or str(exc),
)
continue
except Exception as exc:
log(f"[M4B][UNEXPECTED ERROR] volume={volume}: {exc}")
store_m4b_error(
book_idx=book_idx,
volume=volume,
error_text=str(exc),
)
continue
log(f"[M4B] FINISHED book_idx={book_idx}")
# ------------------------------------------------------------
# Orchestration helper (UNCHANGED)
# ------------------------------------------------------------
@logcall
def queue_m4b_for_book(book_idx: str):
log(f"[M4B] Queuing m4b-tool for book_idx={book_idx}")
celery_app.send_task(
"scraper.tasks.m4b_tasks.run_m4btool",
args=[book_idx],
queue="m4b",
)
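# ------------------------------------------------------------
# Shape of the merge block consumed above (illustrative only;
# the real text comes from scraper.scriptgen.build_merge_block,
# which is not part of this diff). Roughly one command per
# volume, joined by "&&", e.g.:
#
#   m4b-tool merge "Volume_001/Audio" --output-file "Title - 1.m4b" ... && \
#   m4b-tool merge "Volume_002/Audio" --output-file "Title - 2.m4b" ...
#
# run_m4btool() splits on "&&" and zips the commands with the
# detected volume directories.
# ------------------------------------------------------------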

@ -1,31 +1,33 @@
# ============================================================ # =========================================================
# File: scraper/tasks/parse_tasks.py # File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text. # Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline. # Enhanced version: Piaotia H1→content extractor + clean pipeline
# Compatible with payload pipeline v3 + book_idx refactor. # NO HARDCODED REPLACEMENTS — everything comes from replacement files
# ============================================================ # =========================================================
from celery_app import celery_app from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment from bs4 import BeautifulSoup
from scraper.tasks.download_tasks import log_msg from scraper.utils import clean_text, load_all_replacements
from scraper.utils.utils import clean_text, load_all_replacements from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done
from bs4 import NavigableString, Comment
print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)") print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup): def extract_piaotia_content(soup):
"""
Extract clean chapter content from Piaotia pages.
Start after the table following <H1>.
End before nav/ads/footer/copyright.
"""
h1 = soup.find("h1") h1 = soup.find("h1")
if not h1: if not h1:
return None return None
# Find first table after <h1> # -------- Find first table after <h1> --------
table = None table = None
for sib in h1.next_siblings: for sib in h1.next_siblings:
if getattr(sib, "name", None) == "table": if getattr(sib, "name", None) == "table":
@ -37,39 +39,49 @@ def extract_piaotia_content(soup):
parts = [] parts = []
# -------- Iterate after table --------
for sib in table.next_siblings: for sib in table.next_siblings:
name = getattr(sib, "name", None) name = getattr(sib, "name", None)
text = None text = None
if hasattr(sib, "get_text"): if hasattr(sib, "get_text"):
text = sib.get_text(strip=True) text = sib.get_text(strip=True)
# Stop conditions # === STOP CONDITIONS ===
# Comments like <!-- 翻页上AD开始 -->
if isinstance(sib, Comment) and ("翻页" in sib): if isinstance(sib, Comment) and ("翻页" in sib):
break break
# Explicit footer blocks
if name == "div": if name == "div":
sid = sib.get("id", "") sid = sib.get("id", "")
cls = sib.get("class", []) cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"): if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break break
# Copyright block — strongest indicator
if text and ("重要声明" in text or "Copyright" in text): if text and ("重要声明" in text or "Copyright" in text):
break break
# Navigation or 推荐阅读
if text and (text.startswith(("推荐阅读", "目录", "目 录"))): if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break break
# Skip scripts, ads, centers
if name in ("script", "style"): if name in ("script", "style"):
continue continue
# Skip JS containers like <center><script>...</script></center>
if name == "center": if name == "center":
continue continue
# Accumulate # === ACCUMULATE TEXT ===
if isinstance(sib, NavigableString): if isinstance(sib, NavigableString):
s = sib.strip() s = sib.strip()
if s: if s:
parts.append(s) parts.append(s)
elif hasattr(sib, "get_text"): elif hasattr(sib, "get_text"):
t = sib.get_text(separator="\n").strip() t = sib.get_text(separator="\n").strip()
if t: if t:
@ -78,44 +90,35 @@ def extract_piaotia_content(soup):
return "\n".join(parts).strip() return "\n".join(parts).strip()
# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False) @celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall def parse_chapter(self, download_result: dict):
def parse_chapter(self, payload: dict): """
New signature under chapter_dict pipeline:
if not payload: - receives ONLY the output dict from download_chapter
return {"skipped": True, "reason": "empty_payload"} - book_meta is inside download_result["book_meta"]
- chapter_dict is inside download_result["chapter"]
# NEW MODEL """
book_idx = payload["book_idx"]
chapter = payload["chapter"] book_id = download_result.get("book_id", "NOBOOK")
book_meta = payload.get("book_meta") or {} chapter_dict = download_result.get("chapter") or {}
book_meta = download_result.get("book_meta") or {}
num = chapter.get("num") chapter_title = chapter_dict.get("title")
title = chapter.get("title") or f"Chapter {num}" chapter_num = chapter_dict.get("num")
html = payload.get("html") chapter_url = chapter_dict.get("url")
html = download_result.get("html")
# ------------------------------------------------------------ # ------------------------------------------------------------
# DOWNLOAD SKIPPED → PARSE SKIP # SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------ # ------------------------------------------------------------
if payload.get("skipped"): if download_result.get("skipped"):
log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)") log_msg(book_id, f"[PARSE] SKIP chapter {chapter_num} (download skipped)")
return payload return download_result # already has chapter + book_meta + skipped
if not html:
log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
payload["parsed"] = None
payload["skipped"] = True
return payload
log_msg(book_idx, f"[PARSE] Parsing chapter {num}") log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")
soup = BeautifulSoup(html, "lxml") soup = BeautifulSoup(html, "lxml")
# ------------------------------------------------------------ # ------------------------------------------------------------
# STRICT SELECTORS # STRICT SELECTORS (direct content blocks)
# ------------------------------------------------------------ # ------------------------------------------------------------
selectors = [ selectors = [
"#content", "#content",
@ -138,20 +141,50 @@ def parse_chapter(self, payload: dict):
raw = None raw = None
# strict selectors failed → piaotia extractor # --- STRICT SELECTOR FAILED → Try Piaotia extractor ---
if node is None: if node is None:
raw = extract_piaotia_content(soup) raw = extract_piaotia_content(soup)
else:
raw = node.get_text(separator="\n")
# # ------------------------------------------------------------
# # PIAOTIA FALLBACK:
# # Extract content between <H1> and the "bottomlink" block.
# # ------------------------------------------------------------
# raw = None
# if node is None:
# h1 = soup.find("h1")
# if h1:
# content_parts = []
# for sib in h1.next_siblings:
# sib_class = getattr(sib, "get", lambda *_: None)("class")
# if sib_class and (
# "bottomlink" in sib_class or sib_class == "bottomlink"
# ):
# break
# if getattr(sib, "name", None) in ["script", "style", "center"]:
# continue
# if hasattr(sib, "get_text"):
# content_parts.append(sib.get_text(separator="\n"))
# else:
# content_parts.append(str(sib))
# raw = "\n".join(content_parts)
# ------------------------------------------------------------
# FINAL FALLBACK # FINAL FALLBACK
# ------------------------------------------------------------
if raw is None: if raw is None:
if node:
raw = node.get_text(separator="\n")
else:
for tag in soup(["script", "style", "noscript"]): for tag in soup(["script", "style", "noscript"]):
tag.decompose() tag.decompose()
raw = soup.get_text(separator="\n") raw = soup.get_text(separator="\n")
# ------------------------------------------------------------ # ------------------------------------------------------------
# MULTIPASS CLEANING VIA replacement-block files # MULTIPASS CLEANING via replacement files ONLY
# ------------------------------------------------------------ # ------------------------------------------------------------
REPL = load_all_replacements() REPL = load_all_replacements()
@ -160,30 +193,28 @@ def parse_chapter(self, payload: dict):
text = clean_text(text, REPL) text = clean_text(text, REPL)
# ------------------------------------------------------------ # ------------------------------------------------------------
# Collapse double blank lines # Collapse excessive empty lines
# ------------------------------------------------------------ # ------------------------------------------------------------
cleaned = [] cleaned = []
prev_blank = False prev_blank = False
for line in text.split("\n"): for line in text.split("\n"):
s = line.rstrip() stripped = line.rstrip()
if s == "": if stripped == "":
if prev_blank: if prev_blank:
continue continue
prev_blank = True prev_blank = True
cleaned.append("") cleaned.append("")
else: else:
prev_blank = False prev_blank = False
cleaned.append(s) cleaned.append(stripped)
text = "\n".join(cleaned) text = "\n".join(cleaned)
text = f"{title}\n{text}" text = chapter_title + "\n" + text
# ------------------------------------------------------------ # ------------------------------------------------------------
# Header on chapter 1 # Add header to chapter 1
# ------------------------------------------------------------ # ------------------------------------------------------------
if num == 1: if chapter_num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN" book_url = book_meta.get("book_url") or book_meta.get("url") or "UNKNOWN"
header = ( header = (
f"{book_meta.get('title','')}\n" f"{book_meta.get('title','')}\n"
f"Author: {book_meta.get('author','')}\n" f"Author: {book_meta.get('author','')}\n"
@ -192,14 +223,13 @@ def parse_chapter(self, payload: dict):
) )
text = header + text text = header + text
log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars") log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
# ------------------------------------------------------------
# OUTPUT PAYLOAD
# ------------------------------------------------------------
payload["parsed"] = text
payload["skipped"] = False
inc_parsed_done(book_idx)
return payload # NEW RETURN FORMAT: chapter_dict stays intact
return {
"book_id": book_id,
"chapter": chapter_dict,
"text": text,
"length": len(text),
"book_meta": book_meta,
}
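# Cleaning illustration (behavior of the collapse loop above):
#   "Line 1\n\n\n\nLine 2\n"  ->  "Line 1\n\nLine 2"
# Runs of empty lines shrink to a single empty line, and trailing
# whitespace is stripped from every line before the chapter title
# is prepended.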

@ -1,16 +1,16 @@
# ========================================================= # =========================================================
# File: scraper/tasks/pipeline.py # File: scraper/tasks/pipeline.py
# Purpose: # Purpose:
# Build Celery chains for chapter processing using payload dict. # Build Celery chains for chapter processing using chapter_dict.
# #
# Pipeline v3: # New Chain:
# download_chapter(payload) # download_chapter(book_id, chapter_dict, book_meta)
# → parse_chapter(payload) # → parse_chapter(download_result)
# → save_chapter(payload) # → save_chapter(parsed_result)
# → update_progress(final_result, book_id)
# #
# NOTE: # All subtasks pass through result dicts unchanged so the
# - book_idx is the single authoritative key for all tasks # next stage receives the correct fields.
# - payload travels unchanged through the entire pipeline
# ========================================================= # =========================================================
from celery import chain from celery import chain
@ -18,33 +18,26 @@ from celery import chain
from scraper.tasks.download_tasks import download_chapter from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter from scraper.tasks.save_tasks import save_chapter
from scraper.tasks.progress_tasks import update_progress
from scraper.logger_decorators import logcall
def build_chapter_pipeline(
@logcall book_id: str,
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict): chapter_dict: dict,
""" book_meta: dict,
Create a payload object passed through the pipeline. ):
Consistent with the chapter_dict-based task signature.
""" """
Build a Celery chain for one chapter using chapter_dict.
payload = { download_chapter(book_id, chapter_dict, book_meta)
"book_idx": book_idx, parse_chapter(download_result)
"chapter": chapter_dict, save_chapter(parsed_result)
"book_meta": book_meta, update_progress(result, book_id)
# Will be filled by download_chapter """
"html": None,
# Will be filled by parse_chapter
"parsed": None,
# Set by download or parse on skip/404/etc
"skipped": False,
# Final path written by save_chapter
"path": None,
}
return chain( return chain(
download_chapter.s(payload), download_chapter.s(book_id, chapter_dict, book_meta),
parse_chapter.s(), parse_chapter.s(),
save_chapter.s(), save_chapter.s(),
update_progress.s(book_id),
) )
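# Usage sketch (values hypothetical; DownloadController presumably
# builds one such chain per chapter and dispatches them as a group):
#
# sig = build_chapter_pipeline(
#     "Some Book",
#     {"num": 1, "url": "https://example.com/ch1",
#      "title": "Chapter 1", "volume_path": "output/Some Book/Volume_001"},
#     {"title": "Some Book", "author": "Unknown",
#      "book_url": "https://example.com/book"},
# )
# sig.apply_async()  # fire-and-forget; subtasks update progress themselves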

@ -0,0 +1,57 @@
# ============================================================
# File: scraper/tasks/progress_tasks.py
# Purpose: Central progress updater for chapter pipelines.
# Updated for chapter_dict pipeline model.
# ============================================================
from celery_app import celery_app
from scraper.progress import inc_completed, inc_skipped, inc_failed
from logbus.publisher import log
print(">>> [IMPORT] progress_tasks.py loaded")
@celery_app.task(bind=False, name="progress.update", queue="controller")
def update_progress(result: dict, book_id: str):
"""
Central progress logic:
- result: output of save_chapter
- book_id: explicitly passed by pipeline
IMPORTANT:
- save_chapter already updates counters for skipped & normal chapters
- progress.update MUST NOT double-increment
"""
ch = result.get("chapter") or {}
chapter_num = ch.get("num")
skipped = result.get("skipped", False)
failed = result.get("failed", False)
# ------------------------------------------------------------
# FAILED CASE
# ------------------------------------------------------------
if failed:
inc_failed(book_id)
log(f"[PROG] FAILED chapter {chapter_num}")
return result
# ------------------------------------------------------------
# SKIPPED CASE
# ------------------------------------------------------------
if skipped:
# save_chapter already did:
# inc_skipped(book_id)
log(f"[PROG] SKIPPED chapter {chapter_num}")
return result
# ------------------------------------------------------------
# NORMAL COMPLETION
# ------------------------------------------------------------
# save_chapter did NOT increment completed for skipped cases
# but DID inc_completed(book_id) for normal cases.
# update_progress should NOT double increment, so only log here.
log(f"[PROG] DONE chapter {chapter_num}")
return result
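# Example invocations (dict shapes mirror save_chapter's returns;
# the "failed" flag is assumed to be set by an error handler that
# is not part of this diff):
#
# update_progress({"book_id": "Some Book",
#                  "chapter": {"num": 12}, "path": ".../0012.txt"},
#                 "Some Book")   # logs "[PROG] DONE chapter 12"
#
# update_progress({"book_id": "Some Book",
#                  "chapter": {"num": 13}, "skipped": True},
#                 "Some Book")   # logs "[PROG] SKIPPED chapter 13"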

@ -1,84 +1,139 @@
# ============================================================ # ============================================================
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx) # File: scraper/tasks/save_tasks.py
# Purpose: Save parsed chapter text to disk + trigger audio.
# Updated for chapter_dict + book_meta pipeline model.
# ============================================================ # ============================================================
print(">>> [IMPORT] save_tasks.py loaded") print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task from celery import shared_task
import os import os
from scraper.utils import get_save_path
from logbus.publisher import log from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.logger_decorators import logcall from scraper.progress import (
from scraper.utils.utils import get_save_path inc_completed,
from scraper.tasks.download_tasks import log_msg inc_chapter_done,
inc_chapter_download_skipped,
)
from scraper.tasks.audio_tasks import generate_audio from scraper.tasks.audio_tasks import generate_audio
from db.repository import inc_download_done, inc_download_skipped
@shared_task(bind=True, queue="save", ignore_result=False) @shared_task(bind=True, queue="save", ignore_result=False)
@logcall def save_chapter(self, parsed: dict):
def save_chapter(self, payload: dict): """
New pipeline model:
if not payload: parsed = {
log("[SAVE] ERROR: payload is None") "book_id": str,
return {"error": True} "chapter": chapter_dict,
"text": str,
# NEW unified ID "length": int,
book_idx = payload["book_idx"] "book_meta": dict,
"skipped": bool,
chapter = payload["chapter"] "path": optional str (if skipped)
parsed = payload.get("parsed") }
path = payload.get("path") """
skipped = payload.get("skipped")
book_id = parsed.get("book_id", "NOBOOK")
num = chapter["num"] chapter_dict = parsed.get("chapter") or {}
title = chapter.get("title") or f"Chapter {num}" book_meta = parsed.get("book_meta") or {}
volume = chapter.get("volume_path")
volume_name = os.path.basename(volume.rstrip("/")) chapter_num = chapter_dict.get("num")
chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
# ============================================================ volume_path = chapter_dict.get("volume_path")
# SKIPPED CASE (old behavior restored)
# ============================================================ # ------------------------------------------------------------
if skipped or not parsed: # VALIDATION
log_msg(book_idx, f"[SAVE] SKIP chapter {num}") # ------------------------------------------------------------
inc_download_skipped(book_idx) if chapter_num is None or volume_path is None:
raise ValueError("Invalid parsed payload: chapter_dict missing fields.")
# OLD behavior: even skipped chapters still queue audio
# ------------------------------------------------------------
# SKIPPED CASE
# ------------------------------------------------------------
if parsed.get("skipped"):
path = parsed.get("path", None)
log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num}{path}")
inc_chapter_download_skipped(book_id)
volume_name = os.path.basename(volume_path.rstrip("/"))
# Queue audio only if a valid file exists
if path and os.path.exists(path): if path and os.path.exists(path):
log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
try: try:
generate_audio.delay(book_idx, volume_name, num, title, path) generate_audio.delay(
except Exception as exc: book_id,
log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}") volume_name,
chapter_num,
return payload chapter_title,
path,
# ============================================================ )
log_msg(
book_id,
f"[AUDIO] Task queued (SKIPPED) for chapter {chapter_num} in {volume_name}",
)
except Exception as audio_exc:
log_msg(
book_id,
f"[AUDIO] ERROR queueing (SKIPPED) chapter {chapter_num}: {audio_exc}",
)
return {
"book_id": book_id,
"chapter": chapter_dict,
"path": path,
"skipped": True,
"book_meta": book_meta,
}
# ------------------------------------------------------------
# NORMAL SAVE CASE # NORMAL SAVE CASE
# ============================================================ # ------------------------------------------------------------
try: try:
os.makedirs(volume, exist_ok=True) text = parsed.get("text", "")
save_path = get_save_path(num, volume)
with open(save_path, "w", encoding="utf-8") as f: # Ensure volume folder exists
f.write(parsed) os.makedirs(volume_path, exist_ok=True)
log_msg(book_idx, f"[SAVE] Saved chapter {num}{save_path}") # Build final chapter file path
path = get_save_path(chapter_num, volume_path)
inc_download_done(book_idx) # Write chapter text to file
with open(path, "w", encoding="utf-8") as f:
f.write(text)
# OLD behavior: ALWAYS queue audio log_msg(book_id, f"[SAVE] Saved chapter {chapter_num}{path}")
try: inc_chapter_done(book_id)
generate_audio.delay(book_idx, volume_name, num, title, save_path) inc_completed(book_id)
log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}")
except Exception as exc:
log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
payload["path"] = save_path # Determine volume name
payload["skipped"] = False volume_name = os.path.basename(volume_path.rstrip("/"))
return payload
# Queue audio task
try:
generate_audio.delay(
book_id,
volume_name,
chapter_num,
chapter_title,
path,
)
log_msg(
book_id,
f"[AUDIO] Task queued for chapter {chapter_num} in {volume_name}",
)
except Exception as audio_exc:
log_msg(
book_id, f"[AUDIO] ERROR queueing chapter {chapter_num}: {audio_exc}"
)
return {
"book_id": book_id,
"chapter": chapter_dict,
"path": path,
"book_meta": book_meta,
}
except Exception as exc: except Exception as exc:
log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}") log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter_num}: {exc}")
raise raise
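# ------------------------------------------------------------
# Resulting on-disk layout (sketch; the exact chapter file name
# comes from get_save_path, which this diff does not show, so
# the zero-padded .txt name is an assumption):
#
#   output/<Book Title>/Volume_001/0001.txt        <- save_chapter
#   output/<Book Title>/Volume_001/Audio/0001.m4b  <- generate_audio
# ------------------------------------------------------------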

@ -1,9 +1,7 @@
# ============================================================ # ============================================================
# File: scraper/tasks/scraping.py # File: scraper/tasks/scraping.py
# Purpose: # Purpose: Scrape metadata + chapter list and initialise
# Scrape ONLY metadata + chapter list. # Redis progress tracking + launch download controller
# Does NOT launch download controller anymore.
# Controller decides when pipelines start.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
@ -11,91 +9,88 @@ from logbus.publisher import log
import os import os
import redis import redis
from scraper.logger_decorators import logcall
from scraper.sites import BookSite from scraper.sites import BookSite
from scraper.book_scraper import BookScraper from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
from scraper.services.init_service import InitService
print(">>> [IMPORT] scraping.py loaded") print(">>> [IMPORT] scraping.py loaded")
# Redis connection (same DB as Celery broker) # Redis connection (same as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True) r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task( @celery_app.task(bind=True, queue="scraping", ignore_result=False)
bind=True,
queue="scraping",
ignore_result=False,
name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str): def start_scrape_book(self, url: str):
""" """Scrapes metadata + chapters and prepares download tracking."""
Scrapes metadata + chapters.
DOES NOT START download / pipeline controller.
The controller_tasks.start_full_scrape() task will call this one.
"""
# ------------------------------------------------------------ # ------------------------------------------------------------
# CLEAR UI LOG BUFFER # NEW: clear UI log buffer at start of new run
# ------------------------------------------------------------ # ------------------------------------------------------------
reset_ui_logs() reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}") log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# SCRAPE (old engine) # Book scrape
# ------------------------------------------------------------ # ------------------------------------------------------------
site = BookSite() site = BookSite()
scraper = BookScraper(site, url) scraper = BookScraper(site, url)
result = scraper.execute() # → { title, author, chapters, cover_url, ... } result = scraper.execute() # returns dict with metadata + chapters
chapters = result.get("chapters", []) chapters = result.get("chapters", [])
full_count = len(chapters) full_count = len(chapters)
# ------------------------------------------------------------ # ------------------------------------------------------------
# Compute unified book_idx # DRY RUN
# ------------------------------------------------------------
book_idx = InitService.derive_book_id(url)
result["book_idx"] = book_idx
log(f"[SCRAPING] Assigned book_idx = {book_idx}")
# ------------------------------------------------------------
# DRY RUN TEST LIMIT
# ------------------------------------------------------------ # ------------------------------------------------------------
DRY_RUN = os.getenv("DRY_RUN", "0") == "1" DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5")) TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
if DRY_RUN: if DRY_RUN:
log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}") log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
result["chapters"] = chapters[:TEST_LIMIT] chapters = chapters[:TEST_LIMIT]
result["chapters"] = chapters
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# LOG RESULTS # BOOK RUN ID (using title as ID)
# ------------------------------------------------------------ # ------------------------------------------------------------
log( title = result.get("title") or "UnknownBook"
f"[SCRAPING] Completed scrape: " book_id = title # user requirement
f"{len(result['chapters'])}/{full_count} chapters"
) result["book_id"] = book_id
log(f"[SCRAPING] Assigned book_id = '{book_id}'")
# ------------------------------------------------------------ # ------------------------------------------------------------
# RESET ABORT + INITIALIZE LEGACY PROGRESS # RESET ABORT + INITIALISE PROGRESS
# ------------------------------------------------------------ # ------------------------------------------------------------
clear_abort(book_idx) clear_abort(book_id)
r.set(f"progress:{book_idx}:total", len(result["chapters"])) r.set(f"progress:{book_id}:total", len(chapters))
r.set(f"progress:{book_idx}:done", 0) r.set(f"progress:{book_id}:done", 0)
r.delete(f"logs:{book_id}") # clear old logs if any
r.delete(f"logs:{book_idx}") r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}") r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")
r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# IMPORTANT: DO NOT DISPATCH any pipelines here # DISPATCH DOWNLOAD CONTROLLER
# Controller will receive scrape_result and continue.
# ------------------------------------------------------------ # ------------------------------------------------------------
return result celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],
queue="controller",
)
log(f"[SCRAPING] Dispatched download controller for '{book_id}'")
return {
"book_id": book_id,
"title": result.get("title"),
"author": result.get("author"),
"chapters": len(chapters),
}
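# Keys this task leaves behind in Redis (taken from the code above):
#   progress:<book_id>:total  -> chapter count for this run
#   progress:<book_id>:done   -> 0, incremented by the pipeline tasks
#   logs:<book_id>            -> list seeded with the two startup lines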

@ -1,149 +0,0 @@
# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
# Final status check after audio completion.
#
# Responsibilities:
# - Verify Redis counters (sanity check)
# - Verify filesystem (Audio files present)
# - Queue m4btool task
#
# Design rules:
# - Book-scope ONLY
# - No direct Redis usage
# - Repository is the single source of truth
# - Idempotent, defensive, non-blocking
# ============================================================
import os
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_audio_done,
get_chapters_total,
set_status,
fetch_book,
)
from scraper.tasks.m4b_tasks import run_m4btool
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
@logcall
def _detect_volumes(book_base: str):
"""
Return sorted list of Volume_XXX directories.
"""
vols = []
for name in os.listdir(book_base):
if name.lower().startswith("volume_"):
full = os.path.join(book_base, name)
if os.path.isdir(full):
vols.append(name)
vols.sort()
return vols
@logcall
def _count_audio_files(audio_dir: str) -> int:
"""
Count .m4b files in an Audio directory.
"""
if not os.path.isdir(audio_dir):
return 0
return len([f for f in os.listdir(audio_dir) if f.lower().endswith(".m4b")])
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
"""
Final statuscheck before m4btool execution.
Triggered exactly once by audio_completion quickcheck.
"""
log(f"[STATUSCHECK] START book={book_idx}")
# --------------------------------------------------------
# 1. Redis sanity check (via repository)
# --------------------------------------------------------
audio_done = get_audio_done(book_idx)
chapters_total = get_chapters_total(book_idx)
log(
f"[STATUSCHECK] Counters book={book_idx} "
f"audio_done={audio_done} chapters_total={chapters_total}"
)
if chapters_total <= 0:
log(f"[STATUSCHECK] No chapters_total → abort")
return
if audio_done < chapters_total:
# Defensive: should not happen, but never assume
log(
f"[STATUSCHECK] Audio not complete yet "
f"({audio_done}/{chapters_total}) → abort"
)
return
# --------------------------------------------------------
# 2. Fetch book metadata (for paths & m4b meta)
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
return
title = book.get("title") or book_idx
author = book.get("author") or "Unknown"
# Base output directory
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
book_base = os.path.join(root, title)
if not os.path.isdir(book_base):
log(f"[STATUSCHECK] Book directory missing: {book_base}")
return
# --------------------------------------------------------
# 3. Filesystem validation (light, non-blocking)
# --------------------------------------------------------
volumes = _detect_volumes(book_base)
if not volumes:
log(f"[STATUSCHECK] No volumes found for {book_idx}")
# Still allow m4btool to decide (it will no-op)
else:
for vol in volumes:
audio_dir = os.path.join(book_base, vol, "Audio")
count = _count_audio_files(audio_dir)
log(f"[STATUSCHECK] {vol}: " f"{count} audio files detected")
# --------------------------------------------------------
# 4. Queue m4btool (final pipeline step)
# --------------------------------------------------------
log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")
set_status(book_idx, "m4b_running")
# run_m4btool accepts only book_idx; it re-derives title/author/book_base itself
run_m4btool.delay(book_idx=book_idx)
log(f"[STATUSCHECK] DONE book={book_idx}")

@ -1,272 +0,0 @@
# ============================================================
# File: scraper/utils/state_sync.py
# Purpose:
# State inspection + optional sync logic for unified book_idx model.
# Generates full book-card compatible dicts for debug UI.
# ============================================================
import os
import redis
from db.db import get_db
def _build_card(sqlite_row, redis_state, merged):
"""
Creates a dict that matches the fields required by components/bookcard.html:
b.book_idx
b.title
b.author
b.cover_path
b.status
b.created_at
b.download_done
b.download_total
b.audio_done
b.audio_total
"""
return {
"book_idx": sqlite_row.get("book_idx"),
"title": sqlite_row.get("title") or "Unknown",
"author": sqlite_row.get("author"),
"cover_path": sqlite_row.get("cover_path"),
# Use merged status (Redis > SQLite)
"status": merged.get("status") or sqlite_row.get("status") or "unknown",
# Meta
"created_at": sqlite_row.get("created_at"),
# Download counters
"download_done": merged.get("downloaded", 0),
"download_total": merged.get("chapters_total", 0),
# Audio counters
"audio_done": merged.get("audio_done", 0),
"audio_total": merged.get("chapters_total", 0),
}
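# Example output of _build_card (all values hypothetical):
# {
#     "book_idx": "some-book",
#     "title": "Some Book",
#     "author": "Some Author",
#     "cover_path": "static/covers/some-book.jpg",
#     "status": "downloading",
#     "created_at": "2024-01-01 12:00:00",
#     "download_done": 42, "download_total": 100,
#     "audio_done": 40,    "audio_total": 100,
# }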
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state_deprecated():
"""
Reads all books from SQLite and fetches Redis progress.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
sqlite_row = dict(row)
book_idx = sqlite_row["book_idx"]
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key) or {}
# ================================
# DRY-RUN MERGE LOGIC
# ================================
merged = sqlite_row.copy()
if redis_state:
merged["downloaded"] = int(
redis_state.get("chapters_download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(
redis_state.get("chapters_parsed_done", merged.get("parsed", 0))
)
merged["audio_done"] = int(
redis_state.get("audio_done", merged.get("audio_done", 0))
)
merged["chapters_total"] = int(
redis_state.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = redis_state.get(
"status", merged.get("status", "unknown")
)
# ================================
# Build book-card data
# ================================
card = _build_card(sqlite_row, redis_state, merged)
# ================================
# Append final result entry
# ================================
results.append(
{
"book_idx": book_idx,
"title": sqlite_row.get("title"),
"sqlite": sqlite_row,
"redis": redis_state,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state():
"""
Reads canonical book state from repository.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
from db.repository import get_book_state
from db.db import get_db
db = get_db()
cur = db.cursor()
# Only needed to know *which* books exist
cur.execute("SELECT book_idx FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_idx = row["book_idx"]
# --------------------------------
# Canonical state (the ONLY source of truth)
# --------------------------------
state = get_book_state(book_idx)
# SQLite view = SQLite columns only
sqlite_view = {
k: v
for k, v in state.items()
if k
in (
"book_idx",
"title",
"author",
"description",
"cover_path",
"book_url",
"chapters_total",
"status",
"downloaded",
"parsed",
"audio_done",
"created_at",
"processdate",
"last_update",
)
}
# Redis view = Redis counters/status only
redis_view = {
k: v
for k, v in state.items()
if k.startswith("chapters_")
or k in ("status", "audio_done", "audio_skipped")
}
merged = state  # literally the canonical state
card = _build_card(sqlite_view, redis_view, merged)
results.append(
{
"book_idx": book_idx,
"title": state.get("title"),
"sqlite": sqlite_view,
"redis": redis_view,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# SYNC REDIS → SQLITE (writes)
# ============================================================
def sync_books_from_redis():
    """
    Writes Redis progress values back into SQLite.
    Uses unified book_idx as identifier.
    """
    r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
    db = get_db()
    cur = db.cursor()
    cur.execute("SELECT * FROM books")
    rows = cur.fetchall()
    results = []

    for row in rows:
        before = dict(row)
        book_idx = before["book_idx"]
        redis_key = f"book:{book_idx}:state"
        redis_state = r.hgetall(redis_key)

        if not redis_state:
            results.append(
                {
                    "book_idx": book_idx,
                    "before": before,
                    "redis": {},
                    "after": before,
                }
            )
            continue

        # Extract progress from Redis
        downloaded = int(redis_state.get("chapters_download_done", 0))
        parsed = int(redis_state.get("chapters_parsed_done", 0))
        audio_done = int(redis_state.get("audio_done", 0))
        total = int(redis_state.get("chapters_total", 0))
        status = redis_state.get("status", before.get("status"))

        # Update SQLite
        cur.execute(
            """
            UPDATE books
            SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
            WHERE book_idx = ?
            """,
            (downloaded, parsed, audio_done, total, status, book_idx),
        )
        db.commit()

        cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
        after = dict(cur.fetchone())

        results.append(
            {
                "book_idx": book_idx,
                "before": before,
                "redis": redis_state,
                "after": after,
            }
        )

    return results
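
A hedged sketch of how this writer could back the /debug/sync_state entry in the Tools dropdown (nav.html); the route wiring and response shape are assumptions, not code from this diff:

```
from flask import jsonify

@app.route("/debug/sync_state")
def debug_sync_state():
    results = sync_books_from_redis()  # performs the Redis → SQLite writes
    return jsonify(results)            # before/redis/after per book
```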

(Four binary image files added; previews not shown: 4.1 KiB, 6.4 KiB, 3.5 KiB, 12 KiB)

@ -1,310 +0,0 @@
/* =======================================================================
File: static/css/bookcard.css
Purpose:
Styling for registered book cards:
- status colors
- badges
- start/abort/statuscheck
- progress bars
======================================================================= */
/* -----------------------------------------------------------------------
GRID WRAPPER
----------------------------------------------------------------------- */
.registered-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(340px, 1fr));
gap: 20px;
margin-top: 15px;
}
/* -----------------------------------------------------------------------
BOOK CARD BASE
----------------------------------------------------------------------- */
.book-card {
position: relative;
display: grid;
grid-template-columns: 90px auto;
gap: 15px;
padding: 15px;
background: #fff;
border-radius: 10px;
border: 1px solid #e5e5e5;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
transition: border-color 0.25s ease, box-shadow 0.25s ease;
}
/* -----------------------------------------------------------------------
STATUS COLORS (BOOK CARD BORDER)
----------------------------------------------------------------------- */
/* Downloading / actively busy */
.book-card.downloading {
border-color: #ff9500;
box-shadow: 0 0 6px rgba(255, 149, 0, 0.35);
}
/* Audio phase */
.book-card.audio {
border-color: #ffca28;
box-shadow: 0 0 6px rgba(255, 202, 40, 0.35);
}
/* Fully complete */
.book-card.done {
border: 2px solid #4caf50;
box-shadow: 0 0 6px rgba(76, 175, 80, 0.35);
}
/* Aborted */
.book-card.aborted {
border-color: #ff3b30;
box-shadow: 0 0 6px rgba(255, 59, 48, 0.35);
}
/* -----------------------------------------------------------------------
COVER
----------------------------------------------------------------------- */
.book-cover {
width: 90px;
}
.book-img {
width: 90px;
height: 130px;
object-fit: cover;
border-radius: 4px;
background: #f4f4f4;
}
.placeholder {
display: flex;
justify-content: center;
align-items: center;
color: #777;
font-size: 12px;
}
/* -----------------------------------------------------------------------
META
----------------------------------------------------------------------- */
.book-meta {
display: flex;
flex-direction: column;
justify-content: space-between;
}
.book-title {
font-size: 16px;
font-weight: bold;
}
.book-author {
font-size: 14px;
color: #444;
margin-bottom: 6px;
}
.book-created {
font-size: 12px;
color: #666;
}
/* -----------------------------------------------------------------------
ACTION BUTTONS
----------------------------------------------------------------------- */
.book-actions {
display: flex;
justify-content: flex-end;
gap: 10px;
margin-top: 10px;
}
.icon-btn {
width: 34px;
height: 34px;
border: none;
border-radius: 8px;
display: flex;
justify-content: center;
align-items: center;
font-size: 16px;
color: #fff;
cursor: pointer;
transition: background 0.15s ease, transform 0.1s ease;
}
/* Start */
.icon-start {
background: #2d8a3d;
}
.icon-start:hover {
background: #226c30;
transform: scale(1.05);
}
.icon-start:disabled {
background: #9bbb9f;
cursor: not-allowed;
opacity: 0.5;
}
/* Abort */
.icon-abort {
background: #c62828;
}
.icon-abort:hover {
background: #a31f1f;
transform: scale(1.05);
}
.icon-abort:disabled {
background: #d8a0a0;
cursor: not-allowed;
opacity: 0.5;
}
/* Hide */
.hide-form {
position: absolute;
top: 6px;
right: 6px;
}
.icon-hide {
background: #777;
}
.icon-hide:hover {
background: #555;
}
/* Statuscheck */
.statuscheck-btn {
background-color: #444;
color: #fff;
border: 1px solid #666;
margin-left: 4px;
padding: 4px 8px;
border-radius: 6px;
font-size: 12px;
cursor: pointer;
}
.statuscheck-btn:hover {
background-color: #333;
}
/* -----------------------------------------------------------------------
PROGRESS (FULL WIDTH)
----------------------------------------------------------------------- */
.book-progress {
grid-column: 1 / -1;
margin-top: 12px;
padding: 10px 12px;
background: #f6f6f6;
border-radius: 8px;
}
.progress-row {
margin-bottom: 4px;
}
.progress-label {
font-size: 12px;
margin-bottom: 4px;
color: #444;
}
/* BAR */
.progressbar {
position: relative;
width: 100%;
height: 14px;
background: #ddd;
border-radius: 7px;
overflow: hidden;
}
.progressbar-fill {
height: 100%;
transition: width 0.4s ease;
}
/* Download */
.progressbar-fill.download {
background: #2196f3;
}
/* Audio */
.progressbar-fill.audio {
background: #4caf50;
}
/* TEXT IN BAR */
.progressbar-text {
position: absolute;
inset: 0;
display: flex;
align-items: center;
justify-content: center;
font-size: 11px;
font-weight: 600;
color: #fff;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.6);
pointer-events: none;
}
/* -----------------------------------------------------------------------
STATUS BADGE
----------------------------------------------------------------------- */
.status-badge {
display: inline-block;
margin-bottom: 6px;
padding: 2px 8px;
font-size: 11px;
font-weight: 600;
border-radius: 10px;
text-transform: uppercase;
letter-spacing: 0.5px;
cursor: default;
}
/* DONE */
.status-badge.status-done {
background-color: #e6f4ea;
color: #2e7d32;
border: 1px solid #4caf50;
}
/* AUDIO */
.status-badge.status-audio {
background-color: #fff8e1;
color: #8d6e00;
border: 1px solid #ffca28;
}
/* DOWNLOADING */
.status-badge.status-downloading {
background-color: #e3f2fd;
color: #1565c0;
border: 1px solid #42a5f5;
}
/* Statuscheck */
.icon-statuscheck {
background: #444;
}
.icon-statuscheck:hover {
background: #333;
transform: scale(1.05);
}

@ -2,28 +2,33 @@
File: static/css/dashboard.css File: static/css/dashboard.css
Purpose: Purpose:
Clean full-width vertical dashboard layout with large log viewer. Clean full-width vertical dashboard layout with large log viewer.
Book-card CSS has been moved to bookcard.css
======================================================================= */ ======================================================================= */
/* ----------------------------------------------------------------------- /* ------------------------------
1) GENERAL PAGE LAYOUT GENERAL PAGE LAYOUT
----------------------------------------------------------------------- */ ------------------------------ */
/* Dashboard content should use full width */
.dashboard-container { .dashboard-container {
display: flex; display: flex;
flex-direction: column; flex-direction: column;
width: 100%; width: 100%;
max-width: 1200px; max-width: 1200px; /* prevents overflow on the right */
margin: 20px auto; margin: 20px auto;
padding: 0 20px; padding: 0 20px;
gap: 18px; gap: 18px; /* smaller than 30px */
} }
/* ------------------------------
SECTIONS (input, progress, logs)
------------------------------ */
.dashboard-section { .dashboard-section {
background: #ffffff; background: #ffffff;
padding: 16px; padding: 16px; /* smaller */
border-radius: 6px; border-radius: 6px;
border: 1px solid #ddd; border: 1px solid #ddd;
margin: 0; /* removes extra whitespace */
} }
.page-title { .page-title {
@ -31,9 +36,9 @@
margin-bottom: 15px; margin-bottom: 15px;
} }
/* ----------------------------------------------------------------------- /* ------------------------------
2) ACTIVE BOOK LIST (dashboard left panel) BOOK LIST (optional)
----------------------------------------------------------------------- */ ------------------------------ */
.book-list { .book-list {
display: flex; display: flex;
@ -47,6 +52,7 @@
color: #777; color: #777;
} }
/* List item */
.book-list-item { .book-list-item {
padding: 12px 16px; padding: 12px 16px;
background: #f7f7f7; background: #f7f7f7;
@ -67,6 +73,7 @@
border-color: #1e88e5; border-color: #1e88e5;
} }
/* Title + metadata */
.book-title { .book-title {
font-size: 16px; font-size: 16px;
font-weight: 600; font-weight: 600;
@ -77,15 +84,20 @@
color: #555; color: #555;
} }
/* ----------------------------------------------------------------------- .meta-label {
3) PROGRESS BOX font-weight: 600;
----------------------------------------------------------------------- */ }
/* ------------------------------
PROGRESS BOX
------------------------------ */
.progress-box { .progress-box {
background: #fafafa; background: #fafafa;
border: 1px solid #ddd; border: 1px solid #ddd;
padding: 8px; padding: 18px;
border-radius: 6px; border-radius: 6px;
width: 100%;
} }
.progress-header h2 { .progress-header h2 {
@ -129,35 +141,14 @@
margin-top: 4px; margin-top: 4px;
} }
.book-abort-area { /* ------------------------------
margin-top: 10px; LOG VIEWER LARGE FULL-WIDTH
text-align: right; ------------------------------ */
}
.abort-btn {
padding: 6px 12px;
border-radius: 4px;
border: 1px solid #cc0000;
background: #ff4444;
color: white;
font-size: 12px;
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.abort-btn:hover {
background: #ff2222;
border-color: #aa0000;
}
/* -----------------------------------------------------------------------
4) LOG VIEWER
----------------------------------------------------------------------- */
.log-viewer { .log-viewer {
width: 100%; width: 100%;
max-width: 100%; max-width: 100%;
overflow: hidden; overflow: hidden; /* prevent horizontal overflow */
} }
.log-header { .log-header {
@ -180,11 +171,11 @@
max-height: 75vh; max-height: 75vh;
overflow-y: auto; overflow-y: auto;
overflow-x: hidden; overflow-x: hidden; /* prevent the log from sticking out to the right */
background: #000; background: #000000; /* Pure terminal black */
color: #00ff66; color: #00ff66; /* Matrix / retro green */
border: 1px solid #0f0; border: 1px solid #0f0; /* neon green frame */
border-radius: 6px; border-radius: 6px;
padding: 12px; padding: 12px;
@ -192,39 +183,48 @@
font-size: 13px; font-size: 13px;
line-height: 1.35; line-height: 1.35;
white-space: pre-wrap; white-space: pre-wrap; /* wraps text */
word-break: break-word; word-break: break-word; /* break long links */
} }
/* Base style for all log lines */
.log-line { .log-line {
white-space: pre-wrap; white-space: pre-wrap;
padding: 2px 0; padding: 2px 0;
font-family: "SF Mono", "Consolas", "Courier New", monospace;
} }
/* Subclasses per log type */
.log-line.default { .log-line.default {
color: #00ff66; color: #00ff66; /* green */
} }
.log-line.dl { .log-line.dl {
color: #00ccff; color: #00ccff; /* cyan */
} }
.log-line.parse { .log-line.parse {
color: #ffaa00; color: #ffaa00; /* orange */
} }
.log-line.save { .log-line.save {
color: #ffdd33; color: #ffdd33; /* yellow */
} }
.log-line.audio { .log-line.audio {
color: #ff66ff; color: #ff66ff; /* purple */
} }
.log-line.ctrl { .log-line.ctrl {
color: #66aaff; color: #66aaff; /* light blue */
} }
.log-line.error { .log-line.error {
color: #ff3333; color: #ff3333; /* red */
} }
/* ----------------------------------------------------------------------- /* ------------------------------
5) PLACEHOLDER / FOOTER PLACEHOLDER
----------------------------------------------------------------------- */ ------------------------------ */
.dashboard-placeholder { .dashboard-placeholder {
font-size: 15px; font-size: 15px;
@ -232,7 +232,6 @@
text-align: center; text-align: center;
color: #777; color: #777;
} }
.footer { .footer {
text-align: center; text-align: center;
padding: 12px; padding: 12px;
@ -241,72 +240,23 @@
font-size: 12px; font-size: 12px;
border-top: 1px solid #ddd; border-top: 1px solid #ddd;
} }
/* ----------------------------- .book-abort-area {
DROPDOWN NAVIGATION margin-top: 10px;
------------------------------ */ text-align: right;
/* Container for dropdown */
.nav-dropdown {
position: relative;
}
/* The clickable label ("Tools ▾") */
.nav-dropdown > .nav-item {
cursor: pointer;
} }
/* Hide dropdown by default */ .abort-btn {
.dropdown-menu { padding: 6px 12px;
display: none;
position: absolute;
top: 100%;
right: 0;
background: #fff; /* same background as the navbar */
border: 1px solid #ddd;
padding: 8px 0;
margin: 0;
list-style: none; /* remove list bullets */
border-radius: 4px; border-radius: 4px;
min-width: 160px; border: 1px solid #cc0000;
z-index: 1000; background: #ff4444;
} color: white;
font-size: 12px;
/* Show dropdown when hovering over parent */ cursor: pointer;
.nav-dropdown:hover .dropdown-menu { transition: background 0.2s, border-color 0.2s;
display: block;
}
/* Menu item styling */
.dropdown-menu li {
padding: 0;
margin: 0;
}
.dropdown-menu li a {
display: block;
padding: 8px 16px;
white-space: nowrap;
color: #333;
text-decoration: none;
}
/* Hover state */
.dropdown-menu li a:hover {
background: #f0f0f0;
}
table.kv {
border-collapse: collapse;
margin-bottom: 16px;
}
table.kv th {
text-align: left;
padding-right: 12px;
color: #777;
font-weight: normal;
} }
table.kv td { .abort-btn:hover {
font-weight: 500; background: #ff2222;
border-color: #aa0000;
} }

@ -1,145 +0,0 @@
/* ============================================================
File: static/js/bookcard_controller.js
Purpose:
Single owner for updating book-card DOM from merged state
(would_merge_to)
============================================================ */
console.log("[BOOKCARD] controller loaded");
/* ============================================================
ENTRY POINT (called by state_updater.js)
============================================================ */
function updateBookCardsFromState(stateList) {
console.log("[BOOKCARD] updateBookCardsFromState called");
if (!Array.isArray(stateList)) {
console.warn("[BOOKCARD] Invalid stateList", stateList);
return;
}
const stateById = {};
stateList.forEach((entry) => {
const merged = entry.would_merge_to;
if (!merged || merged.book_idx == null) {
console.warn("[BOOKCARD] entry without merged/book_idx", entry);
return;
}
stateById[String(merged.book_idx)] = merged;
});
document.querySelectorAll(".book-card").forEach((card) => {
const bookIdx = card.dataset.bookIdx;
const state = stateById[bookIdx];
if (!state) {
console.debug("[BOOKCARD] No state for book_idx", bookIdx);
return;
}
console.log("[BOOKCARD] Updating card", bookIdx, state.status);
updateSingleBookCard(card, state);
});
}
/* ============================================================
SINGLE CARD UPDATE
============================================================ */
function updateSingleBookCard(card, state) {
console.log("[BOOKCARD] updateSingleBookCard", state.book_idx);
updateStatus(card, state);
updateStatusBadge(card, state);
updateButtons(card, state);
updateProgress(card, state);
}
/* ============================================================
STATUS
============================================================ */
function updateStatus(card, state) {
console.log("[BOOKCARD][STATUS]", state.book_idx, "→", state.status);
card.className = `book-card ${state.status || ""}`;
}
function updateStatusBadge(card, state) {
const badge = card.querySelector(".status-badge");
if (!badge) return;
const status = (state.status || "").toLowerCase();
badge.textContent = status.toUpperCase();
badge.className = `status-badge status-${status}`;
badge.title =
{
downloading: "Bezig met downloaden",
audio: "Downloads compleet, audio wordt gegenereerd",
done: "Alle chapters en audio zijn compleet",
}[status] || "";
}
/* ============================================================
BUTTONS
============================================================ */
function updateButtons(card, state) {
const startBtn = card.querySelector(".icon-start");
const abortBtn = card.querySelector(".icon-abort");
const busy = ["starting", "downloading", "parsing", "audio"];
console.log("[BOOKCARD][BUTTONS]", state.book_idx, "status:", state.status);
if (startBtn) {
// startBtn.disabled = busy.includes(state.status);
}
if (abortBtn) {
abortBtn.disabled = !busy.includes(state.status);
}
}
/* ============================================================
PROGRESS (DOWNLOAD + AUDIO)
============================================================ */
function updateProgress(card, s) {
const total = Number(s.chapters_total || 0);
// const downloadDone =
// Number(s.chapters_download_done || 0) +
// Number(s.chapters_download_skipped || 0);
const downloadDone = Number(s.downloaded || 0);
const audioDone = Number(s.audio_done || 0) + Number(s.audio_skipped || 0);
const downloadPct =
total > 0 ? Math.min((downloadDone / total) * 100, 100) : 0;
const audioPct = total > 0 ? Math.min((audioDone / total) * 100, 100) : 0;
console.log("[BOOKCARD][PROGRESS]", s.book_idx, {
total,
downloadDone,
audioDone,
downloadPct,
audioPct,
});
/* ---- DOWNLOAD ---- */
const dlBar = card.querySelector('[data-field="download_pct"]');
const dlText = card.querySelector('[data-field="download_text"]');
if (dlBar) dlBar.style.width = `${downloadPct}%`;
if (dlText) dlText.textContent = `${downloadDone} / ${total}`;
/* ---- AUDIO ---- */
const auBar = card.querySelector('[data-field="audio_pct"]');
const auText = card.querySelector('[data-field="audio_text"]');
if (auBar) auBar.style.width = `${audioPct}%`;
if (auText) auText.textContent = `${audioDone} / ${total}`;
}

@ -1,178 +1,200 @@
/* ======================================================================= /* =======================================================================
File: static/js/dashboard.js File: static/js/dashboard.js
Purpose: Purpose:
- Sidebar selectie Dashboard interactions:
- Start / Abort acties - select book
- UI status updates - refresh logs
- refresh progress
NOTE: NOTE:
- NO polling $ / $$ / autoScroll come from helpers.js
- state_updater.js is authoritative
======================================================================= */ ======================================================================= */
console.log("[DASHBOARD] loaded");
/* --------------------------------------------------------- /* ---------------------------------------------------------
Helpers Simple fetch wrapper
--------------------------------------------------------- */ --------------------------------------------------------- */
async function apiGet(url) { async function apiGet(url) {
console.log("[DASHBOARD][API] GET", url);
try { try {
const r = await fetch(url, { cache: "no-store" }); const r = await fetch(url);
if (!r.ok) { if (!r.ok) return null;
console.warn("[DASHBOARD][API] GET failed", url, r.status);
return null;
}
return await r.json(); return await r.json();
} catch (e) { } catch (e) {
console.error("[DASHBOARD][API] GET error", url, e); console.error("API GET failed:", url, e);
return null; return null;
} }
} }
function safeUpdateLogs(data) {
if (typeof window.updateLogs === "function") {
console.log("[DASHBOARD] updateLogs()");
window.updateLogs(data);
}
}
/* --------------------------------------------------------- /* ---------------------------------------------------------
State Dashboard state
--------------------------------------------------------- */ --------------------------------------------------------- */
let ACTIVE_BOOK_IDX = null; let ACTIVE_BOOK = null;
let REFRESH_INTERVAL = null;
console.log(">>> dashboard.js LOADED");
/* --------------------------------------------------------- /* ---------------------------------------------------------
DOM READY DOM Ready setup
--------------------------------------------------------- */ --------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => { document.addEventListener("DOMContentLoaded", () => {
console.log("[DASHBOARD] DOMContentLoaded"); console.log(">>> dashboard.js DOMContentLoaded");
// =====================================================
bindSidebar(); // GLOBAL FALLBACK POLLING — ALWAYS FETCH LOGS
bindBookCardButtons(); // Runs when no books exist or no selection has been made
// =====================================================
console.log(">>> dashboard.js: enabling global fallback polling");
setInterval(() => {
// if no active book → fetch global logs
if (!ACTIVE_BOOK) {
refreshBook(null); // triggers /logs
}
}, 2000);
const items = $$(".book-list-item");
console.log(">>> dashboard.js found book-list items:", items.length);
// No books → don't start polling
// if (!items || items.length === 0) {
// console.log(">>> dashboard.js: no books present, polling off.");
// return;
// }
// Book selection listener
items.forEach((item) => {
item.addEventListener("click", () => {
console.log(">>> dashboard.js: user clicked book:", item.dataset.bookId);
selectBook(item.dataset.bookId);
});
});
const first = document.querySelector(".book-list-item"); // Auto-select first book
if (first) { if (!ACTIVE_BOOK && items[0]) {
console.log("[DASHBOARD] auto-select", first.dataset.bookIdx); console.log(
selectBook(first.dataset.bookIdx); ">>> dashboard.js: auto-select first book:",
items[0].dataset.bookId
);
selectBook(items[0].dataset.bookId);
} }
}); });
/* --------------------------------------------------------- /* ---------------------------------------------------------
Sidebar Select a book (updates UI + starts polling)
--------------------------------------------------------- */ --------------------------------------------------------- */
function bindSidebar() { function selectBook(bookId) {
console.log("[DASHBOARD] bindSidebar()"); console.log(">>> selectBook(", bookId, ")");
document.querySelectorAll(".book-list-item").forEach((item) => {
item.onclick = () => selectBook(item.dataset.bookIdx); ACTIVE_BOOK = bookId;
// Highlight
$$(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookId === bookId);
}); });
// Reset previous polling
if (REFRESH_INTERVAL) {
console.log(">>> dashboard.js: clearing previous polling interval");
clearInterval(REFRESH_INTERVAL);
}
// Start new polling
console.log(">>> dashboard.js: starting polling for bookId =", bookId);
REFRESH_INTERVAL = setInterval(() => {
refreshBook(ACTIVE_BOOK);
}, 2000);
// Immediate refresh
refreshBook(ACTIVE_BOOK);
} }
setInterval(refreshActiveBooks, 2000);
async function refreshActiveBooks() {
const books = await apiGet("/api/books");
if (!books) return;
const container = $("#book-list");
if (!container) return;
// Rebuild the list
container.innerHTML = "";
books.forEach((b) => {
const div = document.createElement("div");
div.className = "book-list-item";
div.dataset.bookId = b.book_id;
function selectBook(bookIdx) { div.innerHTML = `
if (!bookIdx || bookIdx === ACTIVE_BOOK_IDX) return; <div class="book-title">${b.title}</div>
<div class="book-status">${b.status}</div>
<div class="book-progress">
${b.download_done}/${b.download_total} downloaded,
${b.audio_done}/${b.audio_total} audio
</div>
ACTIVE_BOOK_IDX = bookIdx; <button class="abort-btn" onclick="abortBook('${b.book_id}')">Abort</button>
console.log("[DASHBOARD] selectBook", bookIdx); `;
document.querySelectorAll(".book-list-item").forEach((el) => { // Re-attach the event listener
el.classList.toggle("active", el.dataset.bookIdx === bookIdx); div.addEventListener("click", () => selectBook(b.book_id));
container.appendChild(div);
}); });
refreshBook(bookIdx); // If ACTIVE_BOOK is not yet known → auto-select the first book
if (!ACTIVE_BOOK && books.length > 0) {
selectBook(books[0].book_id);
}
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
Book refresh (NO POLLING) Fetch logs + progress from API
--------------------------------------------------------- */ --------------------------------------------------------- */
async function refreshBook(bookIdx) { async function refreshBook(bookId) {
console.log("[DASHBOARD] refreshBook", bookIdx); console.log(">>> refreshBook(", bookId, ")");
const logs = await apiGet(`/api/book/${bookIdx}/logs`); // 1) If there is NO bookId → fetch only the global logs
if (logs) safeUpdateLogs(logs); if (!bookId) {
console.log(">>> refreshBook: no active book → fetch /logs");
refreshBookCards(); const data = await apiGet("/logs");
} if (data && data.logs) updateLogs(data.logs);
/* --------------------------------------------------------- return; // done
Bookcard buttons }
--------------------------------------------------------- */
function bindBookCardButtons() {
console.log("[DASHBOARD] bindBookCardButtons()");
document.querySelectorAll(".icon-start").forEach((btn) => {
if (btn.dataset.bound) return;
btn.dataset.bound = "1";
btn.onclick = (e) => {
e.preventDefault();
const card = btn.closest(".book-card");
if (!card) return;
startBook(card.dataset.bookIdx);
};
});
document.querySelectorAll(".icon-abort").forEach((btn) => { // 2) If there IS a book → fetch book status + logs
if (btn.dataset.bound) return; const state = await apiGet(`/api/book/${bookId}/status`);
btn.dataset.bound = "1"; const logs = await apiGet(`/api/book/${bookId}/logs`);
btn.onclick = (e) => { console.log(">>> refreshBook state =", state);
e.preventDefault(); console.log(">>> refreshBook logs =", logs);
const card = btn.closest(".book-card");
if (!card) return;
abortBook(card.dataset.bookIdx);
};
});
}
/* --------------------------------------------------------- if (state) updateProgressBars(state);
START if (logs) updateLogs(logs);
--------------------------------------------------------- */
function startBook(bookIdx) {
console.log("[DASHBOARD] START", bookIdx);
fetch("/start", {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_idx=${bookIdx}`,
}).then(() => refreshBook(bookIdx));
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
ABORT Update LOG VIEW panel
--------------------------------------------------------- */ --------------------------------------------------------- */
function abortBook(bookIdx) { function updateLogs(logList) {
if (!confirm(`Abort book ${bookIdx}?`)) return; const output = $("#log-output");
if (!output) {
console.warn(">>> updateLogs: no #log-output element found");
return;
}
console.log("[DASHBOARD] ABORT", bookIdx); output.innerHTML = "";
fetch(`/abort/${bookIdx}`, { method: "POST" }).then(() => logList.forEach((line) => logAppend(line));
refreshBook(bookIdx)
); autoScroll(output);
} }
/* --------------------------------------------------------- function abortBook(book_id) {
Bookcard UI refresh (non-progress) if (!confirm(`Abort tasks for book ${book_id}?`)) return;
--------------------------------------------------------- */
async function refreshBookCards() {
console.log("[DASHBOARD] refreshBookCards()");
const books = await apiGet("/api/books");
if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => { fetch(`/abort/${book_id}`, { method: "POST" })
const idx = card.dataset.bookIdx; .then((r) => r.json())
const info = books.find((b) => b.book_idx === idx); .then((data) => {
if (!info) return; console.log("Abort:", data);
})
console.log("[DASHBOARD] card status", idx, info.status); .catch((err) => {
card.className = `book-card ${info.status}`; console.error("Abort failed:", err);
const abortBtn = card.querySelector(".icon-abort");
if (abortBtn) {
abortBtn.disabled = ![
"processing",
"downloading",
"parsing",
"audio",
].includes(info.status);
}
}); });
} }
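
Both variants post to /abort/<book_idx> (or /abort/<book_id>). A hedged server-side sketch, assuming set_abort() from scraper.abort takes the book identifier and that a small JSON acknowledgement suffices:

```
from flask import jsonify
from scraper.abort import set_abort

@app.route("/abort/<book_idx>", methods=["POST"])
def abort_book_route(book_idx):
    set_abort(book_idx)  # assumed: raises the per-book abort flag workers poll
    return jsonify({"status": "aborting", "book_idx": book_idx})
```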

@ -1,101 +0,0 @@
/* ============================================================
File: static/js/inspect_state.js
Purpose:
- Receive merged state via state_updater.js
- Update ONLY the right-side state tables
- NO polling, NO fetch
============================================================ */
console.log("[inspect_state] JS loaded (subscriber mode)");
/* ------------------------------------------------------------
State subscription
------------------------------------------------------------ */
window.addEventListener("state:update", (e) => {
const entries = e.detail;
if (!Array.isArray(entries)) {
console.warn("[inspect_state] state:update payload is not array", entries);
return;
}
console.log("[inspect_state] state:update received entries:", entries.length);
updateInspectTables(entries);
});
/* ------------------------------------------------------------
Update tables
------------------------------------------------------------ */
function updateInspectTables(entries) {
console.log("[inspect_state] updating tables");
entries.forEach((entry) => {
const bookIdx = entry.book_idx;
if (bookIdx == null) {
console.warn("[inspect_state] entry without book_idx", entry);
return;
}
const block = document.querySelector(
`.state-block[data-book-idx="${bookIdx}"]`
);
if (!block) {
console.warn("[inspect_state] no state-block for book_idx", bookIdx);
return;
}
const table = block.querySelector(".state-table");
if (!table) {
console.warn("[inspect_state] no state-table for book_idx", bookIdx);
return;
}
console.log("[inspect_state] updating table for book_idx", bookIdx);
const sql = entry.sqlite || {};
const redis = entry.redis || {};
const merged = entry.would_merge_to || {};
table.innerHTML = `
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
${row("status", sql, redis, merged)}
${row("chapters_total", sql, redis, merged)}
${row("downloaded", sql, redis, merged)}
${row("chapters_download_done", sql, redis, merged)}
${row("chapters_download_skipped", sql, redis, merged)}
${row("parsed", sql, redis, merged)}
${row("chapters_parsed_done", sql, redis, merged)}
${row("audio_done", sql, redis, merged)}
${row("audio_skipped", sql, redis, merged)}
${row("last_update", sql, redis, merged)}
`;
});
}
/* ------------------------------------------------------------
Row helper
------------------------------------------------------------ */
function row(field, sql, redis, merged) {
const s = sql[field] ?? "";
const r = redis[field] ?? "";
const m = merged[field] ?? "";
const cls = String(s) === String(r) ? "same" : "diff";
return `
<tr>
<th>${field}</th>
<td class="${cls}">${s}</td>
<td class="${cls}">${r}</td>
<td>${m}</td>
</tr>
`;
}

@ -1,105 +1,107 @@
/* ======================================================================= /* =======================================================================
File: static/js/log_view.js File: static/js/log_view.js
Purpose: Purpose:
High-performance rolling log viewer Log viewer functionality:
- efficient delta polling - filtering
- append-only mode (no DOM reset) - clearing
- rolling limit (prevents memory freeze) - auto-scroll
- supports both global logs and per-book logs - delta polling (efficient)
- rolling limit (prevent GUI freeze)
======================================================================= */ ======================================================================= */
console.log(">>> log_view.js LOADING…"); console.log(">>> log_view.js LOADING…");
/* --------------------------------------------------------- /* ---------------------------------------------------------
Global log viewer state Log filtering
--------------------------------------------------------- */ --------------------------------------------------------- */
let LOG_FILTER = "ALL"; let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // delta offset let LAST_LOG_INDEX = -1; // For delta polling
const MAX_LOG_LINES = 600; const MAX_LOG_LINES = 1000; // Rolling cap to prevent freezing
/* ---------------------------------------------------------
Apply filter on existing log lines
--------------------------------------------------------- */
function applyLogFilter() { function applyLogFilter() {
console.log(">>> log_view.js applyLogFilter(), filter =", LOG_FILTER);
const lines = $$(".log-line"); const lines = $$(".log-line");
console.log(">>> log_view.js number of log-line elements:", lines.length);
lines.forEach((line) => { lines.forEach((line) => {
const text = line.innerText; const text = line.innerText;
const show = LOG_FILTER === "ALL" || (text && text.includes(LOG_FILTER)); line.style.display =
line.style.display = show ? "block" : "none"; LOG_FILTER === "ALL" || text.includes(LOG_FILTER) ? "block" : "none";
}); });
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
DOM Ready bind clear/filter UI bindings
--------------------------------------------------------- */ --------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => { document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded"); console.log(">>> log_view.js DOMContentLoaded");
const filterSel = $("#log-filter");
const clearBtn = $("#log-clear"); const clearBtn = $("#log-clear");
const output = $("#log-output"); const output = $("#log-output");
if (!output) { if (!filterSel) {
console.log(">>> log_view.js: No #log-output → viewer disabled"); console.log(">>> log_view.js: No log viewer found on this page.");
return; return;
} }
console.log(">>> log_view.js: log viewer detected.");
// Filter dropdown
// filterSel.addEventListener("change", () => {
// LOG_FILTER = filterSel.value;
// console.log(">>> log_view.js filter changed to:", LOG_FILTER);
// applyLogFilter();
// });
// Clear log window
if (clearBtn) { if (clearBtn) {
clearBtn.addEventListener("click", () => { clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js: Clear log viewer"); console.log(">>> log_view.js log-clear clicked → clearing output");
if (output) {
output.innerHTML = ""; output.innerHTML = "";
LAST_LOG_INDEX = -1; LAST_LOG_INDEX = -1; // reset delta polling
}
}); });
} }
}); });
/* --------------------------------------------------------- /* ---------------------------------------------------------
Append ONE line Append + Rolling buffer
--------------------------------------------------------- */ --------------------------------------------------------- */
function rollingAppend(lineText) { function logAppend(lineText) {
const output = $("#log-output"); const output = $("#log-output");
if (!output) return; if (!output) return;
const div = document.createElement("div"); const div = document.createElement("div");
div.classList.add("log-line"); div.classList.add("log-line");
// Type detection // -----------------------------------------------------
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]")) // Assign subtype classes
// -----------------------------------------------------
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]")) {
div.classList.add("dl"); div.classList.add("dl");
else if (lineText.includes("[PARSE]")) div.classList.add("parse"); } else if (lineText.includes("[PARSE]")) {
else if (lineText.includes("[SAVE]")) div.classList.add("save"); div.classList.add("parse");
else if (lineText.includes("[AUDIO]")) div.classList.add("audio"); } else if (lineText.includes("[SAVE]")) {
else if (lineText.includes("[CTRL]")) div.classList.add("ctrl"); div.classList.add("save");
else if (lineText.includes("[ERROR]")) div.classList.add("error"); } else if (lineText.includes("[AUDIO]")) {
else div.classList.add("default"); div.classList.add("audio");
} else if (lineText.includes("[CTRL]")) {
div.textContent = lineText; div.classList.add("ctrl");
output.appendChild(div); } else if (lineText.includes("[ERROR]")) {
div.classList.add("error");
// Rolling limit } else {
while (output.childNodes.length > MAX_LOG_LINES) { div.classList.add("default");
output.removeChild(output.firstChild);
} }
}
/* --------------------------------------------------------- div.innerText = lineText;
Primary entry: updateLogs() output.appendChild(div);
Accepts:
{ logs:[...], last:N }
OR legacy:
{ lines:[...], last:N }
--------------------------------------------------------- */
function updateLogs(packet) {
const output = $("#log-output");
if (!output || !packet) return;
let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return;
lines.forEach((line) => rollingAppend(line));
// Correct unified delta index handling // Rolling buffer
if (packet.last !== undefined) { while (output.children.length > MAX_LOG_LINES) {
LAST_LOG_INDEX = packet.last; output.removeChild(output.firstChild);
} }
applyLogFilter(); applyLogFilter();
@ -107,8 +109,7 @@ function updateLogs(packet) {
} }
/* --------------------------------------------------------- /* ---------------------------------------------------------
Delta polling global logs ONLY Delta-based log polling
(dashboard.js overrides logs per-book)
--------------------------------------------------------- */ --------------------------------------------------------- */
function pollLogs() { function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`) fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
@ -116,8 +117,8 @@ function pollLogs() {
.then((data) => { .then((data) => {
const lines = data.lines || []; const lines = data.lines || [];
if (lines.length > 0) { if (lines.length > 0) {
lines.forEach((line) => rollingAppend(line)); lines.forEach((line) => logAppend(line));
LAST_LOG_INDEX = data.last; LAST_LOG_INDEX = data.total - 1;
} }
}) })
.catch((err) => { .catch((err) => {
@ -125,6 +126,7 @@ function pollLogs() {
}); });
} }
setInterval(pollLogs, 2800); // Poll every 1800 ms
setInterval(pollLogs, 1800);
console.log(">>> log_view.js LOADED"); console.log(">>> log_view.js LOADED");
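
The two columns disagree on the delta bookkeeping: one stores packet.last, the other data.total - 1. A server sketch that satisfies both returns both fields; get_ui_logs() is the global log buffer accessor, and its exact return shape is an assumption here:

```
from flask import request, jsonify

@app.route("/logs")
def logs_route():
    last = int(request.args.get("last_index", -1))
    all_lines = get_ui_logs()      # assumed: full global log buffer as a list
    fresh = all_lines[last + 1 :]  # only lines the client has not yet seen
    return jsonify(
        {"lines": fresh, "last": len(all_lines) - 1, "total": len(all_lines)}
    )
```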

@ -0,0 +1,72 @@
/* =======================================================================
File: static/js/progress.js
Purpose:
Update progress bars dynamically for the current book.
Expects data from API endpoints via dashboard.js or start.js.
======================================================================= */
console.log(">>> progress.js LOADED");
function updateProgressBars(data) {
console.log(">>> progress.js updateProgressBars() CALLED with:", data);
if (!data) {
console.warn(">>> progress.js: NO DATA RECEIVED");
return;
}
// Data format expected:
// {
// download_done,
// download_total,
// audio_done,
// audio_total
// }
const barDL = $(".progress-bar-fill");
const barAU = $(".progress-bar-fill.audio-fill");
console.log(">>> progress.js barDL =", barDL);
console.log(">>> progress.js barAU =", barAU);
const pctDL =
data.download_total > 0
? (100 * data.download_done) / data.download_total
: 0;
const pctAU =
data.audio_total > 0 ? (100 * data.audio_done) / data.audio_total : 0;
if (barDL) {
barDL.style.width = pctDL.toFixed(1) + "%";
console.log(">>> progress.js updated DL bar to", pctDL.toFixed(1) + "%");
} else {
console.warn(">>> progress.js: barDL NOT FOUND");
}
if (barAU) {
barAU.style.width = pctAU.toFixed(1) + "%";
console.log(">>> progress.js updated AU bar to", pctAU.toFixed(1) + "%");
} else {
console.warn(">>> progress.js: barAU NOT FOUND");
}
// Update textual stats
const stats = $$(".progress-stats span");
console.log(">>> progress.js stats elements found:", stats.length);
// Expected structure: [DL "x/y", DL "pct", AU "x/y", AU "pct"]
if (stats.length >= 4) {
stats[0].innerText = `${data.download_done} / ${data.download_total}`;
stats[1].innerText = pctDL.toFixed(1) + "%";
stats[2].innerText = `${data.audio_done} / ${data.audio_total}`;
stats[3].innerText = pctAU.toFixed(1) + "%";
console.log(">>> progress.js stats updated");
} else {
console.warn(
">>> progress.js: not enough stats spans, found",
stats.length
);
}
}
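
A sketch of the status payload updateProgressBars() expects; the four field names come from the comment above, while the route and the shape returned by get_progress() are assumptions:

```
from flask import jsonify

@app.route("/api/book/<book_idx>/status")
def book_status_route(book_idx):
    s = get_progress(book_idx) or {}  # assumed: per-book state hash as a dict
    total = int(s.get("chapters_total", 0))
    return jsonify(
        {
            "download_done": int(s.get("chapters_download_done", 0)),
            "download_total": total,
            "audio_done": int(s.get("audio_done", 0)),
            "audio_total": total,  # audio tracks the same chapter count
        }
    )
```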

@ -1,98 +0,0 @@
/* ========================================================
File: static/js/state_updater.js
Purpose:
- Poll /api/state/all
- Dispatch merged state to subscribers
(bookcard_controller, inspect_state, others)
- Pause polling when tab inactive
======================================================== */
console.log("[STATE-UPDATER] loaded");
const STATE_POLL_INTERVAL_MS = 2500;
const STATE_ENDPOINT = "/api/state/all";
let STATE_TIMER = null;
/* ========================================================
INIT
======================================================== */
document.addEventListener("DOMContentLoaded", () => {
initStateUpdater();
});
function initStateUpdater() {
const cards = document.querySelectorAll(".book-card");
if (cards.length === 0) {
console.log("[STATE-UPDATER] No bookcards found — skipping");
return;
}
console.log(`[STATE-UPDATER] Starting updater for ${cards.length} bookcards`);
startPolling(true);
document.addEventListener("visibilitychange", () => {
document.hidden ? stopPolling() : startPolling(true);
});
}
/* ========================================================
DISPATCH
======================================================== */
function dispatchState(entries) {
console.debug("[STATE] dispatch", entries.length);
// 1. Bookcards
if (typeof window.updateBookCardsFromState === "function") {
window.updateBookCardsFromState(entries);
}
// 2. Inspect state tables / other subscribers
window.dispatchEvent(new CustomEvent("state:update", { detail: entries }));
}
/* ========================================================
POLLING CONTROL
======================================================== */
function startPolling(immediate = false) {
if (STATE_TIMER) return;
console.log("[STATE-UPDATER] Start polling");
if (immediate) pollState();
STATE_TIMER = setInterval(pollState, STATE_POLL_INTERVAL_MS);
}
function stopPolling() {
if (!STATE_TIMER) return;
console.log("[STATE-UPDATER] Stop polling (tab inactive)");
clearInterval(STATE_TIMER);
STATE_TIMER = null;
}
/* ========================================================
POLL API
======================================================== */
async function pollState() {
if (document.hidden) return;
try {
const resp = await fetch(STATE_ENDPOINT, { cache: "no-store" });
if (!resp.ok) return;
const entries = await resp.json();
if (!Array.isArray(entries)) return;
dispatchState(entries);
} catch (e) {
console.error("[STATE-UPDATER] poll error", e);
}
}
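
Since this poller only reads, the /api/state/all endpoint can simply serialize the read-only inspection shown earlier; a minimal sketch, assuming inspect_books_state() is importable from the web app:

```
from flask import jsonify

@app.route("/api/state/all")
def api_state_all():
    # Read-only: inspect_books_state() never writes to SQLite or Redis.
    return jsonify(inspect_books_state())
```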

@ -20,16 +20,7 @@
<!-- JS --> <!-- JS -->
<script src="/static/js/app.js"></script> <script src="/static/js/app.js"></script>
<script src="/static/js/log_view.js"></script> <script src="/static/js/log_view.js"></script>
<script src="/static/js/progress.js"></script>
<script src="/static/js/dashboard.js"></script> <script src="/static/js/dashboard.js"></script>
<!-- GLOBAL STATE UPDATER -->
<script src="/static/js/state_updater.js"></script>
<script>
document.addEventListener("DOMContentLoaded", () => {
if (typeof initStateUpdater === "function") {
initStateUpdater();
}
});
</script>
</body> </body>
</html> </html>

@ -3,17 +3,17 @@
Purpose: Purpose:
Dashboard view of a single book in the list. Dashboard view of a single book in the list.
Variables come in via: Variables come in via:
book.<field>
The book now uses book_idx exclusively as primary key So all fields must be accessed via "book.<field>".
======================================================================= --> ======================================================================= -->
<div class="book-list-item" data-book-idx="{{ book.book_idx }}"> <div class="book-list-item" data-book-id="{{ book.book_id }}">
<!-- Left area: title + metadata --> <!-- Left area: title + metadata -->
<div class="book-info"> <div class="book-info">
<div class="book-title">{{ book.title }}</div> <div class="book-title">{{ book.title }}</div>
<div class="book-meta"> <div class="book-meta">
<span class="meta-label">IDX:</span> {{ book.book_idx }} {% if <span class="meta-label">ID:</span> {{ book.book_id }} {% if
book.last_update %} book.last_update %}
<span class="meta-separator"></span> <span class="meta-separator"></span>
<span class="meta-label">Updated:</span> {{ book.last_update }} {% endif <span class="meta-label">Updated:</span> {{ book.last_update }} {% endif
@ -56,10 +56,8 @@
<span class="mini-value">{{ pct_au }}%</span> <span class="mini-value">{{ pct_au }}%</span>
</div> </div>
</div> </div>
<!-- Abort button -->
<div class="book-abort-area"> <div class="book-abort-area">
<button class="abort-btn" onclick="abortBookAjax('{{ book.book_idx }}')"> <button class="abort-btn" onclick="abortBook('{{ book.book_id }}')">
Abort Abort
</button> </button>
</div> </div>

@ -1,90 +0,0 @@
{# ============================================================ File:
templates/components/bookcard.html Purpose: A single book card (dumb
component) ============================================================ #}
<div class="book-card {{ b.status }}" data-book-idx="{{ b.book_idx }}">
<!-- HIDE -->
<form
action="/hide/{{ b.book_idx }}"
method="POST"
class="hide-form"
onsubmit="return confirm('Dit boek verbergen?')"
>
<button class="icon-btn icon-hide" title="Verbergen">
<i class="fa-solid fa-xmark"></i>
</button>
</form>
<!-- COVER -->
<div class="book-cover">
{% if b.cover_path %}
<img
src="/{{ b.cover_path }}"
class="book-img"
data-field="cover"
alt="cover"
/>
{% else %}
<div class="book-img placeholder" data-field="cover">?</div>
{% endif %}
</div>
<!-- META -->
<div class="book-meta">
<!-- STATUS BADGE -->
{% if b.status %}
<span
class="status-badge status-{{ b.status }}"
title="
{% if b.status == 'done' %}Alle chapters en audio zijn compleet{% endif %}
{% if b.status == 'audio' %}Downloads compleet, audio wordt nog gegenereerd{% endif %}
{% if b.status == 'downloading' %}Bezig met downloaden{% endif %}
"
>
{{ b.status | upper }}
</span>
{% endif %}
<div class="book-title" data-field="title">{{ b.title }}</div>
<div class="book-author" data-field="author">{{ b.author }}</div>
<div class="book-created">
Geregistreerd: <span data-field="created_at">{{ b.created_at }}</span>
</div>
<!-- ACTIONS -->
<div class="book-actions">
<!-- START -->
<form action="/start" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-start" title="Start" data-action="start">
<i class="fa-solid fa-play"></i>
</button>
</form>
<!-- ABORT -->
<form action="/abort/{{ b.book_idx }}" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-abort" title="Abort" data-action="abort">
<i class="fa-solid fa-stop"></i>
</button>
</form>
<form
method="post"
action="/inspect/statuscheck/{{ b.book_idx }}"
style="display: inline-block"
>
<button
type="submit"
class="icon-btn icon-statuscheck"
title="Herbereken status op basis van bestanden"
>
<i class="fa-solid fa-magnifying-glass-chart"></i>
</button>
</form>
</div>
</div>
<!-- PROGRESS -->
<div class="book-progress">{% include "components/progress_box.html" %}</div>
</div>

@ -1,16 +1,16 @@
<!-- ======================================================================= <!-- =======================================================================
File: templates/components/nav.html File: templates/components/nav.html
Purpose: Global navigation bar for BookScraper UI (improved version) Purpose: Global navigation bar for BookScraper UI
======================================================================= --> ======================================================================= -->
<nav class="navbar"> <nav class="navbar">
<div class="nav-inner"> <div class="nav-inner">
<!-- Branding / Home --> <!-- Left side: Branding -->
<div class="nav-brand"> <div class="nav-brand">
<a href="/">BookScraper</a> <a href="/">BookScraper</a>
</div> </div>
<!-- Main navigation --> <!-- Right side: Navigation Links -->
<ul class="nav-links"> <ul class="nav-links">
<li> <li>
<a href="/dashboard" class="nav-item"> Dashboard </a> <a href="/dashboard" class="nav-item"> Dashboard </a>
@ -21,19 +21,7 @@
</li> </li>
<li> <li>
<a href="/debug/inspect_state" class="nav-item"> State overview </a> <a href="/logs" class="nav-item"> Logs </a>
</li>
<!-- Tools dropdown -->
<li class="nav-dropdown">
<span class="nav-item">Tools ▾</span>
<ul class="dropdown-menu">
<li><a href="/api/db/books">DB Viewer</a></li>
<li><a href="/debug/inspect_state">Inspect State</a></li>
<li><a href="/debug/sync_state">Sync State</a></li>
<li><a href="/debug/redis-keys">Redis Keys</a></li>
<li><a href="/debug/queues">queues</a></li>
</ul>
</li> </li>
</ul> </ul>
</div> </div>

@ -1,34 +1,61 @@
<!-- ======================================================================= <!-- =======================================================================
File: templates/components/progress_box.html File: templates/components/progress_box.html
Purpose: Purpose: Reusable progress overview (download + audio) for any book.
Dumb progress UI for a book card. Notes:
Initial values via Jinja, live updates via state_updater.js - Expects the following variables from Flask:
======================================================================= --> book_id: str
title: str
download_total: int
download_done: int
audio_total: int
audio_done: int
- Pure HTML; JS for live updates will be added later.
======================================================================= -->
<div class="progress-box"> <div class="progress-box">
<!-- DOWNLOAD --> <!-- Header -->
<div class="progress-row"> <div class="progress-header">
<div class="progress-label">Download</div> <h2>Progress</h2>
<div class="progressbar"> {% if title %}
<div <div class="progress-subtitle">{{ title }}</div>
class="progressbar-fill download" {% endif %} {% if book_id %}
data-field="download_pct" <div class="progress-bookid">Book ID: <span>{{ book_id }}</span></div>
style="width: 0%" {% endif %}
></div> </div>
<div class="progressbar-text" data-field="download_text">0 / 0</div>
<!-- DOWNLOAD SECTION -->
<div class="progress-section">
<h3>Download Progress</h3>
<div class="progress-bar">
{% set pct = 0 %} {% if download_total > 0 %} {% set pct = (100 *
download_done / download_total) | round(1) %} {% endif %}
<div class="progress-bar-fill" style="width: {{ pct }}%;"></div>
</div>
<div class="progress-stats">
<span>{{ download_done }} / {{ download_total }}</span>
<span>{{ pct }}%</span>
</div> </div>
</div> </div>
<!-- AUDIO --> <!-- AUDIO SECTION -->
<div class="progress-row"> <div class="progress-section">
<div class="progress-label">Audio</div> <h3>Audio Progress</h3>
<div class="progressbar">
<div class="progress-bar audio">
{% set pct2 = 0 %} {% if audio_total > 0 %} {% set pct2 = (100 *
audio_done / audio_total) | round(1) %} {% endif %}
<div <div
class="progressbar-fill audio" class="progress-bar-fill audio-fill"
data-field="audio_pct" style="width: {{ pct2 }}%;"
style="width: 0%"
></div> ></div>
<div class="progressbar-text" data-field="audio_text">0 / 0</div> </div>
<div class="progress-stats">
<span>{{ audio_done }} / {{ audio_total }}</span>
<span>{{ pct2 }}%</span>
</div> </div>
</div> </div>
<script src="/static/js/progress.js"></script>
</div> </div>

@ -1,21 +0,0 @@
{# ============================================================ File:
templates/components/registered_books.html Purpose: Show a grid of
registered books. Each card is rendered via bookcard.html.
============================================================ #}
<section class="dashboard-section">
<h2>Geregistreerde boeken</h2>
{% if registered and registered|length > 0 %}
<div class="registered-grid">
{% for b in registered %} {% include "components/bookcard.html" %} {% endfor
%}
</div>
{% else %}
<p>Geen geregistreerde boeken.</p>
{% endif %}
</section>

@ -5,18 +5,17 @@
Used on landing pages or detail pages. Used on landing pages or detail pages.
======================================================================= --> ======================================================================= -->
<form method="POST" action="/init" class="url-form"> <form method="POST" action="/start" class="url-form">
<label for="urls" class="url-label"> Book URL(s) one per line: </label> <label for="url" class="url-label">Book URL:</label>
<textarea <input
id="urls" type="text"
name="urls" id="url"
name="url"
class="url-input" class="url-input"
rows="5" placeholder="https://www.piaotia.com/bookinfo/6/6072.html"
placeholder="https://www.piaotia.com/bookinfo/6/6072.html
https://www.piaotia.com/bookinfo/3/3785.html"
required required
></textarea> />
<button type="submit" class="btn-primary url-submit">Register book(s)</button> <button type="submit" class="btn-primary url-submit">Start Scraping</button>
</form> </form>

@ -1,7 +1,7 @@
<!-- ======================================================================= <!-- =======================================================================
File: templates/dashboard/book_detail.html File: templates/dashboard/book_detail.html
Purpose: Purpose:
Detail page for a single book_idx. Detail page for a single book_id.
Shows progress (download/audio) + filters + live logs. Shows progress (download/audio) + filters + live logs.
======================================================================= --> ======================================================================= -->
@ -15,9 +15,7 @@
<!-- Progress box --> <!-- Progress box -->
<section id="progressSection"> <section id="progressSection">
{% include "components/progress_box.html" with book_idx=book_idx, {% include "components/progress_box.html" %}
title=title, download_total=download_total, download_done=download_done,
audio_total=audio_total, audio_done=audio_done %}
</section> </section>
<!-- Log view --> <!-- Log view -->
@ -29,10 +27,13 @@
<!-- PAGE-SPECIFIC JS --> <!-- PAGE-SPECIFIC JS -->
<script> <script>
const BOOK_IDX = "{{ book_idx }}"; const BOOK_ID = "{{ book_id }}";
</script> </script>
<!-- Shared log viewer -->
<script src="/static/js/log_view.js"></script> <script src="/static/js/log_view.js"></script>
<!-- Dashboard behaviour (only does something if dashboard HTML is present) -->
<script src="/static/js/dashboard.js"></script> <script src="/static/js/dashboard.js"></script>
<!-- Existing global app logic -->
{% endblock %} {% endblock %}

@ -25,8 +25,19 @@
<!-- =========================================================== <!-- ===========================================================
BOOK LIST BOOK LIST
=========================================================== --> =========================================================== -->
<section class="dashboard-section">
<h2>Actieve boeken</h2>
{% if books and books|length > 0 %}
<div id="book-list" class="book-list">
{% for book in books %} {% include "components/book_list_item.html" %} {%
endfor %}
</div>
{% else %}
<div id="book-list" class="book-list-empty">Geen actieve boeken.</div>
{% endif %}
</section>
{% include "components/registered_books.html" %}
<hr /> <hr />
<!-- =========================================================== <!-- ===========================================================

@ -1,95 +0,0 @@
{# ============================================================ File:
templates/debug/inspect_state.html Purpose: Inspect SQLite vs Redis state per
book_idx - Initial render via Jinja - Live updates via inspect_state.js -
BookCard is server-rendered and NEVER replaced - Only the right-side state table
is updated dynamically
============================================================ #} {% extends
"layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1>
<style>
.state-block {
display: grid;
grid-template-columns: 380px 1fr;
gap: 20px;
margin-bottom: 35px;
padding: 18px;
border: 1px solid #444;
background: #222;
border-radius: 8px;
}
.state-table {
width: 100%;
border-collapse: collapse;
}
.state-table th,
.state-table td {
border: 1px solid #555;
padding: 6px 10px;
}
.state-table th {
background: #333;
color: #fff;
}
.state-table td {
background: #2a2a2a;
color: #ddd;
}
.same {
color: #9f9 !important;
}
.diff {
color: #ff7b7b !important;
font-weight: bold;
}
</style>
<div id="state-container">
{% for entry in results %}
<div class="state-block" data-book-idx="{{ entry.book_idx }}">
<!-- LEFT: BookCard (server-rendered, NEVER replaced) -->
<div>
{% if entry.card %} {% with b = entry.card %} {% include
"components/bookcard.html" %} {% endwith %} {% else %}
<strong>{{ entry.book_idx }}</strong>
{% endif %}
</div>
<!-- RIGHT: State table (updated by JS) -->
<div>
<table class="state-table">
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged
= entry.would_merge_to %} {% for field in [ "status", "chapters_total",
"downloaded", "chapters_download_done", "chapters_download_skipped",
"parsed", "chapters_parsed_done", "audio_done", "audio_skipped",
"last_update" ] %}
<tr>
<th>{{ field }}</th>
<td>{{ sql.get(field, "") }}</td>
<td>{{ redis.get(field, "") }}</td>
<td>{{ merged.get(field, "") }}</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endfor %}
</div>
{% endblock %} {% block scripts %}
<script src="/static/js/inspect_state.js"></script>
{% endblock %}
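The deleted page above compared three views of the same book state. This diff never shows how `entry.would_merge_to` was computed; a plausible sketch, assuming Redis holds live counters that take precedence over the persisted SQLite row when present:

```
# Illustrative SQLite/Redis merge; the precedence rule is an assumption.
FIELDS = [
    "status", "chapters_total", "downloaded", "chapters_download_done",
    "chapters_download_skipped", "parsed", "chapters_parsed_done",
    "audio_done", "audio_skipped", "last_update",
]

def would_merge_to(sqlite_row: dict, redis_state: dict) -> dict:
    merged = {}
    for field in FIELDS:
        live = redis_state.get(field)
        # Prefer the live Redis value, fall back to the SQLite snapshot
        merged[field] = live if live not in (None, "") else sqlite_row.get(field)
    return merged
```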
@ -1,91 +0,0 @@
{% extends "layout.html" %} {% block content %}
<h1>Celery Queue Debug</h1>
<style>
.debug-section {
margin-bottom: 40px;
}
.debug-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}
.debug-table th,
.debug-table td {
border: 1px solid #444;
padding: 6px 10px;
}
.debug-table th {
background: #333;
color: #fff;
}
pre {
background: #1e1e1e;
color: #ddd;
padding: 10px;
overflow-x: auto;
}
code {
color: #9cf;
}
</style>
<div class="debug-section">
<h2>Workers</h2>
<h3>Active Tasks</h3>
<pre>{{ workers_active | tojson(indent=2) }}</pre>
<h3>Reserved</h3>
<pre>{{ workers_reserved | tojson(indent=2) }}</pre>
<h3>Scheduled</h3>
<pre>{{ workers_scheduled | tojson(indent=2) }}</pre>
</div>
<hr />
<div class="debug-section">
<h2>Queues</h2>
{% for q in queues %}
<div class="debug-queue">
<h3>{{ q.name }} ({{ q.length }} items)</h3>
<table class="debug-table">
<tr>
<th>Redis Key</th>
<td>{{ q.redis_key }}</td>
</tr>
<tr>
<th>Length</th>
<td>{{ q.length }}</td>
</tr>
<tr>
<th>Items (first 30)</th>
<td>
{% if q["items"] %}
<ul style="margin: 0; padding-left: 20px">
{% for item in q["items"] %}
<li><code>{{ item | e }}</code></li>
{% endfor %}
</ul>
{% else %}
<i>No items</i>
{% endif %}
</td>
</tr>
</table>
</div>
{% endfor %}
</div>
<script>
setInterval(() => {
window.location.reload();
}, 5000);
</script>
{% endblock %}
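A sketch of how the context for this debug page could be assembled. The Celery inspection calls are real API; the Redis connection parameters and the queue names are assumptions (the default Celery Redis broker stores each queue as a list keyed by the queue name):

```
# Illustrative context builder for the Celery queue debug page.
import redis
from celery_app import celery_app

r = redis.Redis(host="redis", decode_responses=True)  # host is an assumption
insp = celery_app.control.inspect()

context = {
    "workers_active": insp.active() or {},
    "workers_reserved": insp.reserved() or {},
    "workers_scheduled": insp.scheduled() or {},
    "queues": [
        {
            "name": name,
            "redis_key": name,                # default broker key == queue name
            "length": r.llen(name),
            "items": r.lrange(name, 0, 29),   # first 30, as in the template
        }
        for name in ("celery", "download", "m4b")  # queue names are assumed
    ],
}
```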
@ -1,53 +1,34 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="nl"> <html lang="nl">
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8">
<title>BookScraper</title> <title>BookScraper</title>
<style> <style>
body { body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
font-family: Arial, sans-serif; h1 { margin-bottom: 20px; }
padding: 40px;
max-width: 600px;
margin: auto;
}
h1 {
margin-bottom: 20px;
}
input[type="text"] { input[type="text"] {
width: 100%; width: 100%; padding: 12px; font-size: 16px;
padding: 12px; border: 1px solid #ccc; border-radius: 6px;
font-size: 16px;
border: 1px solid #ccc;
border-radius: 6px;
} }
button { button {
margin-top: 20px; margin-top: 20px;
padding: 12px 20px; padding: 12px 20px;
background: #007bff; background: #007bff; color: white;
color: white; border: none; border-radius: 6px;
border: none; font-size: 16px; cursor: pointer;
border-radius: 6px;
font-size: 16px;
cursor: pointer;
}
button:hover {
background: #0056b3;
} }
button:hover { background: #0056b3; }
</style> </style>
</head> </head>
<body> <body>
<h1>BookScraper WebGUI</h1>
<form action="/init" method="POST"> <h1>BookScraper WebGUI</h1>
<label for="url">Geef een boek-URL op:</label><br /><br />
<input <form action="/start" method="POST">
type="text" <label for="url">Geef een boek-URL op:</label><br><br>
id="url" <input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
name="url"
placeholder="https://example.com/book/12345"
required
/>
<button type="submit">Start Scraping</button> <button type="submit">Start Scraping</button>
</form> </form>
</body>
</body>
</html> </html>
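The form's action changed from `/init` to `/start`. The matching handler is not part of this diff; a minimal sketch, assuming the URL is handed to a Celery task by name (the task name `scraper.tasks.start_scrape` and the redirect target are placeholders):

```
# Hypothetical /start handler; task name and redirect are illustrative.
from flask import request, redirect, url_for

@app.route("/start", methods=["POST"])
def start():
    url = request.form["url"]
    # Enqueue by name so the web container needs no worker-side imports
    celery_app.send_task("scraper.tasks.start_scrape", args=[url])
    return redirect(url_for("dashboard"))
```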
@ -1,115 +0,0 @@
{% extends "layout.html" %} {% block content %}
<h2>Statuscheck Inspect</h2>
{% if error %}
<div class="error"><strong>Fout:</strong> {{ error }}</div>
{% else %}
<!-- ===================================================== -->
<!-- BOEK -->
<!-- ===================================================== -->
<h3>Boek</h3>
<table class="kv">
<tr>
<th>Book idx</th>
<td>{{ result.book_idx }}</td>
</tr>
<tr>
<th>Pad</th>
<td>{{ result.filesystem.book_dir }}</td>
</tr>
<tr>
<th>Bestaat</th>
<td>{{ result.filesystem.exists }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- FILESYSTEM -->
<!-- ===================================================== -->
<h3>Filesystem (source of truth)</h3>
<table class="kv">
<tr>
<th>Volumes</th>
<td>{{ result.filesystem.volumes }}</td>
</tr>
<tr>
<th>Chapters (.txt)</th>
<td>{{ result.filesystem.chapters_txt }}</td>
</tr>
<tr>
<th>Audio (.m4b)</th>
<td>{{ result.filesystem.audio_files }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- SQL -->
<!-- ===================================================== -->
<h3>SQL snapshot</h3>
<h4>Voor</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_before.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_before.audio_done }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ result.sql_before.status }}</td>
</tr>
</table>
<h4>Na</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_after.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_after.audio_done }}</td>
</tr>
<tr>
<th>Last update</th>
<td>{{ result.sql_after.last_update }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- REPOSITORY -->
<!-- ===================================================== -->
<h3>Repository merged state (UI input)</h3>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ repo_state.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ repo_state.audio_done }}</td>
</tr>
<tr>
<th>Chapters total</th>
<td>{{ repo_state.chapters_total }}</td>
</tr>
</table>
<details>
<summary>Raw repository state</summary>
<pre>{{ repo_state | tojson(indent=2) }}</pre>
</details>
{% endif %}
<hr />
<a href="/dashboard">← Terug naar dashboard</a>
{% endblock %}
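The deleted page treated the filesystem as the source of truth for those counts. A sketch of the scan, assuming the layout `output/<book_idx>/v<NN>/` with `.txt` chapters and `.m4b` audio files (the layout is inferred from this diff, not confirmed):

```
# Illustrative filesystem scan; the directory layout is an assumption.
from pathlib import Path

def scan_book_dir(output_root: str, book_idx: str) -> dict:
    book_dir = Path(output_root) / book_idx
    return {
        "book_dir": str(book_dir),
        "exists": book_dir.is_dir(),
        "volumes": len(list(book_dir.glob("v*"))),           # v1, v2, ...
        "chapters_txt": len(list(book_dir.rglob("*.txt"))),
        "audio_files": len(list(book_dir.rglob("*.m4b"))),
    }
```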
@ -14,12 +14,6 @@
<!-- CSS --> <!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" /> <link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" /> <link rel="stylesheet" href="/static/css/dashboard.css" />
<link rel="stylesheet" href="/static/css/bookcard.css" />
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css"
/>
<!-- GLOBAL HELPERS (must ALWAYS be loaded before everything else) --> <!-- GLOBAL HELPERS (must ALWAYS be loaded before everything else) -->
<script src="/static/js/helpers.js"></script> <script src="/static/js/helpers.js"></script>
@ -32,11 +26,6 @@
<footer class="footer"> <footer class="footer">
BookScraper © 2025 — Powered by Celery + Redis BookScraper © 2025 — Powered by Celery + Redis
</footer> </footer>
{% block scripts %}{% endblock %}
<script src="/static/js/bookcard_controller.js"></script>
<script src="/static/js/state_updater.js"></script>
<script src="/static/js/dashboard.js"></script>
<!-- GLOBAL APP LOGIC (always loaded last) --> <!-- GLOBAL APP LOGIC (always loaded last) -->
<script src="/static/js/app.js"></script> <script src="/static/js/app.js"></script>
@ -66,7 +66,6 @@
display: none; display: none;
} }
</style> </style>
s
</head> </head>
<body> <body>
@ -1,13 +0,0 @@
#!/bin/sh
# mp4info shim for m4b-tool (ffprobe-based)
if [ -z "$1" ]; then
echo "Usage: mp4info <file>" >&2
exit 1
fi
# ffprobe outputs float seconds; m4b-tool expects an integer
ffprobe -v error \
-show_entries format=duration \
-of default=noprint_wrappers=1:nokey=1 \
"$1" | awk '{ printf "%d\n", ($1 + 0.5) }'
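The shim above rounds ffprobe's float duration to the nearest whole second (e.g. `1234.49` becomes `1234`, `1234.50` becomes `1235`) because m4b-tool expects an integer. The same probe expressed in Python, for comparison; it runs the identical ffprobe invocation:

```
# Python equivalent of the removed mp4info shim.
import subprocess

def duration_seconds(path: str) -> int:
    out = subprocess.run(
        ["ffprobe", "-v", "error",
         "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1",
         path],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # Same rounding as the awk '($1 + 0.5)' truncation trick
    return int(float(out) + 0.5)
```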
@ -5,7 +5,7 @@ import requests
from io import BytesIO from io import BytesIO
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from scraper.logger import log_debug from scraper.logger import log_debug
from scraper.utils.utils import clean_text from scraper.utils import clean_text
from urllib.parse import urljoin from urllib.parse import urljoin
@ -103,11 +103,8 @@ class ChapterDownloader:
collecting = True collecting = True
continue continue
text = ( text = sib.get_text("\n", strip=True) if hasattr(
sib.get_text("\n", strip=True) sib, "get_text") else str(sib).strip()
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text: if text:
parts.append(text) parts.append(text)
@ -124,7 +121,6 @@ class ChapterDownloader:
vdir = f"{output_base}/v{volume}" vdir = f"{output_base}/v{volume}"
import os import os
os.makedirs(vdir, exist_ok=True) os.makedirs(vdir, exist_ok=True)
fname = f"{number:05d}_{title}.txt" fname = f"{number:05d}_{title}.txt"
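One caveat in the chapter-writing code above: `fname = f"{number:05d}_{title}.txt"` uses the scraped title verbatim, so a title containing `/` or other reserved characters would produce an invalid path. A defensive variant; the whitelist is a suggestion, not part of this diff:

```
# Illustrative safe chapter-filename builder.
import os
import re

def chapter_path(output_base: str, volume: int, number: int, title: str) -> str:
    vdir = f"{output_base}/v{volume}"
    os.makedirs(vdir, exist_ok=True)
    # Replace anything outside a conservative whitelist with underscores
    safe_title = re.sub(r"[^\w\- ]+", "_", title).strip()
    return f"{vdir}/{number:05d}_{safe_title}.txt"
```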