Compare commits

...

14 Commits

Author      SHA1        Message                                                     Date
peter.fong  7c2f65bbf4  add m4btool worker, statuscheck and audio completion flow  3 days ago
peter.fong  65842505b0  filesystem-based statuscheck, merged state fixes            5 days ago
peter.fong  3a7cc7687c  inspect state accepted                                      5 days ago
peter.fong  516bca6de5  state bugs                                                  6 days ago
peter.fong  fa2f212e03  inspect_state                                               6 days ago
peter.fong  3a62dfae79  Refactor: unified book_idx state model + controller rewrite + Redis/SQLite sync overhaul  1 week ago
peter.fong  feb8ca60d7  extra book metadata                                         1 week ago
peter.fong  292c9246a1  init werkt.                                                 2 weeks ago
peter.fong  f7f08fa45c  nfs mount                                                   2 weeks ago
peter.fong  7439d26744  create m4b instead of m4a                                   2 weeks ago
peter.fong  5159c32f58  parsing fix+progress half working                           2 weeks ago
peter.fong  7ee6c5e276  abort button                                                2 weeks ago
peter.fong  6d15746738  abort+chaptertitle+dashboardupgrade                         2 weeks ago
peter.fong  16012543ea  templates. done,                                            2 weeks ago

.gitignore

@@ -11,4 +11,6 @@
# Ignore all .env files
.env
**/.env
log.txt

**/static/covers/

@@ -125,8 +125,41 @@ docker run \
```
docker compose down --remove-orphans
docker image prune -f
docker builder prune -af
docker volume prune -f
docker compose build --no-cache
docker compose up
docker compose down
docker compose build
docker compose up
docker compose up -d
docker compose build --no-cache web && docker compose up web
docker compose build worker_download && docker compose up worker_download
docker compose down --remove-orphans
docker compose build --no-cache worker_m4b
docker compose up -d worker_m4b
docker compose up web
docker compose build web
docker compose restart web

tar \
  --exclude="__pycache__" \
  --exclude="*/__pycache__/*" \
  --exclude="*.pyc" \
  --exclude=".venv" \
  --exclude="venv" \
  -czvf project.tar.gz .

docker compose down
docker image rm bookscraper-worker_m4b || true
docker builder prune -af
docker compose build --no-cache worker_m4b
docker compose up -d worker_m4b

@@ -1,138 +1,528 @@
# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
from dotenv import load_dotenv

load_dotenv()

import os
from flask import (
    Flask,
    render_template,
    request,
    jsonify,
    send_from_directory,
    redirect,
    url_for,
)

print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from celery.result import AsyncResult

from db.db import init_db
from db.repository import (
    get_registered_books,
    fetch_book,
    fetch_all_books,
    get_progress,
)

from logbus.publisher import log
from scraper.logger import log_debug
from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis
from scraper.services.init_service import InitService

# INIT DB
init_db()

app = Flask(__name__)


# =====================================================
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")


@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
    """Serve output files such as cover.jpg and volumes."""
    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)


# =====================================================
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
@logcall
def index():
    return redirect(url_for("dashboard"))


@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
    logs_list = get_ui_logs() or []
    registered_books = get_registered_books()
    log(f"[WEB] Registered books: {registered_books}")

    from pprint import pprint

    pprint(fetch_all_books())
    pprint(get_registered_books())

    # reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
    return render_template(
        "dashboard/dashboard.html",
        books=list_active_books(),
        registered=registered_books,
        logs=logs_list,
    )


@app.route("/book/<book_idx>")
@logcall
def book_detail(book_idx):
    title = r.get(f"book:{book_idx}:title") or book_idx
    return render_template(
        "dashboard/book_detail.html",
        book_id=book_idx,
        title=title,
        logs=get_ui_logs(),
    )


# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
# =====================================================
@app.route("/init", methods=["POST"])
@logcall
def init_book():
    # -------------------------------------------------
    # Accept a single URL (legacy) OR multi-line URLs
    # -------------------------------------------------
    raw_urls = request.form.get("urls") or request.form.get("url") or ""
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]

    if not urls:
        return render_template(
            "dashboard/dashboard.html",
            error="Geen URL(s) opgegeven.",
            books=list_active_books(),
            registered=get_registered_books(),
            logs=get_ui_logs(),
        )

    # -------------------------------------------------
    # Duplicate check: existing book_ids
    # -------------------------------------------------
    existing_books = {b["book_idx"] for b in fetch_all_books()}
    results = []

    # -------------------------------------------------
    # Process each URL independently
    # -------------------------------------------------
    for url in urls:
        try:
            book_id = InitService.derive_book_id(url)
            if book_id in existing_books:
                results.append(
                    {
                        "url": url,
                        "status": "skipped",
                        "book_id": book_id,
                        "message": "Al geregistreerd",
                    }
                )
                continue

            result = InitService.execute(url)
            results.append(
                {
                    "url": url,
                    "status": "registered",
                    "book_id": result.get("book_id"),
                    "title": result.get("title"),
                }
            )
        except Exception as e:
            log_debug(f"[INIT] ERROR for url={url}: {e}")
            results.append(
                {
                    "url": url,
                    "status": "error",
                    "error": str(e),
                }
            )

    # -------------------------------------------------
    # Summary message
    # -------------------------------------------------
    ok = sum(1 for res in results if res["status"] == "registered")
    skipped = sum(1 for res in results if res["status"] == "skipped")
    failed = sum(1 for res in results if res["status"] == "error")
    message = f"Geregistreerd: {ok}, overgeslagen: {skipped}, fouten: {failed}"

    reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
    return render_template(
        "dashboard/dashboard.html",
        message=message,
        init_results=results,  # optional, for UI display
        books=list_active_books(),
        registered=reg,
        logs=get_ui_logs(),
    )


@app.route("/hide/<book_idx>", methods=["POST"])
@logcall
def hide_registered_book(book_idx):
    # intentionally left disabled
    pass


@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
    # 1) Form field: book_idx
    book_idx = request.form.get("book_idx")
    log(f"[WEB][START] Received start request for book_idx={book_idx}")

    if not book_idx:
        msg = "book_idx ontbreekt in formulier"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 400

    # 2) Fetch the book from SQLite
    try:
        book = fetch_book(book_idx)
        log(f"[WEB][START] Fetched book from DB: {book}")
    except Exception as e:
        log(f"[WEB][START] DB ERROR: {e}")
        return jsonify({"status": "error", "message": "DB fout"}), 500

    if not book:
        msg = f"Boek '{book_idx}' niet gevonden in DB"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 404

    # 3) The book must have a URL
    url = book.get("book_url")
    if not url:
        msg = f"Boek '{book_idx}' heeft geen book_url in DB"
        log(f"[WEB][START] ERROR: {msg}")
        return jsonify({"status": "error", "message": msg}), 500

    # 4) Reset UI logs
    reset_ui_logs()

    # 5) Logging
    log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
    log_debug(f"[WEB][START] DEBUG: book data = {book}")

    # 6) Dispatch the Celery controller task
    try:
        async_result = celery_app.send_task(
            "scraper.tasks.controller_tasks.start_full_scrape",
            args=[book_idx],
            queue="controller",
        )
    except Exception as e:
        log(f"[WEB][START] Celery ERROR: {e}")
        return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500

    # 7) Successfully dispatched task
    log(f"[WEB][START] Task dispatched: {async_result.id}")

    reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
    return render_template(
        "dashboard/dashboard.html",
        message="Scraping gestart.",
        scraping_task_id=async_result.id,
        books=list_active_books(),
        registered=reg,
        logs=get_ui_logs(),
    )


@app.route("/abort/<book_idx>", methods=["POST"])
@logcall
def abort_download(book_idx):
    log_debug(f"[WEB] Abort requested for book: {book_idx}")
    set_abort(book_idx)
    return jsonify({"status": "ok", "aborted": book_idx})


# =====================================================
# SECTION 3 — API ROUTES (JSON)
# =====================================================
@app.route("/api/state/all", methods=["GET"])
@logcall
def api_state_all():
    """
    Returns the merged SQL + Redis state for all books
    (same logic as /debug/inspect_state but JSON-only).
    """
    from scraper.utils.state_sync import inspect_books_state

    return jsonify(inspect_books_state())


@app.route("/api/books")
@logcall
def api_books():
    return jsonify(list_active_books())


@app.route("/api/book/<book_idx>/status")
@logcall
def api_book_status(book_idx):
    return jsonify(getStatus(book_idx))


@app.route("/api/book/<book_idx>/logs")
@logcall
def api_book_logs(book_idx):
    logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
    return jsonify(logs)


@app.route("/progress/<book_idx>")
@logcall
def progress(book_idx):
    return jsonify(get_progress(book_idx))


@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
    result = AsyncResult(task_id, app=celery_app)
    if result.successful():
        return jsonify({"ready": True, "result": result.get()})
    if result.failed():
        return jsonify({"ready": True, "error": "failed"})
    return jsonify({"ready": False})


@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
    reset_ui_logs()
    return jsonify({"status": "ok"})


@app.route("/logs", methods=["GET"])
@logcall
def logs():
    try:
        last_index = int(request.args.get("last_index", -1))
    except (TypeError, ValueError):
        last_index = -1

    all_logs = get_ui_logs() or []
    new_lines = []
    new_last = last_index
    for idx, line in enumerate(all_logs):
        if idx > last_index:
            new_lines.append(line)
            new_last = idx

    return jsonify({"lines": new_lines, "last": new_last})


from scraper.services.status_check_service import StatusCheckService
from db.repository import get_book_state


@app.route("/inspect/statuscheck/<book_idx>", methods=["POST"])
@logcall
def inspect_statuscheck(book_idx):
    try:
        StatusCheckService.run(book_idx)
        return ("", 204)  # background action, no UI response
    except Exception as e:
        log(f"[STATUSCHECK] ERROR book_idx={book_idx}: {e}")
        return jsonify({"error": str(e)}), 500


# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
    results = sync_books_from_redis()
    return {"status": "ok", "synced": results}


from scraper.utils.state_sync import inspect_books_state


@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
    results = inspect_books_state()
    return render_template("debug/inspect_state.html", results=results)


@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
    cursor = 0
    results = {}
    while True:
        cursor, keys = r.scan(cursor, match="*", count=200)
        for k in keys:
            try:
                results[k] = r.get(k)
            except Exception:
                results[k] = "<non-string value>"
        if cursor == 0:
            break
    return jsonify(results)


# =====================================================
# DB DEBUG
# =====================================================
@app.route("/api/db/books")
@logcall
def api_db_books():
    try:
        books = fetch_all_books()
        return jsonify({"status": "ok", "books": books})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
from urllib.parse import urlparse
import redis


@app.route("/debug/queues")
def debug_queues():
    insp = celery_app.control.inspect()
    workers_active = insp.active() or {}
    workers_scheduled = insp.scheduled() or {}
    workers_reserved = insp.reserved() or {}

    redis_url = os.getenv("REDIS_BROKER")
    parsed = urlparse(redis_url)
    r2 = redis.Redis(
        host=parsed.hostname,
        port=parsed.port,
        db=int(parsed.path.strip("/") or 0),
        decode_responses=True,
    )

    queue_names = ["scraping", "controller", "download", "parse", "save", "audio"]
    queues = []
    for q in queue_names:
        key = f"celery:{q}"
        try:
            queues.append(
                {
                    "name": q,
                    "redis_key": key,
                    "length": r2.llen(key),
                    "items": r2.lrange(key, 0, 30),
                }
            )
        except Exception as e:
            queues.append(
                {
                    "name": q,
                    "redis_key": key,
                    "length": "ERR",
                    "items": [str(e)],
                }
            )

    return render_template(
        "debug/queues.html",
        queues=queues,
        workers_active=workers_active,
        workers_reserved=workers_reserved,
        workers_scheduled=workers_scheduled,
    )


# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
@logcall
def getStatus(book_idx):
    state = r.hgetall(f"book:{book_idx}:state")
    status = state.get("status") or "unknown"
    dl_done = int(state.get("chapters_download_done", 0))
    dl_skipped = int(state.get("chapters_download_skipped", 0))
    dl_total = int(state.get("chapters_total", 0))
    au_done = int(state.get("audio_done") or 0)
    title = state.get("title") or book_idx
    return {
        "book_id": book_idx,
        "title": title,
        "status": status,
        "download_done": dl_done,
        "download_skipped": dl_skipped,
        "download_total": dl_total,
        "audio_done": au_done,
        "audio_total": dl_total,
    }


@logcall
def list_active_books():
    books = []
    for key in r.scan_iter(match="book:*:state", count=1000):
        first = key.find(":")
        second = key.find(":", first + 1)
        book_idx = key[first + 1 : second]
        books.append(getStatus(book_idx))
    return books


# =====================================================
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    host = os.getenv("HOST", "0.0.0.0")
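Every route above is wrapped in @logcall from scraper.logger_decorators, a module this compare does not include. A minimal sketch of what such a decorator presumably looks like, assuming it only logs entry and exit via logbus.publisher.log:

# Hypothetical sketch — scraper/logger_decorators.py is not part of this diff.
import functools

from logbus.publisher import log


def logcall(func):
    """Log entry and exit of the wrapped function via the logbus."""
    @functools.wraps(func)  # preserves __name__, which Flask uses as the endpoint name
    def wrapper(*args, **kwargs):
        log(f"[CALL] {func.__name__} args={args} kwargs={kwargs}")
        result = func(*args, **kwargs)
        log(f"[CALL] {func.__name__} returned")
        return result

    return wrapper

Without functools.wraps, every decorated view would register under the name "wrapper" and Flask would refuse the second route as a duplicate endpoint.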

@@ -54,6 +54,7 @@ def main():
        "-l",
        "INFO",
        "--pool=prefork",
        "--concurrency=2",
    ]

    print("[AUDIO-LOCAL] Launching Celery via subprocess…")

@@ -5,6 +5,9 @@ from dotenv import load_dotenv
print(">>> [celery_app] Loading .env BEFORE initializing Celery...")
load_dotenv()

from db.db import init_db

init_db()  # ensures DB exists for all workers

BROKER = os.getenv("REDIS_BROKER")
BACKEND = os.getenv("REDIS_BACKEND")

@@ -29,6 +32,17 @@ celery_app = Celery(
    ],
)

# Worker logging / event tuning (applies to every worker that imports this app)
celery_app.conf.update(
    worker_redirect_stdouts_level="WARNING",
    task_send_sent_event=False,
    resultrepr_maxsize=0,
    worker_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
    worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
)

celery_app.conf.task_routes = {
    "scraper.tasks.scraping.*": {"queue": "scraping"},
    "scraper.tasks.controller_tasks.*": {"queue": "controller"},

@@ -0,0 +1,160 @@
# ============================================================
# File: db/db.py (UPDATED for book_idx-only architecture)
# Purpose:
# Raw SQLite engine for BookScraper.
# - Connection management
# - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write (now uses book_idx)
# - raw fetch helpers
# ============================================================
import os
import sqlite3
from threading import Lock
DB_PATH = os.environ.get("BOOKSCRAPER_DB", "/app/data/books.db")
# Ensure directory exists
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
# Per-process connection cache
_connection_cache = {}
_connection_lock = Lock()
# ------------------------------------------------------------
# Connection handling
# ------------------------------------------------------------
def get_db():
    pid = os.getpid()
    if pid not in _connection_cache:
        with _connection_lock:
            # Re-check inside the lock so two threads in the same process
            # cannot both create a connection.
            if pid not in _connection_cache:
                conn = sqlite3.connect(DB_PATH, check_same_thread=False)
                conn.row_factory = sqlite3.Row
                enable_wal_mode(conn)
                _connection_cache[pid] = conn
    return _connection_cache[pid]
def enable_wal_mode(conn):
    # NOTE: despite the name, this forces journal_mode=DELETE (WAL disabled),
    # which is the safe choice while the DB file lives on an NFS mount.
    conn.execute("PRAGMA journal_mode=DELETE;")
    conn.execute("PRAGMA synchronous=NORMAL;")
    conn.commit()
# ------------------------------------------------------------
# Schema creation + SAFE schema upgrades
# ------------------------------------------------------------
def init_db():
conn = get_db()
# --------------------------------------------------------
# BASE SCHEMA — book_idx is now PRIMARY KEY
# --------------------------------------------------------
conn.execute(
"""
CREATE TABLE IF NOT EXISTS books (
book_idx INTEGER PRIMARY KEY,
title TEXT,
author TEXT,
description TEXT,
cover_url TEXT,
cover_path TEXT,
book_url TEXT,
chapters_total INTEGER,
status TEXT,
downloaded INTEGER DEFAULT 0,
parsed INTEGER DEFAULT 0,
audio_done INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
processdate DATETIME,
last_update DATETIME
);
"""
)
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE UTILITY
# --------------------------------------------------------
    def add_column(name, type_):
        try:
            conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};")
        except sqlite3.OperationalError:
            pass  # column already exists
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
# --------------------------------------------------------
# UPGRADE NEW FIELDS — future-proof, matched with Redis state model
# --------------------------------------------------------
# (book_idx already exists as PRIMARY KEY — no need to add)
add_column("description", "TEXT")
add_column("cover_path", "TEXT")
add_column("book_url", "TEXT")
# Download counters
add_column("chapters_download_done", "INTEGER DEFAULT 0")
add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
# Audio counters
add_column("audio_skipped", "INTEGER DEFAULT 0")
# Optional future fields
add_column("audio_total", "INTEGER DEFAULT 0")
conn.commit()
# ------------------------------------------------------------
# WRITE OPERATIONS (book_idx-based UPSERT)
# ------------------------------------------------------------
def upsert_book(book_idx, **fields):
"""
UPSERT by book_idx.
Replaces old upsert that used book_id.
"""
conn = get_db()
keys = ["book_idx"] + list(fields.keys())
values = [book_idx] + list(fields.values())
placeholders = ",".join(["?"] * len(values))
updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()])
sql = f"""
INSERT INTO books ({','.join(keys)})
VALUES ({placeholders})
ON CONFLICT(book_idx)
DO UPDATE SET {updates},
last_update = CURRENT_TIMESTAMP;
"""
conn.execute(sql, values)
conn.commit()
# ------------------------------------------------------------
# RAW READ OPERATIONS
# ------------------------------------------------------------
def _raw_get_book(book_idx):
conn = get_db()
row = conn.execute(
"SELECT * FROM books WHERE book_idx = ?;", (book_idx,)
).fetchone()
return dict(row) if row else None
def _raw_get_all_books():
conn = get_db()
cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;")
return [dict(row) for row in cur.fetchall()]
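To make the dynamic UPSERT above concrete, here is the statement a single call generates (values hypothetical):

from db.db import upsert_book

upsert_book("1234", title="Example Book", status="registered")

# Executes, bound to ("1234", "Example Book", "registered"):
#
#   INSERT INTO books (book_idx,title,status)
#   VALUES (?,?,?)
#   ON CONFLICT(book_idx)
#   DO UPDATE SET title = excluded.title, status = excluded.status,
#                 last_update = CURRENT_TIMESTAMP;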

@@ -0,0 +1,320 @@
# ============================================================
# File: db/repository.py
# Purpose:
# Unified façade for BookScraper database state.
#
# Responsibilities:
# - Route metadata → SQLite
# - Route counters → Redis (live) + SQLite (snapshot)
# - Provide a clean API for tasks and Flask UI
# ============================================================
# ============================================================
# UPDATED — canonical read model via get_book_state
# ============================================================
from scraper.logger_decorators import logcall
from logbus.publisher import log
import redis
import os
# ============================================================
# SQL low-level engines (snapshot storage)
# ============================================================
from db.state_sql import (
sql_fetch_book,
sql_fetch_all_books,
sql_set_status,
sql_set_chapters_total,
sql_register_book,
sql_update_book,
)
# ============================================================
# REDIS low-level engines (live counters)
# ============================================================
from db.state_redis import (
redis_set_status,
redis_set_chapters_total,
redis_inc_download_done,
redis_inc_download_skipped,
redis_inc_parsed_done,
redis_inc_audio_done,
redis_inc_audio_skipped,
)
# ============================================================
# Redis client (read-only for legacy + guards)
# ============================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
_r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================
# LEGACY PROGRESS (UI only, unchanged)
# ============================================================
def _legacy_get_progress(book_idx):
return {
"book_idx": book_idx,
"total": int(_r.get(f"progress:{book_idx}:total") or 0),
"completed": int(_r.get(f"progress:{book_idx}:completed") or 0),
"skipped": int(_r.get(f"progress:{book_idx}:skipped") or 0),
"failed": int(_r.get(f"progress:{book_idx}:failed") or 0),
"abort": _r.exists(f"abort:{book_idx}") == 1,
"failed_list": _r.lrange(f"progress:{book_idx}:failed_list", 0, -1),
}
@logcall
def get_progress(book_idx):
return _legacy_get_progress(book_idx)
# ============================================================
# FETCH (SQLite snapshot)
# ============================================================
@logcall
def fetch_book(book_idx):
return sql_fetch_book(book_idx)
@logcall
def fetch_all_books():
return sql_fetch_all_books()
# ============================================================
# INIT / UPDATE METADATA
# ============================================================
@logcall
def register_book(
book_idx,
title,
author=None,
description=None,
cover_url=None,
cover_path=None,
book_url=None,
):
sql_register_book(
book_idx,
{
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"book_url": book_url,
"chapters_total": 0,
"status": "registered",
},
)
@logcall
def update_book_after_full_scrape(
book_idx,
title=None,
author=None,
description=None,
cover_url=None,
chapters_total=None,
):
fields = {}
if title is not None:
fields["title"] = title
if author is not None:
fields["author"] = author
if description is not None:
fields["description"] = description
if cover_url is not None:
fields["cover_url"] = cover_url
if chapters_total is not None:
fields["chapters_total"] = chapters_total
fields["status"] = "active"
sql_update_book(book_idx, fields)
# ============================================================
# STATUS
# ============================================================
@logcall
def set_status(book_idx, status):
redis_set_status(book_idx, status)
sql_set_status(book_idx, status)
# ============================================================
# TOTALS
# ============================================================
@logcall
def set_chapters_total(book_idx, total):
redis_set_chapters_total(book_idx, total)
sql_set_chapters_total(book_idx, total)
# ============================================================
# COUNTERS — WRITE ONLY
# ============================================================
@logcall
def inc_download_done(book_idx, amount=1):
redis_inc_download_done(book_idx, amount)
@logcall
def inc_download_skipped(book_idx, amount=1):
redis_inc_download_skipped(book_idx, amount)
@logcall
def inc_parsed_done(book_idx, amount=1):
redis_inc_parsed_done(book_idx, amount)
@logcall
def inc_audio_done(book_idx, amount=1):
redis_inc_audio_done(book_idx, amount)
@logcall
def inc_audio_skipped(book_idx, amount=1):
redis_inc_audio_skipped(book_idx, amount)
# ============================================================
# CANONICAL READ MODEL
# ============================================================
@logcall
def get_book_state(book_idx):
"""
Canonical merged read model.
Rules:
- SQL = snapshot baseline
- Redis = live counters
- merged = max(sql, redis)
- capped at chapters_total
"""
sqlite_row = sql_fetch_book(book_idx) or {}
redis_state = _r.hgetall(f"book:{book_idx}:state") or {}
def _int(v):
try:
return int(v)
except Exception:
return 0
chapters_total = _int(sqlite_row.get("chapters_total"))
# SQL snapshot
sql_downloaded = _int(sqlite_row.get("downloaded"))
sql_audio_done = _int(sqlite_row.get("audio_done"))
sql_audio_skipped = _int(sqlite_row.get("audio_skipped"))
# Redis live
redis_downloaded = _int(redis_state.get("chapters_download_done")) + _int(
redis_state.get("chapters_download_skipped")
)
redis_audio_done = _int(redis_state.get("audio_done"))
redis_audio_skipped = _int(redis_state.get("audio_skipped"))
# Merge
merged_downloaded = max(sql_downloaded, redis_downloaded)
merged_audio_done = max(sql_audio_done, redis_audio_done)
merged_audio_skipped = max(sql_audio_skipped, redis_audio_skipped)
if chapters_total > 0:
merged_downloaded = min(merged_downloaded, chapters_total)
merged_audio_done = min(merged_audio_done, chapters_total)
merged_audio_skipped = min(merged_audio_skipped, chapters_total)
audio_completed = merged_audio_done + merged_audio_skipped
# Build state
state = dict(sqlite_row)
state.update(
{
"downloaded": merged_downloaded,
"audio_done": merged_audio_done,
"audio_skipped": merged_audio_skipped,
"chapters_total": chapters_total,
}
)
# Derived status
status = sqlite_row.get("status") or "unknown"
if chapters_total > 0:
if merged_downloaded < chapters_total:
status = "downloading"
elif merged_downloaded == chapters_total and audio_completed < chapters_total:
status = "audio"
elif audio_completed >= chapters_total:
status = "done"
state["status"] = status
return state
# ============================================================
# READ HELPERS (VIA get_book_state ONLY)
# ============================================================
@logcall
def get_chapters_total(book_idx):
return int(get_book_state(book_idx).get("chapters_total", 0))
@logcall
def get_audio_done(book_idx):
return int(get_book_state(book_idx).get("audio_done", 0))
@logcall
def get_audio_completed_total(book_idx):
state = get_book_state(book_idx)
return int(state.get("audio_done", 0)) + int(state.get("audio_skipped", 0))
# ============================================================
# STATUSCHECK GUARD (INTENTIONAL DIRECT REDIS)
# ============================================================
@logcall
def try_trigger_statuscheck(book_idx):
return bool(_r.set(f"book:{book_idx}:statuscheck:triggered", "1", nx=True))
# ============================================================
# ACTIVE / REGISTERED BOOK LISTS (UI API)
# ============================================================
@logcall
def get_registered_books():
"""
Books visible in the 'registered' list in the UI.
"""
all_books = sql_fetch_all_books()
HIDDEN_STATES = {"hidden"}
return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
@logcall
def get_active_books():
"""
Books currently active in the dashboard.
"""
all_books = sql_fetch_all_books()
HIDDEN_STATES = {"hidden", "done"}
return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
@logcall
def store_m4b_error(book_idx: str, volume: str, error_text: str):
"""
Passive storage of m4b errors.
No logic, no retries, no state transitions.
"""
key = f"book:{book_idx}:m4b:errors"
entry = f"{volume}: {error_text}"
_r.rpush(key, entry)
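The merge rules in get_book_state are easiest to check with numbers. A worked example with a hypothetical snapshot:

# SQL snapshot:  chapters_total=100, downloaded=40, audio_done=10, audio_skipped=0
# Redis live:    chapters_download_done=55, chapters_download_skipped=5,
#                audio_done=12, audio_skipped=0
#
# redis_downloaded  = 55 + 5      = 60
# merged_downloaded = max(40, 60) = 60   (<= chapters_total, so no cap applied)
# merged_audio_done = max(10, 12) = 12
# audio_completed   = 12 + 0      = 12
#
# Derived status: merged_downloaded (60) < chapters_total (100) -> "downloading"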

@@ -0,0 +1,130 @@
# ============================================================
# File: db/state_redis.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level Redis counters/state for BookScraper.
# Used ONLY by db.repository façade.
# ============================================================
import os
import time
import redis
from logbus.publisher import log
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# INTERNAL KEY BUILDER
# ------------------------------------------------------------
def _key(book_idx: str) -> str:
return f"book:{book_idx}:state"
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def redis_set_status(book_idx: str, status: str):
log(f"[DB-REDIS] Setting status for {book_idx} to {status}")
key = _key(book_idx)
r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# TOTAL CHAPTERS
# ------------------------------------------------------------
def redis_set_chapters_total(book_idx: str, total: int):
key = _key(book_idx)
r.hset(key, "chapters_total", total)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# DOWNLOAD COUNTERS
# ------------------------------------------------------------
def redis_inc_download_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_download_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_download_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# PARSE COUNTERS
# ------------------------------------------------------------
def redis_inc_parsed_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing parsed done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_parsed_done", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# AUDIO COUNTERS
# ------------------------------------------------------------
def redis_inc_audio_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "audio_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "audio_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# INITIALISE BOOK STATE
# ------------------------------------------------------------
def init_book_state(book_idx: str, title: str, url: str, chapters_total: int):
"""
Initialiseert de complete Redis state voor een nieuw boek.
LET OP:
- Als een key al bestaat NIET resetten (progress behouden).
- Alleen missende velden worden toegevoegd.
"""
key = f"book:{book_idx}:state"
# Bestaat al? Dan vullen we alleen missende velden aan.
exists = r.exists(key)
pipeline = r.pipeline()
# Basis metadata
pipeline.hsetnx(key, "book_id", book_idx)
pipeline.hsetnx(key, "title", title or "")
pipeline.hsetnx(key, "url", url or "")
# State
pipeline.hsetnx(key, "status", "registered")
# Counters
pipeline.hsetnx(key, "chapters_total", chapters_total)
pipeline.hsetnx(key, "chapters_download_done", 0)
pipeline.hsetnx(key, "chapters_download_skipped", 0)
pipeline.hsetnx(key, "chapters_parsed_done", 0)
pipeline.hsetnx(key, "audio_done", 0)
pipeline.hsetnx(key, "audio_skipped", 0)
# Timestamp
pipeline.hset(key, "last_update", int(time.time()))
pipeline.execute()
if exists:
log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_idx}")
else:
log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_idx}")

@@ -0,0 +1,178 @@
# ============================================================
# File: db/state_sql.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level SQLite snapshot layer for BookScraper metadata.
# Used ONLY through db.repository façade.
# ============================================================
import sqlite3
import os
from logbus.publisher import log
# Must match db/db.py
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db")
# ------------------------------------------------------------
# INTERNAL HELPERS
# ------------------------------------------------------------
def _connect():
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
return conn
# ------------------------------------------------------------
# FETCH
# ------------------------------------------------------------
def sql_fetch_book(book_idx):
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
row = cur.fetchone()
conn.close()
return dict(row) if row else None
def sql_fetch_all_books():
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books ORDER BY created_at DESC")
rows = cur.fetchall()
conn.close()
return [dict(r) for r in rows]
# ------------------------------------------------------------
# REGISTER / UPDATE
# ------------------------------------------------------------
def sql_register_book(book_idx, fields: dict):
"""
Insert or replace entire book record.
book_idx is the PRIMARY KEY.
"""
conn = _connect()
cur = conn.cursor()
cols = ", ".join(["book_idx"] + list(fields.keys()))
placeholders = ", ".join(["?"] * (1 + len(fields)))
values = [book_idx] + list(fields.values())
cur.execute(
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})",
values,
)
conn.commit()
conn.close()
def sql_update_book(book_idx, fields: dict):
if not fields:
return
conn = _connect()
cur = conn.cursor()
set_clause = ", ".join([f"{k} = ?" for k in fields])
params = list(fields.values()) + [book_idx]
cur.execute(
f"UPDATE books SET {set_clause} WHERE book_idx = ?",
params,
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def sql_set_status(book_idx, status: str):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET status = ? WHERE book_idx = ?",
(status, book_idx),
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------
def sql_set_chapters_total(book_idx, total: int):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET chapters_total = ? WHERE book_idx = ?",
(total, book_idx),
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------
def sql_inc_downloaded(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET downloaded = COALESCE(downloaded,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_parsed(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET parsed = COALESCE(parsed,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_done(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_done = COALESCE(audio_done,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_skipped(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_skipped = COALESCE(audio_skipped,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
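For reference, the dynamic SET clause in sql_update_book expands as follows (values hypothetical):

from db.state_sql import sql_update_book

sql_update_book("1234", {"status": "active", "chapters_total": 100})

# Executes:
#   UPDATE books SET status = ?, chapters_total = ? WHERE book_idx = ?
# with params ["active", 100, "1234"]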

@@ -5,14 +5,15 @@ services:
  redis:
    image: redis:7
    container_name: bookscraper_redis
    command:
      [
        "redis-server",
        "--save",
        "",
        "--appendonly",
        "no",
        "--stop-writes-on-bgsave-error",
        "no",
      ]
    ports:
      - "6379:6379"

@@ -41,7 +42,8 @@ services:
      - PYTHONUNBUFFERED=1
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    restart: "no"

  # ----------------------------------------------------------
@@ -54,7 +56,8 @@ services:
    container_name: bookscraper_web
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    depends_on:
      redis:
        condition: service_healthy

@@ -77,7 +80,8 @@ services:
    container_name: worker_download
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    depends_on:
      redis:
        condition: service_healthy

@@ -96,7 +100,8 @@ services:
    container_name: worker_parse
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    depends_on:
      redis:
        condition: service_healthy

@@ -115,7 +120,8 @@ services:
    container_name: worker_save
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    depends_on:
      redis:
        condition: service_healthy

@@ -134,7 +140,8 @@ services:
    container_name: worker_scraping
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    depends_on:
      redis:
        condition: service_healthy

@@ -142,3 +149,22 @@ services:
      - .env
    command: celery -A celery_app worker -Q scraping -n scraping@%h -l INFO
    restart: "no"

  # ----------------------------------------------------------
  # M4B Worker (Finalization)
  # ----------------------------------------------------------
  worker_m4b:
    build:
      context: .
      dockerfile: docker/Dockerfile.m4b
    container_name: worker_m4b
    command: celery -A celery_app worker -Q m4b -n m4b@%h -l INFO
    depends_on:
      redis:
        condition: service_healthy
    env_file:
      - .env
    volumes:
      - .:/app
      - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
      - /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
    restart: "no"

@@ -0,0 +1,70 @@
FROM debian:12
ENV DEBIAN_FRONTEND=noninteractive
# ----------------------------------------------------------
# System + PHP (PHP 8.2 native)
# ----------------------------------------------------------
RUN apt-get update && apt-get install -y \
ffmpeg \
curl \
ca-certificates \
bash \
php-cli \
php-intl \
php-json \
php-mbstring \
php-xml \
php-curl \
php-zip \
python3 \
python3-pip \
python3-venv \
\
# build deps for mp4v2
git \
build-essential \
autoconf \
automake \
libtool \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# ----------------------------------------------------------
# Python venv (PEP 668 compliant)
# ----------------------------------------------------------
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:/usr/local/bin:$PATH"
# ----------------------------------------------------------
# Build & install mp4v2 (mp4info)
# ----------------------------------------------------------
WORKDIR /tmp
RUN git clone https://github.com/sandreas/mp4v2 \
&& cd mp4v2 \
&& ./configure \
&& make -j$(nproc) \
&& make install \
&& echo "/usr/local/lib" > /etc/ld.so.conf.d/mp4v2.conf \
&& ldconfig \
&& cd / \
&& rm -rf /tmp/mp4v2
# ----------------------------------------------------------
# Install m4b-tool
# ----------------------------------------------------------
RUN curl -L https://github.com/sandreas/m4b-tool/releases/latest/download/m4b-tool.phar \
-o /usr/local/bin/m4b-tool \
&& chmod +x /usr/local/bin/m4b-tool
# ----------------------------------------------------------
# App
# ----------------------------------------------------------
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
CMD ["bash"]
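The worker task that actually calls m4b-tool is not part of this compare. A hedged sketch of how it might shell out inside this image — the merge subcommand and --output-file flag follow m4b-tool's documented CLI, everything else is illustrative:

import subprocess


def merge_volume_to_m4b(volume_dir: str, out_path: str) -> None:
    """Merge a directory of chapter audio files into a single .m4b."""
    cmd = ["m4b-tool", "merge", volume_dir, "--output-file", out_path]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # e.g. hand the tail of stderr to db.repository.store_m4b_error
        raise RuntimeError(f"m4b-tool failed: {result.stderr[-500:]}")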

@@ -1,9 +1,31 @@
# logbus/publisher.py
import logging
import os

logger = logging.getLogger("logbus")
logger.setLevel(logging.WARNING)

# ============================================================
# FILE LOGGER — log.txt in BOOKSCRAPER_OUTPUT_DIR
# ============================================================
try:
    root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    os.makedirs(root, exist_ok=True)
    file_path = os.path.join(root, "log.txt")

    file_handler = logging.FileHandler(file_path, mode="a", encoding="utf-8")
    file_formatter = logging.Formatter("%(message)s")  # message exactly as given
    file_handler.setFormatter(file_formatter)
    logger.addHandler(file_handler)
except Exception:
    # File logging must never crash the app
    pass


def log(message: str):
    """
@@ -27,3 +49,32 @@ def log(message: str):
        push_ui(message)
    except Exception:
        pass


# ============================================================
# Delta-based log retrieval using Redis indexes
# ============================================================
def get_ui_logs_delta(last_index: int):
    """
    Returns (new_lines, total_count).
    Only returns log lines AFTER last_index.

    Example:
        last_index = 10 → returns logs with Redis indexes 11..end
    """
    # Determine total lines in buffer
    total = r.llen(UI_LOG_KEY)
    if total == 0:
        return [], 0

    # First load OR invalid index → send the entire buffer
    if last_index < 0 or last_index >= total:
        logs = r.lrange(UI_LOG_KEY, 0, -1)
        return logs, total

    # Only new lines
    new_lines = r.lrange(UI_LOG_KEY, last_index + 1, -1)
    return new_lines, total
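get_ui_logs_delta pairs with the /logs?last_index=N route in app.py. A minimal polling client, assuming the web container is reachable on a hypothetical localhost port:

import time

import requests


def tail_ui_logs(base_url: str = "http://localhost:5000"):
    last = -1
    while True:
        payload = requests.get(f"{base_url}/logs", params={"last_index": last}).json()
        for line in payload["lines"]:
            print(line)
        last = payload["last"]  # only lines after this index arrive on the next poll
        time.sleep(2)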

@@ -0,0 +1 @@
Subproject commit 480a73324f53d0d24bea4931c3902097f8e2a663

@@ -25,7 +25,7 @@ def set_total(book_id: str, total: int):
# ------------------------------------------------------------
# COUNTERS (legacy)
# ------------------------------------------------------------
def inc_completed(book_id: str):
    r.incr(f"progress:{book_id}:completed")

@@ -96,6 +96,7 @@ def init_book_state(
        "status": "scraping",
        "chapters_total": chapters_total,
        "chapters_done": 0,
        "chapters_download_skipped": 0,
        "audio_total": 0,
        "audio_done": 0,
        "last_update": now,

@@ -120,7 +121,7 @@ def set_last_update(book_id: str):
# ------------------------------------------------------------
# Chapter counters (new model)
# ------------------------------------------------------------
def set_chapter_total(book_id: str, total: int):
    key = f"book:{book_id}:state"

@@ -128,9 +129,15 @@ def set_chapter_total(book_id: str, total: int):
    set_last_update(book_id)


def inc_chapter_download_skipped(book_id: str):
    key = f"book:{book_id}:state"
    r.hincrby(key, "chapters_download_skipped", 1)
    set_last_update(book_id)


def inc_chapter_done(book_id: str):
    key = f"book:{book_id}:state"
    r.hincrby(key, "chapters_download_done", 1)
    set_last_update(book_id)

@@ -149,6 +156,12 @@ def inc_audio_done(book_id: str):
    set_last_update(book_id)


def inc_audio_skipped(book_id: str):
    key = f"book:{book_id}:state"
    r.hincrby(key, "audio_skipped", 1)
    set_last_update(book_id)


# ------------------------------------------------------------
# Skip reasons
# ------------------------------------------------------------
@@ -171,7 +184,14 @@ def get_state(book_id: str):
    state = r.hgetall(key) or {}

    # Numeric conversions
    numeric_fields = [
        "chapters_total",
        "chapters_download_done",
        "chapters_download_skipped",
        "audio_total",
        "audio_skipped",
        "audio_done",
    ]
    for field in numeric_fields:
        if field in state:
            try:

@@ -1,7 +1,7 @@
import os
import redis

from scraper.logger_decorators import logcall
from scraper.ui_log import push_ui

# ---------------------------------------------------------
@@ -13,55 +13,58 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# Debug mode (optional)
ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1"

# Avoid duplicate spam
_seen_debug_keys = set()


# =========================================================
# INTERNAL DEBUGGING
# =========================================================
def _debug(msg: str):
    print(msg)
    push_ui(msg)


# =========================================================
# ABORT FLAG — unified book_idx
# =========================================================
def set_abort(book_idx: str):
    """Enable abort mode for book_idx."""
    key = f"abort:{book_idx}"
    r.set(key, "1")
    if ABORT_DEBUG:
        _debug(f"[ABORT] SET {key}")


def clear_abort(book_idx: str):
    """Clear abort flag."""
    key = f"abort:{book_idx}"
    r.delete(key)
    if ABORT_DEBUG:
        _debug(f"[ABORT] CLEAR {key}")


def abort_requested(book_idx: str, redis_client=None) -> bool:
    """
    Check whether the abort flag is active for book_idx.

    redis_client:
        - Docker workers → None → use default Redis (r)
        - Local macOS audio worker → passes Redis(host=127.0.0.1)
    """
    client = redis_client or r
    key = f"abort:{book_idx}"
    try:
        exists = client.exists(key)

        if ABORT_DEBUG:
            # Log only once per book
            if key not in _seen_debug_keys:
                try:
                    conn = client.connection_pool.connection_kwargs
@@ -69,54 +72,53 @@ def abort_requested(book_idx: str, redis_client=None) -> bool:
                    port = conn.get("port")
                    db = conn.get("db")
                    _debug(
                        f"[ABORT_DEBUG] first check book_idx={book_idx} "
                        f"redis={host}:{port} db={db}"
                    )
                except Exception:
                    _debug(f"[ABORT_DEBUG] first check book_idx={book_idx}")
                _seen_debug_keys.add(key)

            # Log ACTIVE state
            if exists == 1:
                _debug(f"[ABORT] ACTIVE for {book_idx}")

        return exists == 1
    except Exception as e:
        if ABORT_DEBUG:
            _debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}")
        return False


# =========================================================
# PER-CHAPTER STATE — unified book_idx
# =========================================================
def mark_chapter_started(book_idx: str, chapter_num: int):
    key = f"started:{book_idx}:{chapter_num}"
    r.set(key, "1")


def chapter_started(book_idx: str, chapter_num: int) -> bool:
    key = f"started:{book_idx}:{chapter_num}"
    return r.exists(key) == 1


# =========================================================
# RESET STATE FOR BOOK_IDX
# =========================================================
def reset_book_state(book_idx: str):
    """
    Remove abort flag and all per-chapter started markers.
    """
    # abort flag
    r.delete(f"abort:{book_idx}")

    # chapter markers
    pattern = f"started:{book_idx}:*"
    for k in r.scan_iter(pattern):
        r.delete(k)
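How a download task consumes these primitives is not shown in this compare; a sketch with illustrative names:

from scraper.abort import abort_requested, chapter_started, mark_chapter_started


def download_chapters(book_idx: str, chapters: list):
    for ch in chapters:
        if abort_requested(book_idx):
            break  # stop cleanly between chapters
        if chapter_started(book_idx, ch["num"]):
            continue  # already claimed by another worker
        mark_chapter_started(book_idx, ch["num"])
        fetch_and_save(ch)  # hypothetical helper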

@ -1,202 +1,55 @@
# ============================================================
# File: scraper/book_scraper.py
# Purpose:
#   Backwards-compatible wrapper giving the SAME public API
#   as the old BookScraper, but internally uses ScrapeEngine.
#
#   execute() → full metadata + chapterlist (NO book_idx creation)
#
#   ID management is now handled exclusively by InitService.
# ============================================================

from scraper.logger_decorators import logcall
from scraper.services.scrape_engine import ScrapeEngine


class BookScraper:
    """
    Backwards-compatible BookScraper façade.

    Old responsibilities (metadata, chapters, covers, downloads)
    are now split:

        ScrapeEngine produces metadata + chapterlist
        Download tasks handle download/parse/save
        InitService determines book_idx (single source of truth)

    This wrapper intentionally does NOT generate a book_idx or book_id.
    It only returns metadata/chapters in legacy-compatible dict format.
    """

    @logcall
    def __init__(self, site_scraper, url: str):
        self.site = site_scraper
        self.url = url

    @logcall
    def execute(self):
        """
        Legacy public API:
        return metadata + chapter list EXACTLY as before,
        but without generating any book_id.
        """
        data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)

        # Legacy structure preserved, unchanged:
        return {
            "title": data.get("title"),
            "author": data.get("author"),
            "description": data.get("description"),
            "cover_url": data.get("cover_url"),
            "chapters": data.get("chapters", []),
            "chapters_total": data.get("chapters_total", 0),
            "book_url": data.get("book_url"),  # used later by parse/save tasks
        }

@ -1,62 +1,54 @@
# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters using book_idx
#   Handles:
#     • volume assignment
#     • cover download + replication
#     • script generation
#     • Redis Book State Model init
#     • abort tracking
# =========================================================

from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline

# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen

from logbus.publisher import log
import os
import requests
import shutil

# DEBUG allowed
from scraper.abort import abort_requested
from db.state_redis import init_book_state
from db.repository import set_status, set_chapters_total


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save).
    """

    def __init__(self, book_idx: str, scrape_result: dict):
        self.book_idx = str(book_idx)
        self.scrape_result = scrape_result

        # Metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output folder
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed downstream
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
@ -64,176 +56,120 @@ class DownloadController:
            "book_url": scrape_result.get("book_url"),
        }

        log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")

        # Init Redis Book State Model
        try:
            init_book_state(
                book_id=self.book_idx,
                title=self.title,
                url=self.meta["book_url"],
                chapters_total=len(self.chapters),
            )
        except Exception as e:
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    def download_cover(self):
        if not self.cover_url:
            return log(f"[CTRL] No cover URL for '{self.title}'")

        cover_path = os.path.join(self.book_base, "cover.jpg")
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": self.scrape_result.get("book_url") or "",
        }
        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()
            with open(cover_path, "wb") as f:
                f.write(resp.content)
            log(f"[CTRL] Cover saved: {cover_path}")
        except Exception as e:
            log(f"[CTRL] Cover download failed: {e}")

    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return
        for entry in os.listdir(self.book_base):
            if entry.lower().startswith("volume_"):
                dst = os.path.join(self.book_base, entry, "cover.jpg")
                try:
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated → {dst}")
                except Exception as e:
                    log(f"[CTRL] Cover replication failed: {e}")

    # ---------------------------------------------------------
    def store_cover_in_static(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return
        os.makedirs("static/covers", exist_ok=True)
        dst = os.path.join("static/covers", f"{self.book_idx}.jpg")
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed storing cover: {e}")

    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")

        # Update Redis/SQLite state
        try:
            set_status(self.book_idx, "downloading")
            set_chapters_total(self.book_idx, total)
        except Exception as e:
            log(f"[CTRL_STATE] Unable to set state: {e}")

        # Download cover
        self.download_cover()

        # Build pipeline tasks
        tasks = []
        for ch in self.chapters:
            num = ch["num"]
            chapter_info = {
                "num": num,
                "url": ch["url"],
                "title": ch.get("title"),
                "volume_path": self.get_volume_path(num),
            }
            tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))

        async_result = group(tasks).apply_async()

        # Replicate cover + place in static
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # Generate scripts (LATE IMPORT to avoid circular)
        try:
            scriptgen.generate_all_scripts(
                self.book_base, self.title, self.meta["author"]
            )
            log("[CTRL] Scripts generated")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

@ -0,0 +1,27 @@
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
# Low-level HTML fetch utility shared by all site scrapers.
# Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
)
}
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
"""
Fetch HTML with a consistent user-agent and encoding.
Returns BeautifulSoup(lxml).
"""
resp = requests.get(url, headers=HEADERS, timeout=timeout)
resp.encoding = encoding
return BeautifulSoup(resp.text, "lxml")
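
A hedged usage sketch (URL and encoding illustrative; GB18030 matches the Piaotian scraper below):

```
from scraper.engine.fetcher import fetch_html

soup = fetch_html("https://www.ptwxz.com/bookinfo/12/12345.html", encoding="GB18030")
h1 = soup.find("h1")
print(h1.get_text(strip=True) if h1 else "no title")
```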

@ -0,0 +1,65 @@
# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
"""
Extract ONLY lightweight metadata:
- title
- author
- description
- cover_url
- chapters_total = 0
"""
soup = fetch_html(url, site_scraper.encoding)
title = site_scraper.parse_title(soup)
author = site_scraper.parse_author(soup)
description = site_scraper.parse_description(soup)
cover_url = site_scraper.parse_cover(soup, url)
return {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"book_url": url,
}
def extract_metadata_full(url: str, site_scraper):
"""
Full scraping (metadata + chapterlist).
Used by the scraping Celery pipeline.
"""
soup = fetch_html(url, site_scraper.encoding)
# metadata
meta = extract_metadata_only(url, site_scraper)
# chapter list
chapter_page_url = site_scraper.extract_chapter_page_url(soup)
chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
chapters = site_scraper.parse_chapter_list(chapter_page_soup)
meta["chapters"] = chapters
return meta
def build_book_id(title: str) -> str:
"""
Canonical book_id generator.
    SCRAPE currently uses title as ID; preserve that behavior.
"""
return title
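
How the two entry points are meant to be used, sketched under the assumption that the site object comes from scraper.sites (URL illustrative):

```
from scraper.engine.parser import extract_metadata_only, extract_metadata_full
from scraper.sites import get_scraper_for_url

url = "https://www.ptwxz.com/bookinfo/12/12345.html"   # illustrative
site = get_scraper_for_url(url)

meta = extract_metadata_only(url, site)    # fast INIT path: no chapter list
full = extract_metadata_full(url, site)    # adds "chapters" for the pipeline
print(meta["title"], len(full["chapters"]))
```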

@ -0,0 +1,33 @@
# ============================================================
# File: scraper/logger_decorators.py
# Purpose: Function-call logging decorator
# ============================================================
from functools import wraps
from scraper.logger import log_debug
def logcall(func):
"""
Decorator: log function name + arguments every time it's called.
Usage: @logcall above any function.
"""
@wraps(func)
def wrapper(*args, **kwargs):
        # Function name
name = func.__qualname__
        # First log line, before execution
# log_debug(f"[CALL] {name} args={args} kwargs={kwargs}")
log_debug(f"[CALL] {name} args={args}")
# log_debug(f"[CALL] {name}")
result = func(*args, **kwargs)
        # Log after execution
# log_debug(f"[RETURN] {name} → {result}")
return result
return wrapper
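
Usage is a plain decorator; a hypothetical example:

```
from scraper.logger_decorators import logcall

@logcall
def add(a: int, b: int) -> int:    # hypothetical function, for illustration
    return a + b

add(2, 3)   # logs: [CALL] add args=(2, 3)
```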

@ -36,7 +36,8 @@
All rights reserved=
Copyright=
飘天文学=
=
…=
# --- Piaotia specific ---
请记住本书域名=
请收藏本书=
@ -53,7 +54,17 @@ Copyright=
章节出错=
点此举报=
举报原因=
求收藏=
推荐票=
www.piaotia.com=
www.piaotian.com=
www.=
www=
.com=
piaotia=
.net=
piaotian=
www.piaotia.com=
# --- Ads / QR / watermark ---
关注公众号=
微信扫一扫=
@ -68,10 +79,17 @@ sponsored=
ADVERTISEMENT=
Advertisment=
Adblock=
bookid=
bookname=
# --- Mode / UI related ---
选择背景颜色=
选择字体大小=
繁體中文=
模式选择=
阅读模式=
冲榜
求票
诸神学徒
感谢各位书友的支持=
您的支持就是我们最大的动力=
感谢各位书友的支持,您的支持就是我们最大的动力=
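
The file format is one needle=replacement pair per line, with an empty right-hand side meaning "delete the needle". A minimal parser sketch, assuming this is roughly what load_replacements in scraper/utils does:

```
def load_replacements(path: str) -> dict:
    """Parse 'needle=replacement' lines; '#' starts a comment line."""
    repl = {}
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.rstrip("\n")
            if not line or line.startswith("#") or "=" not in line:
                continue
            needle, _, replacement = line.partition("=")
            repl[needle] = replacement
    return repl
```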

@ -5,6 +5,7 @@
import os
import stat
from logbus.publisher import log
from scraper.logger_decorators import logcall

TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
@ -35,7 +36,8 @@ def detect_volumes(book_base: str):
        except Exception:
            continue
    vols.sort()
    return vols


# ------------------------------------------------------------
@ -43,13 +44,40 @@ def detect_volumes(book_base: str):
# ------------------------------------------------------------
def build_merge_block(title: str, author: str, volumes):
    lines = []

    # --------------------------------------------------------
    # Normalize input (defensive)
    # --------------------------------------------------------
    title = (title or "").strip()
    author = (author or "").strip()

    total_vols = len(volumes)

    # Padding rule:
    # - always at least 2 digits (01, 02)
    # - 3 digits at >= 100 volumes
    if total_vols >= 100:
        pad = 3
    else:
        pad = 2

    for num, dirname in volumes:
        vol_num = f"{num:0{pad}d}"      # for the filename
        series_part = f"{num:0{pad}d}"  # for the series part (a string!)
        line = (
            f"m4b-tool merge --jobs=4 "
            f'--writer="{author}" '
            f'--sortalbum="{title}" '
            f'--albumartist="{author}" '
            f'--album="{title}" '
            f'--name="{title}" '
            f'--series="{title}" '
            f'--series-part="{series_part}" '
            f'--output-file="{title}-{vol_num}.m4b" '
            f'"{dirname}" -vvv'
        )
        lines.append(line)
    if not lines:
@ -61,7 +89,14 @@ def build_merge_block(title: str, author: str, volumes):
# ------------------------------------------------------------
# Main generator
# ------------------------------------------------------------
@logcall
def generate_all_scripts(book_base: str, title: str, author: str):
    # --------------------------------------------------------
    # Defensive normalize
    # --------------------------------------------------------
    title = (title or "").strip()
    author = (author or "").strip()

    log(f"[SCRIPTGEN] Generating scripts in {book_base}")

    # Load templates
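
With two volumes the padding rule gives pad=2, so series parts become "01" and "02". Illustrative input and output (volume tuples as returned by detect_volumes):

```
volumes = [(1, "Volume_001"), (2, "Volume_002")]
lines = build_merge_block("BookTitle", "AuthorName", volumes)
# lines[0] ≈ m4b-tool merge --jobs=4 --writer="AuthorName" --sortalbum="BookTitle"
#            --albumartist="AuthorName" --album="BookTitle" --name="BookTitle"
#            --series="BookTitle" --series-part="01"
#            --output-file="BookTitle-01.m4b" "Volume_001" -vvv
```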

@ -0,0 +1,94 @@
# ============================================================
# File: scraper/services/audio_completion.py
# Purpose:
# Orchestration hook after audio completion.
#
# Rules (STRICT):
# - ALWAYS read via get_book_state()
# - Use ONLY merged counters from repository
# - NO usage of derived status field
# - Completion rule:
# audio_completed < chapters_total → NOT DONE
# ============================================================
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_book_state,
try_trigger_statuscheck,
)
from scraper.services.status_check_service import StatusCheckService
from scraper.tasks.m4b_tasks import queue_m4b_for_book
@logcall
def trigger_audio_completion_check(book_idx: str):
"""
Called after inc_audio_done() OR inc_audio_skipped().
Flow:
1. Fetch canonical merged state from repository
2. Evaluate completion via merged counters ONLY
3. Run filesystem validation (authoritative)
4. Apply idempotency guard
5. Queue m4b exactly once
"""
try:
# ----------------------------------------------------
# STEP 1 — CANONICAL MERGED STATE
# ----------------------------------------------------
state = get_book_state(book_idx)
chapters_total = int(state.get("chapters_total", 0))
audio_done = int(state.get("audio_done", 0))
audio_skipped = int(state.get("audio_skipped", 0))
audio_completed = audio_done + audio_skipped
log(
f"[AUDIO-COMPLETION] book={book_idx} "
f"audio_completed={audio_completed} chapters_total={chapters_total}"
)
# ----------------------------------------------------
# STEP 2 — FAST REJECT (MERGED COUNTERS ONLY)
# ----------------------------------------------------
if chapters_total <= 0 or audio_completed < chapters_total:
log(f"[AUDIO-COMPLETION] not yet complete for book={book_idx}")
return
# ----------------------------------------------------
# STEP 3 — FILESYSTEM VALIDATION (AUTHORITATIVE)
# ----------------------------------------------------
result = StatusCheckService.run(book_idx)
fs = result.get("filesystem", {})
audio_files = fs.get("audio_files", 0)
chapters_txt = fs.get("chapters_txt", 0)
effective_audio = audio_files + audio_skipped
if effective_audio < chapters_txt:
log(
f"[AUDIO-COMPLETION] FS validation failed "
f"(audio_files={audio_files}, skipped={audio_skipped}, txt={chapters_txt})"
)
return
# ----------------------------------------------------
# STEP 4 — IDEMPOTENCY GUARD (AFTER FS CONFIRMATION)
# ----------------------------------------------------
if not try_trigger_statuscheck(book_idx):
log(f"[AUDIO-COMPLETION] statuscheck already triggered for {book_idx}")
return
# ----------------------------------------------------
# STEP 5 — FINAL ACTION
# ----------------------------------------------------
log(f"[AUDIO-COMPLETION] DONE → queue m4b for book={book_idx}")
queue_m4b_for_book(book_idx)
except Exception as exc:
# MUST NEVER break audio workers
log(f"[AUDIO-COMPLETION][ERROR] book={book_idx} error={exc}")

@ -0,0 +1,45 @@
# ============================================================
# File: scraper/services/cover_service.py
# ============================================================
import os
import requests
from logbus.publisher import log
from typing import Optional
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> Optional[str]:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.
"""
if not cover_url:
log(f"[COVER] No cover URL for book={book_id}")
return None
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst_path = os.path.join(static_dir, f"{book_id}.jpg")
try:
log(f"[COVER] Downloading: {cover_url}")
resp = requests.get(
cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
)
resp.raise_for_status()
with open(dst_path, "wb") as f:
f.write(resp.content)
log(f"[COVER] Stored: {dst_path}")
return dst_path
except Exception as e:
log(f"[COVER] FAILED ({cover_url}) → {e}")
return None

@ -0,0 +1,95 @@
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_idx
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
from scraper.logger_decorators import logcall
class InitService:
# ------------------------------------------------------------
# BOOK IDX DERIVATION
# ------------------------------------------------------------
@staticmethod
@logcall
def derive_book_id(url: str) -> str:
"""
PTWXZ URL format ends with /{id}.html.
If no match fallback to sanitized URL.
Returns:
book_idx (string)
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
# Fallback — ensures deterministic ID for unknown formats
return url.replace("/", "_").replace(":", "_")
# ------------------------------------------------------------
# MAIN INIT FLOW
# ------------------------------------------------------------
@staticmethod
@logcall
def execute(url: str) -> dict:
"""
INIT entry point.
Returns complete metadata + registration result.
"""
# 1) Resolve site handler
site = SiteResolver.resolve(url)
# 2) Create unified book_idx
book_idx = InitService.derive_book_id(url)
# Some site objects historically expect .book_id — we support it but DO NOT rely on it.
site.book_id = book_idx
# 3) Fetch initial metadata (title/author/description/cover)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
author = meta.get("author")
description = meta.get("description")
cover_url = meta.get("cover_url")
# 4) Download & store main cover for UI
cover_path = CoverService.download_main_cover(cover_url, book_idx)
# 5) Register in SQLite (book_idx is the SOLE primary ID)
register_book(
book_idx=book_idx,
title=title,
author=author,
description=description,
cover_url=cover_url,
cover_path=cover_path,
book_url=url,
)
# 6) Return metadata for UI / API
return {
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"status": "registered",
}
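
Concretely (URLs illustrative):

```
InitService.derive_book_id("https://www.ptwxz.com/bookinfo/12/12345.html")
# → "12345"
InitService.derive_book_id("https://example.org/book?id=7")
# → "https___example.org_book?id=7"  (deterministic fallback)
```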

@ -0,0 +1,286 @@
# ============================================================
# File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose:
# Unified scraping engine for INIT-flow and Celery tasks.
# ScrapeEngine does NOT determine book_idx itself.
# ============================================================
import os
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from logbus.publisher import log
from scraper.logger import log_debug
from scraper.logger_decorators import logcall
from scraper.utils.utils import load_replacements
class ScrapeEngine:
"""
Central scraping engine.
Metadata + chapterlist scraping.
All methods logged with @logcall.
IMPORTANT:
- ScrapeEngine NEVER decides book_idx.
- No dependency on InitService (prevents circular import).
"""
# ------------------------------------------------------------
# REPLACEMENTS LOADER
# ------------------------------------------------------------
@staticmethod
@logcall
def _apply_replacements(site):
fp = os.path.join(os.getcwd(), "replacements.txt")
extra = load_replacements(fp)
if not hasattr(site, "replacements"):
site.replacements = {}
site.replacements.update(extra)
return True
# ------------------------------------------------------------
# RATE LIMITER
# ------------------------------------------------------------
MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
@staticmethod
@logcall
def _throttle(last_time=[0]):
now = time.time()
elapsed = now - last_time[0]
if elapsed < ScrapeEngine.MIN_DELAY:
time.sleep(ScrapeEngine.MIN_DELAY - elapsed)
last_time[0] = time.time()
return True
# ------------------------------------------------------------
# HTTP GET
# ------------------------------------------------------------
@staticmethod
@logcall
def _get_doc(url: str, site):
attempt = 1
while True:
ScrapeEngine._throttle()
log_debug(f"[SCRAPER] GET {url} (attempt {attempt})")
try:
resp = requests.get(
url,
headers={"User-Agent": "Mozilla/5.0"},
timeout=10,
)
except Exception as e:
log_debug(f"Network error {e} → retry {attempt + 1}s")
time.sleep(attempt + 1)
attempt += 1
continue
code = resp.status_code
if code == 200:
resp.encoding = getattr(site, "encoding", "utf-8")
return BeautifulSoup(resp.text, "lxml")
if code == 429:
cooldown = 60
log_debug("429 detected — cooldown 60s")
for i in range(cooldown, 0, -1):
log_debug(f" cooldown {i}s…")
time.sleep(1)
attempt += 1
continue
if code in (403, 500):
wait = min(5 * attempt, 30)
log_debug(f"HTTP {code} → retry in {wait}s")
time.sleep(wait)
attempt += 1
continue
wait = attempt + 1
log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
time.sleep(wait)
attempt += 1
# ------------------------------------------------------------
# PARSER HELPERS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_title(soup):
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownTitle"
@staticmethod
@logcall
def _parse_author(soup):
td = soup.find("td", string=lambda t: t and "" in t)
if td and "" in td.get_text():
return td.get_text(strip=True).split("")[1]
return "UnknownAuthor"
@staticmethod
@logcall
def _parse_description(soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
txt = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if txt:
parts.append(txt)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSER (NO InitService dependency)
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_cover(soup, site):
"""
        Extract the book index from the URL heuristically instead of via InitService
        (prevents a circular import).
"""
# Typical Chinese novel sites embed numeric ID in URL path
try:
parsed = urlparse(site.url)
digits = re.findall(r"\d+", parsed.path)
book_idx = digits[-1] if digits else None
except Exception:
book_idx = None
imgs = soup.find_all("img", src=True)
candidates = []
for img in imgs:
src = img["src"].strip()
filename = os.path.basename(src)
if book_idx and book_idx in filename:
candidates.append((filename, src))
if not candidates:
return None
candidates.sort(key=lambda t: len(t[0])) # smallest filename
return urljoin(site.root, candidates[0][1])
# ------------------------------------------------------------
# RESOLVE CHAPTER PAGE
# ------------------------------------------------------------
@staticmethod
@logcall
def _resolve_chapter_page(soup, site):
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
if not node:
raise ValueError("Could not locate chapter list base node")
href = node.select_one("a").get("href")
url = urljoin(site.root, href)
parsed = urlparse(url)
basepath = parsed.path.rsplit("/", 1)[0] + "/"
chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}"
return url, chapter_base
# ------------------------------------------------------------
# PARSE CHAPTER LINKS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_chapter_links(soup, chapter_base, selector):
cont = soup.select_one(selector)
if not cont:
return []
items = cont.select("ul li a[href]")
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(chapter_base, href)
chapters.append({"num": idx, "title": title, "url": full})
idx += 1
return chapters
# ============================================================
# PUBLIC APIS
# ============================================================
@staticmethod
@logcall
def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url # needed for cover parsing
return {
"title": ScrapeEngine._parse_title(soup),
"author": ScrapeEngine._parse_author(soup),
"description": ScrapeEngine._parse_description(soup),
"cover_url": ScrapeEngine._parse_cover(soup, site),
"book_url": url,
}
@staticmethod
@logcall
def fetch_metadata_and_chapters(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url
title = ScrapeEngine._parse_title(soup)
author = ScrapeEngine._parse_author(soup)
desc = ScrapeEngine._parse_description(soup)
cover = ScrapeEngine._parse_cover(soup, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
chapters = ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
return {
"title": title,
"author": author,
"description": desc,
"cover_url": cover,
"chapters": chapters,
"chapters_total": len(chapters),
"book_url": url,
}
@staticmethod
@logcall
def fetch_chapterlist(site, url: str):
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
return ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
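
The rate limiter turns MAX_DOWNLOADS_PER_SEC into a minimum inter-request delay (e.g. 2 req/s gives MIN_DELAY of 0.5 s) and sleeps only for whatever remains of that window; the mutable default argument keeps state across calls. The same pattern in isolation:

```
import time

MIN_DELAY = 1.0 / 2   # MAX_DOWNLOADS_PER_SEC=2 → one request per 0.5 s

def throttle(last_time=[0.0]):   # mutable default persists between calls
    elapsed = time.time() - last_time[0]
    if elapsed < MIN_DELAY:
        time.sleep(MIN_DELAY - elapsed)
    last_time[0] = time.time()

for _ in range(3):
    throttle()                   # at most two iterations per second
```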

@ -0,0 +1,20 @@
# ============================================================
# File: scraper/services/site_resolver.py
# Purpose:
# Determine which BookSite implementation applies for a given URL.
# This keeps INIT-flow and SCRAPE-flow site-agnostic.
# ============================================================
from scraper.sites import BookSite # current PTWXZ implementation
class SiteResolver:
"""
Resolves the correct BookSite class based on URL.
Currently only PTWXZ/Piaotian is supported.
"""
@staticmethod
def resolve(url: str):
# Later: add more domain rules for other sources
return BookSite()

@ -0,0 +1,135 @@
# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
#   Manual, idempotent status check per book.
#
#   Determines, based on the filesystem:
#   - the number of downloaded chapters (.txt)
#   - the number of generated audio files (.m4b)
#
#   and writes this validated reality back to SQL.
#
#   NOTE:
#   - No Redis
#   - No Celery
#   - No status transitions
#   - No pipeline logic
# ============================================================
import os
from datetime import datetime
from typing import Dict, Any
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.state_sql import sql_fetch_book, sql_update_book
class StatusCheckService:
"""
    Filesystem-based status check.
Single source of truth = disk.
"""
@staticmethod
@logcall
def run(book_idx: str) -> Dict[str, Any]:
"""
        Run the status check for a single book.
        Returns an inspectable dict with:
        - filesystem counts
        - SQL before / after snapshot
"""
# ----------------------------------------------------
        # 1. SQL fetch (does the book exist?)
# ----------------------------------------------------
sql_before = sql_fetch_book(book_idx)
if not sql_before:
raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")
# ----------------------------------------------------
        # 2. Determine filesystem root
# ----------------------------------------------------
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
title = sql_before.get("title")
book_dir = os.path.join(output_root, title)
if not os.path.isdir(book_dir):
log(
f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}')"
)
chapters_txt = 0
audio_files = 0
volumes = 0
else:
chapters_txt = 0
audio_files = 0
volumes = 0
# ------------------------------------------------
# 3. Scan volumes
# ------------------------------------------------
for entry in os.listdir(book_dir):
if not entry.lower().startswith("volume_"):
continue
volumes += 1
volume_path = os.path.join(book_dir, entry)
if not os.path.isdir(volume_path):
continue
# ---- TXT chapters ----
for fname in os.listdir(volume_path):
if fname.lower().endswith(".txt"):
chapters_txt += 1
# ---- Audio ----
audio_dir = os.path.join(volume_path, "Audio")
if os.path.isdir(audio_dir):
for fname in os.listdir(audio_dir):
if fname.lower().endswith(".m4b"):
audio_files += 1
# ----------------------------------------------------
# 4. SQL update (snapshot)
# ----------------------------------------------------
now = datetime.utcnow().isoformat(timespec="seconds")
update_fields = {
"downloaded": chapters_txt,
"audio_done": audio_files,
"last_update": now,
}
sql_update_book(book_idx, update_fields)
sql_after = sql_fetch_book(book_idx)
# ----------------------------------------------------
        # 5. Result for inspect/debug
# ----------------------------------------------------
result = {
"book_idx": book_idx,
"filesystem": {
"book_dir": book_dir,
"exists": os.path.isdir(book_dir),
"volumes": volumes,
"chapters_txt": chapters_txt,
"audio_files": audio_files,
},
"sql_before": sql_before,
"sql_after": sql_after,
"notes": [],
}
log(
f"[STATUSCHECK] book_idx={book_idx} "
f"chapters={chapters_txt} audio={audio_files}"
)
return result
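
A hypothetical result for a half-finished book (all values illustrative):

```
result = StatusCheckService.run("12345")
# {
#   "book_idx": "12345",
#   "filesystem": {"book_dir": "output/BookTitle", "exists": True,
#                  "volumes": 2, "chapters_txt": 300, "audio_files": 120},
#   "sql_before": {...},
#   "sql_after": {...},
#   "notes": [],
# }
```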

@ -0,0 +1,28 @@
# ============================================================
# File: scraper/sites/__init__.py
# Purpose:
# Site autodetection based on URL.
# ============================================================
from scraper.sites.piaotian import PiaotianScraper
def get_scraper_for_url(url: str):
"""
Return the correct scraper instance for a given URL.
Later: add more site implementations.
"""
if "ptwxz" in url or "piaotian" in url:
return PiaotianScraper()
raise ValueError(f"No scraper available for URL: {url}")
# ============================================================
# Backwards-compatibility export for legacy BookScraper
# ============================================================
# Old code expects:
# from scraper.sites import BookSite
# We map that to our new PiaotianScraper implementation.
BookSite = PiaotianScraper

@ -0,0 +1,52 @@
# ============================================================
# File: scraper/sites/base.py
# Purpose:
# Abstract interface that every site-specific scraper must implement.
# ============================================================
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from typing import Optional
class SiteScraper(ABC):
"""
Defines the interface for site-specific scrapers.
Each concrete scraper (Piaotian, Biquge, etc.) must implement these.
"""
@property
@abstractmethod
def root(self) -> str: ...
@property
@abstractmethod
def encoding(self) -> str: ...
@property
@abstractmethod
def chapter_list_selector(self) -> str: ...
# --------------------------
# Metadata extraction
# --------------------------
@abstractmethod
def parse_title(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_author(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: ...
# --------------------------
# Chapter extraction
# --------------------------
@abstractmethod
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_chapter_list(self, soup: BeautifulSoup) -> list: ...

@ -0,0 +1,121 @@
# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import Optional
class PiaotianScraper(SiteScraper):
root = "https://www.ptwxz.com"
encoding = "GB18030"
chapter_list_selector = "div.centent"
# ------------------------------------------------------------
# METADATA PARSING
# ------------------------------------------------------------
def parse_title(self, soup: BeautifulSoup) -> str:
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownBook"
def parse_author(self, soup: BeautifulSoup) -> str:
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
return raw.split("")[1] if "" in raw else "UnknownAuthor"
def parse_description(self, soup: BeautifulSoup) -> str:
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
# stop when next <span> reappears
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSING
    # (exactly the old BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:
return None
book_id = m.group(1)
# Extract vol (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", url)
volume = m2.group(1) if m2 else None
imgs = soup.find_all("img", src=True)
chosen = None
# Priority 1: match "/files/article/image/{vol}/{book_id}/"
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
break
# Priority 2: endswith "/{book_id}s.jpg"
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
break
if not chosen:
return None
return urljoin(self.root, chosen)
# ------------------------------------------------------------
# CHAPTER EXTRACTION
# ------------------------------------------------------------
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
return urljoin(self.root, href)
def parse_chapter_list(self, soup: BeautifulSoup) -> list:
cont = soup.select_one(self.chapter_list_selector)
items = cont.select("ul li a[href]") if cont else []
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full_url = urljoin(self.root, href)
chapters.append({"num": idx, "title": title, "url": full_url})
idx += 1
return chapters

@ -0,0 +1,7 @@
# scraper/state.py
import os
import redis
REDIS_STATE_URL = os.getenv("REDIS_STATE", "redis://redis:6379/2")
state = redis.Redis.from_url(REDIS_STATE_URL, decode_responses=True)

@ -1,5 +1,8 @@
# ============================================================ # ============================================================
# File: scraper/tasks/audio_tasks.py # File: scraper/tasks/audio_tasks.py
# Purpose: Convert chapter text files into audio using macOS
# “say”, with Redis-based slot control.
# Updated: now uses db.repository for audio counters.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
@ -7,57 +10,81 @@ from logbus.publisher import log
import os import os
import subprocess import subprocess
import time import time
import socket
import os
from scraper.abort import abort_requested from scraper.abort import abort_requested
from scraper.logger_decorators import logcall
from redis import Redis from redis import Redis
from urllib.parse import urlparse from urllib.parse import urlparse
from scraper.services.audio_completion import trigger_audio_completion_check
# Kies lokale redis als aanwezig, anders standaard backend # NEW — unified repository façade
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND") from db.repository import (
inc_audio_done,
inc_audio_skipped,
)
parsed = urlparse(redis_url) HOST = socket.gethostname()
# ------------------------------------------------------------ # ------------------------------------------------------------
# REGULIER REDIS CLIENT (slots, file checks, state) # REDIS CLIENT SETUP
# ------------------------------------------------------------ # ------------------------------------------------------------
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# Slot locking Redis client
redis_client = Redis( redis_client = Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=parsed.path.strip("/"), db=parsed.path.strip("/"),
) )
# ------------------------------------------------------------ # Abort + global progress flags always live in DB 0
# BACKEND CLIENT (abort flags, progress counters) - altijd DB 0
# ------------------------------------------------------------
backend_client = Redis( backend_client = Redis(
host=parsed.hostname, host=parsed.hostname,
port=parsed.port, port=parsed.port,
db=0, db=0,
) )
# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300")) AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300"))
AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi") AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi")
AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200")) AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output")
CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output") CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
# ============================================================
# CELERY TASK
# ============================================================
@celery_app.task(bind=True, queue="audio", ignore_result=True) @celery_app.task(bind=True, queue="audio", ignore_result=True)
@logcall
def generate_audio( def generate_audio(
self, book_id, volume_name, chapter_number, chapter_title, chapter_text self, book_id, volume_name, chapter_number, chapter_title, chapter_path
): ):
log(f"[AUDIO] CH{chapter_number}: START task → raw_input={chapter_text}") """
chapter_path: absolute container path to chapter text file.
"""
log(f"[AUDIO]({HOST}) CH{chapter_number}: START → {chapter_title}")
# Abort early # ------------------------------------------------------------
# ABORT CHECK
# ------------------------------------------------------------
if abort_requested(book_id, backend_client): if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT detected → skip CH{chapter_number}") inc_audio_skipped(book_id)
log(f"[AUDIO]({HOST}) ABORT detected → skip CH{chapter_number}")
return return
# ============================================================ # ------------------------------------------------------------
# ACQUIRE AUDIO SLOT # ACQUIRE SLOT
# ============================================================ # ------------------------------------------------------------
slot_key = None slot_key = None
ttl = AUDIO_TIMEOUT + 15 ttl = AUDIO_TIMEOUT + 15
@ -68,11 +95,13 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}") log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}")
break break
# Need to wait
if slot_key is None: if slot_key is None:
log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting...") log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting")
start_wait = time.time() start_wait = time.time()
while slot_key is None: while slot_key is None:
# Try all slots again
for i in range(1, AUDIO_SLOTS + 1): for i in range(1, AUDIO_SLOTS + 1):
key = f"audio_slot:{i}" key = f"audio_slot:{i}"
if redis_client.set(key, "1", nx=True, ex=ttl): if redis_client.set(key, "1", nx=True, ex=ttl):
@ -80,32 +109,32 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait") log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait")
break break
if slot_key: # If still no slot
break if not slot_key:
if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
inc_audio_skipped(book_id)
return
if abort_requested(book_id, backend_client): if time.time() - start_wait > ttl:
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}") log(f"[AUDIO] CH{chapter_number}: Wait timeout → abort audio")
return inc_audio_skipped(book_id)
return
if time.time() - start_wait > ttl: time.sleep(0.25)
log(f"[AUDIO] CH{chapter_number}: Slot wait timeout → aborting audio")
            return
        time.sleep(0.25)

    # ------------------------------------------------------------
    # PATH NORMALISATION
    # ------------------------------------------------------------
    container_path = chapter_text

    if not container_path:
        log(f"[AUDIO] CH{chapter_number}: ERROR — no input file path provided")
        redis_client.delete(slot_key)
        inc_audio_skipped(book_id)
        return

    # Strip container prefix so that host path is resolvable
    if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX):
        relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/")
    else:

@ -114,35 +143,36 @@ def generate_audio(

    parts = relative_path.split("/")
    if len(parts) < 3:
        log(
            f"[AUDIO] CH{chapter_number}: ERROR — cannot parse book/volume from {relative_path}"
        )
        redis_client.delete(slot_key)
        inc_audio_skipped(book_id)
        return

    # book_from_path = parts[0]   # volume_name passed explicitly anyway
    # volume_from_path = parts[1]

    host_path = os.path.join(HOST_PATH, relative_path)

    # ------------------------------------------------------------
    # OUTPUT DIRECTORY
    # ------------------------------------------------------------
    base_dir = os.path.join(HOST_PATH, parts[0], parts[1], "Audio")
    os.makedirs(base_dir, exist_ok=True)

    safe_num = f"{chapter_number:04d}"
    audio_file = os.path.join(base_dir, f"{safe_num}.m4b")

    if os.path.exists(audio_file):
        log(f"[AUDIO] CH{chapter_number}: Already exists → skip")
        redis_client.delete(slot_key)
        inc_audio_skipped(book_id)
        trigger_audio_completion_check(book_id)
        return

    # ------------------------------------------------------------
    # BUILD TTS COMMAND
    # ------------------------------------------------------------
    cmd = (
        f"say --voice={AUDIO_VOICE} "
        f"--input-file='{host_path}' "

@ -153,28 +183,36 @@ def generate_audio(

        f"--data-format=aac"
    )

    log(f"[AUDIO]({HOST}) CH{chapter_number} → output: {audio_file}")

    # ------------------------------------------------------------
    # EXECUTE
    # ------------------------------------------------------------
    try:
        subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)

        # NEW — repository façade
        inc_audio_done(book_id)
        trigger_audio_completion_check(book_id)

        log(f"[AUDIO]({HOST}) CH{chapter_number}: Completed")

    except subprocess.TimeoutExpired:
        log(f"[AUDIO]({HOST}) CH{chapter_number}: TIMEOUT → removing file")
        if os.path.exists(audio_file):
            try:
                os.remove(audio_file)
            except Exception:
                pass
        inc_audio_skipped(book_id)

    except subprocess.CalledProcessError as e:
        log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}")
        inc_audio_skipped(book_id)

    except Exception as e:
        log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}")
        inc_audio_skipped(book_id)

    finally:
        if slot_key:
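
For reference, a minimal sketch of the command this builds, using the same flags as the bulk shell script later in this diff (paths are hypothetical; `say` is macOS-only):

import subprocess

host_path = "/output/MyBook/Volume_001/0001.txt"          # hypothetical input
audio_file = "/output/MyBook/Volume_001/Audio/0001.m4b"   # hypothetical output

cmd = (
    f"say --voice=Sinji "
    f"--input-file='{host_path}' "
    f"--output-file='{audio_file}' "
    f"--file-format=m4bf "
    f"--quality=127 "
    f"-r 200 "
    f"--data-format=aac"
)
subprocess.run(cmd, shell=True, check=True, timeout=600)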

@ -1,81 +1,167 @@
# ============================================================
# File: scraper/tasks/controller_tasks.py
# Purpose:
#   FULL scrape entrypoint + launching download/parse/save pipelines.
#   NO result.get() anywhere. Scraping is done inline.
# ============================================================

from celery_app import celery_app
from logbus.publisher import log

import os
import time
import redis
from urllib.parse import urlparse

from scraper.logger_decorators import logcall
from scraper.abort import abort_requested
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.site_resolver import SiteResolver
from db.repository import fetch_book, set_chapters_total
from scraper.download_controller import DownloadController

print(">>> [IMPORT] controller_tasks.py loaded")


# =============================================================
# 1) PUBLIC ENTRYPOINT — CALLED FROM /start
# =============================================================
@celery_app.task(
    bind=True,
    queue="controller",
    ignore_result=False,
    name="scraper.tasks.controller_tasks.start_full_scrape",
)
@logcall
def start_full_scrape(self, book_idx: str):
    """
    FULL SCRAPE ENTRYPOINT.
    Scraping is done inline; no Celery .get() needed.
    """
    log(f"[CTRL] start_full_scrape(book_idx={book_idx})")

    # Abort before doing anything
    if abort_requested(book_idx):
        log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
        return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}

    # --------------------------------------------------------
    # 1) Load book metadata from SQLite
    # --------------------------------------------------------
    book = fetch_book(book_idx)
    if not book:
        msg = f"[CTRL] Book '{book_idx}' not found in DB"
        log(msg)
        raise ValueError(msg)

    url = book.get("book_url")
    if not url:
        msg = f"[CTRL] No book_url stored for {book_idx}"
        log(msg)
        raise ValueError(msg)

    # --------------------------------------------------------
    # 2) INLINE SCRAPE (fast, no Celery wait)
    # --------------------------------------------------------
    site = SiteResolver.resolve(url)
    try:
        scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
        log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
    except Exception as e:
        log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
        raise

    # --------------------------------------------------------
    # 3) Continue → dispatch pipelines
    # --------------------------------------------------------
    return launch_downloads(book_idx, scrape_result)


# =============================================================
# 2) PIPELINE DISPATCH (NOT a Celery task)
# =============================================================
@logcall
def launch_downloads(book_idx: str, scrape_result: dict):
    """
    Launches the entire processing pipeline:
      - initialize Redis UI state
      - initialize SQLite totals
      - dispatch per-chapter pipelines via DownloadController
    """
    title = scrape_result.get("title", "UnknownBook")
    chapters = scrape_result.get("chapters", []) or []
    total = len(chapters)

    # ------------------------------------------------------------
    # INIT REDIS STATE
    # ------------------------------------------------------------
    broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
    parsed = urlparse(broker_url)
    r = redis.Redis(
        host=parsed.hostname,
        port=parsed.port,
        db=int(parsed.path.strip("/")),
        decode_responses=True,
    )

    base = f"book:{book_idx}:state"
    r.hset(base, "title", title)
    r.hset(base, "status", "starting")
    r.hset(base, "chapters_total", total)
    r.hset(base, "chapters_download_done", 0)
    r.hset(base, "chapters_download_skipped", 0)
    r.hset(base, "chapters_parsed_done", 0)
    r.hset(base, "audio_done", 0)
    r.hset(base, "audio_skipped", 0)
    r.hset(base, "last_update", int(time.time()))

    # ------------------------------------------------------------
    # INIT SQLITE SNAPSHOT
    # ------------------------------------------------------------
    try:
        set_chapters_total(book_idx, total)
    except Exception as e:
        log(f"[CTRL] ERROR updating SQLite totals: {e}")
        raise

    log(f"[CTRL] Initialized totals for {book_idx}: {total}")

    # ------------------------------------------------------------
    # ABORT CHECK BEFORE LAUNCHING JOBS
    # ------------------------------------------------------------
    if abort_requested(book_idx):
        log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
        r.hset(base, "status", "aborted")
        return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}

    # ------------------------------------------------------------
    # BUILD + DISPATCH PER-CHAPTER PIPELINES
    # ------------------------------------------------------------
    controller = DownloadController(book_idx, scrape_result)

    try:
        group_result = controller.start()
        gid = getattr(group_result, "id", None)
        log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
    except Exception as e:
        log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}")
        raise

    # Update UI state to "downloading"
    r.hset(base, "status", "downloading")
    r.hset(base, "last_update", int(time.time()))

    return {
        "book_idx": book_idx,
        "total": total,
        "started": True,
        "group_id": gid,
    }
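
A minimal sketch of how the /start route can dispatch this entrypoint by its registered name (function name and return shape are assumptions, the send_task pattern mirrors m4b_tasks below):

from celery_app import celery_app

def start_book(book_idx: str) -> str:
    # Dispatch by task name so the web container needs no worker imports.
    async_result = celery_app.send_task(
        "scraper.tasks.controller_tasks.start_full_scrape",
        args=[book_idx],
        queue="controller",
    )
    return async_result.id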

@ -1,20 +1,24 @@
# ============================================================
# File: scraper/tasks/download_tasks.py
# Purpose:
#   Download chapter HTML into payload["html"].
#   Updated for book_idx unified ID model.
# ============================================================

from celery_app import celery_app
from scraper.utils.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started

# Unified repository façade
from db.repository import (
    set_status,
    inc_download_done,
    inc_download_skipped,
)

from logbus.publisher import log
from scraper.ui_log import push_ui
from scraper.logger_decorators import logcall

import requests
import redis

@ -29,9 +33,9 @@ print(">>> [IMPORT] download_tasks.py loaded")

# -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER
# -----------------------------------------------------------
def log_msg(book_idx: str, message: str):
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full = f"{ts} [{book_idx}] {message}"
    log(full)
    push_ui(full)

@ -84,46 +88,63 @@ def release_global_slot():

# ============================================================
# CELERY TASK — Payload v3 (book_idx model)
# ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False)
@logcall
def download_chapter(self, payload: dict):
    """
    Payload format:
    {
        "book_idx": str,
        "chapter": {
            "num": int,
            "url": str,
            "title": str,
            "volume_path": str
        },
        "book_meta": dict,

        # fields filled during pipeline:
        "html": None | str,
        "parsed": None | str,
        "skipped": bool,
        "path": None | str
    }
    """
    if not payload:
        raise ValueError("download_chapter received empty payload")

    book_idx = payload["book_idx"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    chapter_num = chapter["num"]
    chapter_url = chapter["url"]
    chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
    volume_path = chapter["volume_path"]

    # -----------------------------------------------------------
    # STATUS UPDATE (book is now in 'downloading')
    # -----------------------------------------------------------
    set_status(book_idx, "downloading")

    # -----------------------------------------------------------
    # ABORT CHECK (skip if not yet started)
    # -----------------------------------------------------------
    if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
        log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}")
        inc_download_skipped(book_idx)

        payload["html"] = None
        payload["skipped"] = True
        payload["path"] = None
        return payload

    mark_chapter_started(book_idx, chapter_num)

    # -----------------------------------------------------------
    # SKIP IF FILE ALREADY EXISTS
    # -----------------------------------------------------------

@ -131,34 +152,28 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):

    save_path = get_save_path(chapter_num, volume_path)
    if os.path.exists(save_path):
        log_msg(book_idx, f"[DL] SKIP {chapter_num} → {save_path}")
        inc_download_skipped(book_idx)

        payload["html"] = None
        payload["skipped"] = True
        payload["path"] = save_path
        return payload

    # -----------------------------------------------------------
    # GLOBAL DELAY + CONCURRENCY
    # -----------------------------------------------------------
    if GLOBAL_DELAY > 0:
        time.sleep(GLOBAL_DELAY)

    wait_for_global_delay()
    acquire_global_slot(MAX_CONCURRENCY)

    # -----------------------------------------------------------
    # HTTP DOWNLOAD
    # -----------------------------------------------------------
    try:
        log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})")

        resp = requests.get(
            chapter_url,

@ -170,41 +185,28 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):

        resp.encoding = resp.apparent_encoding or "gb2312"
        html = resp.text

        log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes")

        payload["html"] = html
        payload["skipped"] = False
        payload["path"] = save_path
        return payload

    except Exception as exc:
        attempt = self.request.retries
        delay = BASE_DELAY * (BACKOFF**attempt)

        # Handle 429
        if getattr(getattr(exc, "response", None), "status_code", None) == 429:
            log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s")
            time.sleep(DELAY_429)
            set_global_delay()
            raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)

        # General retry with backoff
        log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s")
        raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)

    finally:
        set_global_delay()
        release_global_slot()
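
For clarity, a hypothetical payload as it leaves download_chapter after a successful fetch (all values invented, shape from the docstring above):

payload = {
    "book_idx": "piaotia_0_123",                 # hypothetical ID
    "chapter": {
        "num": 12,
        "url": "https://example.com/12.html",
        "title": "Chapter 12",
        "volume_path": "output/MyBook/Volume_001",
    },
    "book_meta": {"title": "MyBook", "author": "Unknown"},
    "html": "<html>...</html>",   # filled by download_chapter
    "parsed": None,               # filled later by parse_chapter
    "skipped": False,
    "path": "output/MyBook/Volume_001/0012.txt",
}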

@ -0,0 +1,132 @@
# ============================================================
# File: scraper/tasks/m4b_tasks.py
# ============================================================

import os
import subprocess
from typing import List

from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import fetch_book, store_m4b_error
from scraper.scriptgen import build_merge_block


# ------------------------------------------------------------
# Helper: detect volumes (UNCHANGED)
# ------------------------------------------------------------
def detect_volumes(book_base: str) -> List[str]:
    volumes = []
    for name in os.listdir(book_base):
        if name.lower().startswith("volume_"):
            full = os.path.join(book_base, name)
            if os.path.isdir(full):
                volumes.append(name)
    volumes.sort()
    return volumes


# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="m4b", ignore_result=True)
@logcall
def run_m4btool(self, book_idx: str):
    log(f"[M4B] START book_idx={book_idx}")

    book = fetch_book(book_idx)
    if not book:
        log(f"[M4B] Book not found in SQL: book_idx={book_idx}")
        return

    title = book.get("title", book_idx)
    author = book.get("author", "Unknown")

    output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(output_root, title)
    log(f"[M4B] Book base directory: {book_base}")

    if not os.path.isdir(book_base):
        log(f"[M4B] Book directory missing: {book_base}")
        return

    volumes = detect_volumes(book_base)
    if not volumes:
        log(f"[M4B] No volumes found for book_idx={book_idx}")
        return

    log(f"[M4B] Volumes detected: {volumes}")

    # --------------------------------------------------------
    # Build canonical commands via scriptgen
    # --------------------------------------------------------
    merge_block = build_merge_block(
        title, author, [(i + 1, v) for i, v in enumerate(volumes)]
    )
    commands = [c.strip() for c in merge_block.split("&&") if c.strip()]

    for volume, cmd in zip(volumes, commands):
        audio_dir = os.path.join(book_base, volume, "Audio")
        if not os.path.isdir(audio_dir):
            log(f"[M4B] SKIP {volume}: no Audio directory")
            continue

        log(f"[M4B] Running for volume={volume}")
        log(f"[M4B] CMD: {cmd}")

        try:
            result = subprocess.run(
                cmd,
                cwd=book_base,
                shell=True,
                capture_output=True,
                text=True,
                check=True,
            )
            if result.stdout:
                log(f"[M4B][STDOUT] {result.stdout}")

        except subprocess.CalledProcessError as exc:
            log(f"[M4B][FAILED] volume={volume}")
            if exc.stdout:
                log(f"[M4B][STDOUT] {exc.stdout}")
            if exc.stderr:
                log(f"[M4B][STDERR] {exc.stderr}")
            store_m4b_error(
                book_idx=book_idx,
                volume=volume,
                error_text=exc.stderr or str(exc),
            )
            continue

        except Exception as exc:
            log(f"[M4B][UNEXPECTED ERROR] volume={volume}: {exc}")
            store_m4b_error(
                book_idx=book_idx,
                volume=volume,
                error_text=str(exc),
            )
            continue

    log(f"[M4B] FINISHED book_idx={book_idx}")


# ------------------------------------------------------------
# Orchestration helper (UNCHANGED)
# ------------------------------------------------------------
@logcall
def queue_m4b_for_book(book_idx: str):
    log(f"[M4B] Queuing m4b-tool for book_idx={book_idx}")
    celery_app.send_task(
        "scraper.tasks.m4b_tasks.run_m4btool",
        args=[book_idx],
        queue="m4b",
    )
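
The split on "&&" assumes a scriptgen contract of one m4b-tool merge invocation per volume, in volume order, joined with "&&". A hypothetical shape (the exact flags build_merge_block emits are not shown in this diff):

merge_block = (
    'm4b-tool merge "Volume_001/Audio" --output-file="MyBook_1.m4b" '
    '--name="MyBook 1" --artist="Unknown"'
    " && "
    'm4b-tool merge "Volume_002/Audio" --output-file="MyBook_2.m4b" '
    '--name="MyBook 2" --artist="Unknown"'
)
commands = [c.strip() for c in merge_block.split("&&") if c.strip()]
assert len(commands) == 2   # one command per volume, same order as `volumes`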

@ -1,49 +1,121 @@
# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#   Enhanced Piaotia extractor + selector fallback + clean pipeline.
#   Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================

from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment
from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done

print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")


# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
    h1 = soup.find("h1")
    if not h1:
        return None

    # Find first table after <h1>
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break

    if not table:
        return None

    parts = []
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = None
        if hasattr(sib, "get_text"):
            text = sib.get_text(strip=True)

        # Stop conditions
        if isinstance(sib, Comment) and ("翻页" in sib):
            break
        if name == "div":
            sid = sib.get("id", "")
            cls = sib.get("class", [])
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break
        if text and ("重要声明" in text or "Copyright" in text):
            break
        if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
            break
        if name in ("script", "style"):
            continue
        if name == "center":
            continue

        # Accumulate
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()


# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):

    if not payload:
        return {"skipped": True, "reason": "empty_payload"}

    # NEW MODEL
    book_idx = payload["book_idx"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")

    # ------------------------------------------------------------
    # DOWNLOAD SKIPPED → PARSE SKIP
    # ------------------------------------------------------------
    if payload.get("skipped"):
        log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload

    if not html:
        log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload

    log_msg(book_idx, f"[PARSE] Parsing chapter {num}")

    soup = BeautifulSoup(html, "lxml")

    # ------------------------------------------------------------
    # STRICT SELECTORS
    # ------------------------------------------------------------
    selectors = [
        "#content",

@ -64,46 +136,22 @@ def parse_chapter(self, download_result: dict):

            node = tmp
            break

    raw = None

    # strict selectors failed → piaotia extractor
    if node is None:
        raw = extract_piaotia_content(soup)
    else:
        raw = node.get_text(separator="\n")

    # FINAL FALLBACK
    if raw is None:
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        raw = soup.get_text(separator="\n")

    # ------------------------------------------------------------
    # MULTIPASS CLEANING VIA replacement-block files
    # ------------------------------------------------------------
    REPL = load_all_replacements()

@ -112,29 +160,30 @@ def parse_chapter(self, download_result: dict):

    text = clean_text(text, REPL)

    # ------------------------------------------------------------
    # Collapse double blank lines
    # ------------------------------------------------------------
    cleaned = []
    prev_blank = False
    for line in text.split("\n"):
        s = line.rstrip()
        if s == "":
            if prev_blank:
                continue
            prev_blank = True
            cleaned.append("")
        else:
            prev_blank = False
            cleaned.append(s)

    text = "\n".join(cleaned)
    text = f"{title}\n{text}"

    # ------------------------------------------------------------
    # Header on chapter 1
    # ------------------------------------------------------------
    if num == 1:
        book_url = book_meta.get("book_url") or "UNKNOWN"
        header = (
            f"{book_meta.get('title','')}\n"
            f"Author: {book_meta.get('author','')}\n"

@ -143,13 +192,14 @@ def parse_chapter(self, download_result: dict):

        )
        text = header + text

    log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars")

    # ------------------------------------------------------------
    # OUTPUT PAYLOAD
    # ------------------------------------------------------------
    payload["parsed"] = text
    payload["skipped"] = False
    inc_parsed_done(book_idx)

    return payload
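
A toy check of the extractor's walk (assumes bs4 with lxml installed; the HTML is invented but mirrors the Piaotia layout of h1 → nav table → body text → footer div):

from bs4 import BeautifulSoup
from scraper.tasks.parse_tasks import extract_piaotia_content

html = (
    "<h1>第一章</h1>"
    "<table><tr><td>nav</td></tr></table>"
    "正文第一段<br/>正文第二段"
    '<div id="feit2">footer</div>'
)
soup = BeautifulSoup(html, "lxml")
print(extract_piaotia_content(soup))
# 正文第一段
# 正文第二段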

@ -1,16 +1,16 @@
# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose:
#   Build Celery chains for chapter processing using payload dict.
#
# Pipeline v3:
#   download_chapter(payload)
#     → parse_chapter(payload)
#       → save_chapter(payload)
#
# NOTE:
#   - book_idx is the single authoritative key for all tasks
#   - payload travels unchanged through the entire pipeline
# =========================================================

from celery import chain

@ -18,26 +18,33 @@ from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
from scraper.logger_decorators import logcall


@logcall
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
    """
    Create a payload object passed through the pipeline.
    Consistent with the chapter_dict-based task signature.
    """
    payload = {
        "book_idx": book_idx,
        "chapter": chapter_dict,
        "book_meta": book_meta,
        # Will be filled by download_chapter
        "html": None,
        # Will be filled by parse_chapter
        "parsed": None,
        # Set by download or parse on skip/404/etc
        "skipped": False,
        # Final path written by save_chapter
        "path": None,
    }

    return chain(
        download_chapter.s(payload),
        parse_chapter.s(),
        save_chapter.s(),
    )
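
A minimal sketch of the dispatch side. DownloadController is not shown in this diff; this assumes .start() builds one chain per chapter and returns the GroupResult whose .id the controller logs (attribute names on self are hypothetical):

from celery import group

def start(self):
    chains = [
        build_chapter_pipeline(self.book_idx, chapter, self.book_meta)
        for chapter in self.chapters
    ]
    # One chain per chapter; each task routes to its own queue.
    return group(chains).apply_async()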

@ -1,57 +0,0 @@
# ============================================================
# File: scraper/tasks/progress_tasks.py
# Purpose: Central progress updater for chapter pipelines.
#   Updated for chapter_dict pipeline model.
# ============================================================

from celery_app import celery_app
from scraper.progress import inc_completed, inc_skipped, inc_failed
from logbus.publisher import log

print(">>> [IMPORT] progress_tasks.py loaded")


@celery_app.task(bind=False, name="progress.update", queue="controller")
def update_progress(result: dict, book_id: str):
    """
    Central progress logic:
      - result: output of save_chapter
      - book_id: explicitly passed by pipeline

    IMPORTANT:
      - save_chapter already updates counters for skipped & normal chapters
      - progress.update MUST NOT double-increment
    """
    ch = result.get("chapter") or {}
    chapter_num = ch.get("num")

    skipped = result.get("skipped", False)
    failed = result.get("failed", False)

    # ------------------------------------------------------------
    # FAILED CASE
    # ------------------------------------------------------------
    if failed:
        inc_failed(book_id)
        log(f"[PROG] FAILED chapter {chapter_num}")
        return result

    # ------------------------------------------------------------
    # SKIPPED CASE
    # ------------------------------------------------------------
    if skipped:
        # save_chapter already did:
        #   inc_skipped(book_id)
        log(f"[PROG] SKIPPED chapter {chapter_num}")
        return result

    # ------------------------------------------------------------
    # NORMAL COMPLETION
    # ------------------------------------------------------------
    # save_chapter did NOT increment completed for skipped cases
    # but DID inc_completed(book_id) for normal cases.
    # update_progress should NOT double increment, so only log here.
    log(f"[PROG] DONE chapter {chapter_num}")
    return result

@ -1,7 +1,5 @@
# ============================================================
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx)
# ============================================================

print(">>> [IMPORT] save_tasks.py loaded")

@ -9,130 +7,78 @@ print(">>> [IMPORT] save_tasks.py loaded")

from celery import shared_task
import os

from logbus.publisher import log
from scraper.logger_decorators import logcall
from scraper.utils.utils import get_save_path
from scraper.tasks.download_tasks import log_msg
from scraper.tasks.audio_tasks import generate_audio
from db.repository import inc_download_done, inc_download_skipped


@shared_task(bind=True, queue="save", ignore_result=False)
@logcall
def save_chapter(self, payload: dict):

    if not payload:
        log("[SAVE] ERROR: payload is None")
        return {"error": True}

    # NEW unified ID
    book_idx = payload["book_idx"]

    chapter = payload["chapter"]
    parsed = payload.get("parsed")
    path = payload.get("path")
    skipped = payload.get("skipped")

    num = chapter["num"]
    title = chapter.get("title") or f"Chapter {num}"
    volume = chapter.get("volume_path")
    volume_name = os.path.basename(volume.rstrip("/"))

    # ============================================================
    # SKIPPED CASE (old behavior restored)
    # ============================================================
    if skipped or not parsed:
        log_msg(book_idx, f"[SAVE] SKIP chapter {num}")
        inc_download_skipped(book_idx)

        # OLD behavior: even skipped chapters still queue audio,
        # but only if a valid file exists on disk
        if path and os.path.exists(path):
            log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
            try:
                generate_audio.delay(book_idx, volume_name, num, title, path)
            except Exception as exc:
                log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}")

        return payload

    # ============================================================
    # NORMAL SAVE CASE
    # ============================================================
    try:
        os.makedirs(volume, exist_ok=True)
        save_path = get_save_path(num, volume)

        with open(save_path, "w", encoding="utf-8") as f:
            f.write(parsed)

        log_msg(book_idx, f"[SAVE] Saved chapter {num} → {save_path}")

        inc_download_done(book_idx)

        # OLD behavior: ALWAYS queue audio
        try:
            generate_audio.delay(book_idx, volume_name, num, title, save_path)
            log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}")
        except Exception as exc:
            log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}")

        payload["path"] = save_path
        payload["skipped"] = False
        return payload

    except Exception as exc:
        log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}")
        raise
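
get_save_path itself is outside this diff; both the saver and the audio task appear to assume a zero-padded per-chapter .txt inside the volume directory, roughly:

import os

def get_save_path(chapter_num: int, volume_path: str) -> str:
    # Assumed contract: output/<Book>/<Volume_XXX>/<0000-padded num>.txt
    # (the audio task names its output f"{chapter_number:04d}.m4b" to match)
    return os.path.join(volume_path, f"{chapter_num:04d}.txt")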

@ -1,7 +1,9 @@
# ============================================================
# File: scraper/tasks/scraping.py
# Purpose:
#   Scrape ONLY metadata + chapter list.
#   Does NOT launch download controller anymore.
#   Controller decides when pipelines start.
# ============================================================

from celery_app import celery_app

@ -9,88 +11,91 @@ from logbus.publisher import log

import os
import redis

from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort
from scraper.ui_log import reset_ui_logs
from scraper.services.init_service import InitService

print(">>> [IMPORT] scraping.py loaded")

# Redis connection (same DB as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)


@celery_app.task(
    bind=True,
    queue="scraping",
    ignore_result=False,
    name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str):
    """
    Scrapes metadata + chapters.
    DOES NOT START download / pipeline controller.
    The controller_tasks.start_full_scrape() task will call this one.
    """
    # ------------------------------------------------------------
    # CLEAR UI LOG BUFFER
    # ------------------------------------------------------------
    reset_ui_logs()

    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # SCRAPE (old engine)
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # → { title, author, chapters, cover_url, ... }

    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ------------------------------------------------------------
    # Compute unified book_idx
    # ------------------------------------------------------------
    book_idx = InitService.derive_book_id(url)
    result["book_idx"] = book_idx
    log(f"[SCRAPING] Assigned book_idx = {book_idx}")

    # ------------------------------------------------------------
    # DRY RUN TEST LIMIT
    # ------------------------------------------------------------
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))

    if DRY_RUN:
        log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}")
        result["chapters"] = chapters[:TEST_LIMIT]

    # ------------------------------------------------------------
    # LOG RESULTS
    # ------------------------------------------------------------
    log(
        f"[SCRAPING] Completed scrape: "
        f"{len(result['chapters'])}/{full_count} chapters"
    )

    # ------------------------------------------------------------
    # RESET ABORT + INITIALIZE LEGACY PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_idx)

    r.set(f"progress:{book_idx}:total", len(result["chapters"]))
    r.set(f"progress:{book_idx}:done", 0)

    r.delete(f"logs:{book_idx}")
    r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}")
    r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")

    # ------------------------------------------------------------
    # IMPORTANT: DO NOT DISPATCH any pipelines here.
    # Controller will receive scrape_result and continue.
    # ------------------------------------------------------------
    return result
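
A hypothetical smoke test from a Python shell in the web container (the URL is invented; calling .get() from a client is fine, the no-.get() rule only applies inside tasks):

from celery_app import celery_app

res = celery_app.send_task(
    "scraper.tasks.scraping.start_scrape_book",
    args=["https://www.piaotia.com/bookinfo/0/123.html"],  # hypothetical URL
    queue="scraping",
)
print(res.get(timeout=120)["book_idx"])   # scrape result now carries book_idx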

@ -0,0 +1,149 @@
# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
#   Final status check after audio completion.
#
# Responsibilities:
#   - Verify Redis counters (sanity check)
#   - Verify filesystem (Audio files present)
#   - Queue m4btool task
#
# Design rules:
#   - Book-scope ONLY
#   - No direct Redis usage
#   - Repository is the single source of truth
#   - Idempotent, defensive, non-blocking
# ============================================================

import os

from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall

from db.repository import (
    get_audio_done,
    get_chapters_total,
    set_status,
    fetch_book,
)

from scraper.tasks.m4b_tasks import run_m4btool


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
@logcall
def _detect_volumes(book_base: str):
    """
    Return sorted list of Volume_XXX directories.
    """
    vols = []
    for name in os.listdir(book_base):
        if name.lower().startswith("volume_"):
            full = os.path.join(book_base, name)
            if os.path.isdir(full):
                vols.append(name)
    vols.sort()
    return vols


@logcall
def _count_audio_files(audio_dir: str) -> int:
    """
    Count .m4b files in an Audio directory.
    """
    if not os.path.isdir(audio_dir):
        return 0
    return len([f for f in os.listdir(audio_dir) if f.lower().endswith(".m4b")])


# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
    """
    Final statuscheck before m4btool execution.
    Triggered exactly once by audio_completion quickcheck.
    """
    log(f"[STATUSCHECK] START book={book_idx}")

    # --------------------------------------------------------
    # 1. Redis sanity check (via repository)
    # --------------------------------------------------------
    audio_done = get_audio_done(book_idx)
    chapters_total = get_chapters_total(book_idx)

    log(
        f"[STATUSCHECK] Counters book={book_idx} "
        f"audio_done={audio_done} chapters_total={chapters_total}"
    )

    if chapters_total <= 0:
        log("[STATUSCHECK] No chapters_total → abort")
        return

    if audio_done < chapters_total:
        # Defensive: should not happen, but never assume
        log(
            f"[STATUSCHECK] Audio not complete yet "
            f"({audio_done}/{chapters_total}) → abort"
        )
        return

    # --------------------------------------------------------
    # 2. Fetch book metadata (for paths & m4b meta)
    # --------------------------------------------------------
    book = fetch_book(book_idx)
    if not book:
        log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
        return

    title = book.get("title") or book_idx
    author = book.get("author") or "Unknown"

    # Base output directory
    root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(root, title)

    if not os.path.isdir(book_base):
        log(f"[STATUSCHECK] Book directory missing: {book_base}")
        return

    # --------------------------------------------------------
    # 3. Filesystem validation (light, non-blocking)
    # --------------------------------------------------------
    volumes = _detect_volumes(book_base)
    if not volumes:
        log(f"[STATUSCHECK] No volumes found for {book_idx}")
        # Still allow m4btool to decide (it will no-op)
    else:
        for vol in volumes:
            audio_dir = os.path.join(book_base, vol, "Audio")
            count = _count_audio_files(audio_dir)
            log(f"[STATUSCHECK] {vol}: {count} audio files detected")

    # --------------------------------------------------------
    # 4. Queue m4btool (final pipeline step)
    # --------------------------------------------------------
    log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")
    set_status(book_idx, "m4b_running")

    # run_m4btool takes only book_idx and derives book_base and metadata
    # itself, so pass just the ID.
    run_m4btool.delay(book_idx)

    log(f"[STATUSCHECK] DONE book={book_idx}")

@ -0,0 +1,38 @@
#!/bin/bash
# Batch TTS: convert every <volume>/*.txt into <volume>/Audio/<name>.m4b
# (bash required: uses shopt; nullglob avoids iterating a literal "*.txt"
# when a volume contains no text files)

main_dir="$( cd "$( dirname "$0" )" && pwd )"
shopt -s nocasematch nullglob

for subfolder in "$main_dir"/*; do
    if [ -d "$subfolder" ]; then
        audiofolder="$subfolder/Audio"
        mkdir -p "$audiofolder"

        for entry in "$subfolder"/*.txt; do
            fn=$(basename "$entry")
            echo "$fn"
            inputfile="$subfolder/$fn"
            outputfile="$audiofolder/${fn%.*}.m4b"

            now=$(date +"%T")
            echo "Current time : $now"
            echo "$inputfile ->"
            echo "$outputfile"

            if [ -f "$outputfile" ]; then
                echo "$outputfile exists: skipping"
            else
                say --voice=Sinji \
                    --output-file="$outputfile" \
                    --input-file="$inputfile" \
                    --file-format=m4bf \
                    --quality=127 \
                    -r 200 \
                    --data-format=aac
            fi
        done
    fi
done

@ -44,3 +44,31 @@ def reset_ui_logs():
    - Auto-clear when new book scraping starts
    """
    r.delete(UI_LOG_KEY)


# ============================================================
# Delta-based log retrieval using Redis indexes
# ============================================================
def get_ui_logs_delta(last_index: int):
    """
    Returns (new_lines, total_count).
    Only returns log lines AFTER last_index.

    Example:
        last_index = 10 → returns logs with Redis indexes 11..end
    """
    total = r.llen(UI_LOG_KEY)

    if total == 0:
        return [], 0

    # First load OR index invalid → send entire buffer
    if last_index < 0 or last_index >= total:
        logs = r.lrange(UI_LOG_KEY, 0, -1)
        return logs, total

    # Only new logs
    new_lines = r.lrange(UI_LOG_KEY, last_index + 1, -1)
    return new_lines, total
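
A minimal polling loop a dashboard endpoint could build on this (a sketch; the sleep interval is arbitrary):

import time
from scraper.ui_log import get_ui_logs_delta

last_index = -1
while True:
    new_lines, total = get_ui_logs_delta(last_index)
    for line in new_lines:
        print(line)
    if total:
        last_index = total - 1   # highest Redis index seen so far
    time.sleep(1.0)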

@ -0,0 +1,272 @@
# ============================================================
# File: scraper/utils/state_sync.py
# Purpose:
# State inspection + optional sync logic for unified book_idx model.
# Generates full book-card compatible dicts for debug UI.
# ============================================================
import os
import redis
from db.db import get_db
def _build_card(sqlite_row, redis_state, merged):
"""
Creates a dict that matches the fields required by components/bookcard.html:
b.book_idx
b.title
b.author
b.cover_path
b.status
b.created_at
b.download_done
b.download_total
b.audio_done
b.audio_total
"""
return {
"book_idx": sqlite_row.get("book_idx"),
"title": sqlite_row.get("title") or "Unknown",
"author": sqlite_row.get("author"),
"cover_path": sqlite_row.get("cover_path"),
# Use merged status (Redis > SQLite)
"status": merged.get("status") or sqlite_row.get("status") or "unknown",
# Meta
"created_at": sqlite_row.get("created_at"),
# Download counters
"download_done": merged.get("downloaded", 0),
"download_total": merged.get("chapters_total", 0),
# Audio counters
"audio_done": merged.get("audio_done", 0),
"audio_total": merged.get("chapters_total", 0),
}
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state_depecrated():
"""
Reads all books from SQLite and fetches Redis progress.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
sqlite_row = dict(row)
book_idx = sqlite_row["book_idx"]
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key) or {}
# ================================
# DRY-RUN MERGE LOGIC
# ================================
merged = sqlite_row.copy()
if redis_state:
merged["downloaded"] = int(
redis_state.get("chapters_download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(
redis_state.get("chapters_parsed_done", merged.get("parsed", 0))
)
merged["audio_done"] = int(
redis_state.get("audio_done", merged.get("audio_done", 0))
)
merged["chapters_total"] = int(
redis_state.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = redis_state.get(
"status", merged.get("status", "unknown")
)
# ================================
# Build book-card data
# ================================
card = _build_card(sqlite_row, redis_state, merged)
# ================================
# Append final result entry
# ================================
results.append(
{
"book_idx": book_idx,
"title": sqlite_row.get("title"),
"sqlite": sqlite_row,
"redis": redis_state,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state():
"""
Reads canonical book state from repository.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
from db.repository import get_book_state
from db.db import get_db
db = get_db()
cur = db.cursor()
# Alleen nodig om te weten *welke* books er zijn
cur.execute("SELECT book_idx FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_idx = row["book_idx"]
# --------------------------------
# Canonical state (ENIGE waarheid)
# --------------------------------
state = get_book_state(book_idx)
# SQLite-view = alleen SQLite-kolommen
sqlite_view = {
k: v
for k, v in state.items()
if k
in (
"book_idx",
"title",
"author",
"description",
"cover_path",
"book_url",
"chapters_total",
"status",
"downloaded",
"parsed",
"audio_done",
"created_at",
"processdate",
"last_update",
)
}
# Redis-view = alleen Redis counters/status
redis_view = {
k: v
for k, v in state.items()
if k.startswith("chapters_")
or k in ("status", "audio_done", "audio_skipped")
}
merged = state # letterlijk de canonieke state
card = _build_card(sqlite_view, redis_view, merged)
results.append(
{
"book_idx": book_idx,
"title": state.get("title"),
"sqlite": sqlite_view,
"redis": redis_view,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# SYNC REDIS → SQLITE (writes)
# ============================================================
def sync_books_from_redis():
"""
Writes Redis progress values back into SQLite.
Uses unified book_idx as identifier.
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
before = dict(row)
book_idx = before["book_idx"]
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key)
if not redis_state:
results.append(
{
"book_idx": book_idx,
"before": before,
"redis": {},
"after": before,
}
)
continue
# Extract progress from Redis
downloaded = int(redis_state.get("chapters_download_done", 0))
parsed = int(redis_state.get("chapters_parsed_done", 0))
audio_done = int(redis_state.get("audio_done", 0))
total = int(redis_state.get("chapters_total", 0))
status = redis_state.get("status", before.get("status"))
# Update SQLite
cur.execute(
"""
UPDATE books
SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
WHERE book_idx = ?
""",
(downloaded, parsed, audio_done, total, status, book_idx),
)
db.commit()
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
after = dict(cur.fetchone())
results.append(
{
"book_idx": book_idx,
"before": before,
"redis": redis_state,
"after": after,
}
)
return results
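The nav template later in this diff links to `/debug/sync_state`, but the handler itself is not shown. A plausible wiring, assuming the Flask `app` object and the `jsonify` import already present in app.py (route shape is a guess):

```python
# Hypothetical route — the actual handler is outside this diff.
@app.route("/debug/sync_state")
def debug_sync_state():
    # Returns before/redis/after per book as JSON, matching sync_books_from_redis()
    return jsonify(sync_books_from_redis())
```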

@ -97,6 +97,7 @@ def clean_text(raw: str, repl: dict) -> str:
    # Apply loaded replacements
    for key, val in repl.items():
        # print(f"Replacing: {key} → {val}")
        txt = txt.replace(key, val)
    # Collapse 3+ blank lines → max 1
@ -0,0 +1,310 @@
/* =======================================================================
File: static/css/bookcard.css
Purpose:
Styling for registered book cards:
- status colors
- badges
- start/abort/statuscheck
- progress bars
======================================================================= */
/* -----------------------------------------------------------------------
GRID WRAPPER
----------------------------------------------------------------------- */
.registered-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(340px, 1fr));
gap: 20px;
margin-top: 15px;
}
/* -----------------------------------------------------------------------
BOOK CARD BASE
----------------------------------------------------------------------- */
.book-card {
position: relative;
display: grid;
grid-template-columns: 90px auto;
gap: 15px;
padding: 15px;
background: #fff;
border-radius: 10px;
border: 1px solid #e5e5e5;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
transition: border-color 0.25s ease, box-shadow 0.25s ease;
}
/* -----------------------------------------------------------------------
STATUS COLORS (BOOK CARD BORDER)
----------------------------------------------------------------------- */
/* Downloading / actively in progress */
.book-card.downloading {
border-color: #ff9500;
box-shadow: 0 0 6px rgba(255, 149, 0, 0.35);
}
/* Audio phase */
.book-card.audio {
border-color: #ffca28;
box-shadow: 0 0 6px rgba(255, 202, 40, 0.35);
}
/* Fully finished */
.book-card.done {
border: 2px solid #4caf50;
box-shadow: 0 0 6px rgba(76, 175, 80, 0.35);
}
/* Aborted */
.book-card.aborted {
border-color: #ff3b30;
box-shadow: 0 0 6px rgba(255, 59, 48, 0.35);
}
/* -----------------------------------------------------------------------
COVER
----------------------------------------------------------------------- */
.book-cover {
width: 90px;
}
.book-img {
width: 90px;
height: 130px;
object-fit: cover;
border-radius: 4px;
background: #f4f4f4;
}
.placeholder {
display: flex;
justify-content: center;
align-items: center;
color: #777;
font-size: 12px;
}
/* -----------------------------------------------------------------------
META
----------------------------------------------------------------------- */
.book-meta {
display: flex;
flex-direction: column;
justify-content: space-between;
}
.book-title {
font-size: 16px;
font-weight: bold;
}
.book-author {
font-size: 14px;
color: #444;
margin-bottom: 6px;
}
.book-created {
font-size: 12px;
color: #666;
}
/* -----------------------------------------------------------------------
ACTION BUTTONS
----------------------------------------------------------------------- */
.book-actions {
display: flex;
justify-content: flex-end;
gap: 10px;
margin-top: 10px;
}
.icon-btn {
width: 34px;
height: 34px;
border: none;
border-radius: 8px;
display: flex;
justify-content: center;
align-items: center;
font-size: 16px;
color: #fff;
cursor: pointer;
transition: background 0.15s ease, transform 0.1s ease;
}
/* Start */
.icon-start {
background: #2d8a3d;
}
.icon-start:hover {
background: #226c30;
transform: scale(1.05);
}
.icon-start:disabled {
background: #9bbb9f;
cursor: not-allowed;
opacity: 0.5;
}
/* Abort */
.icon-abort {
background: #c62828;
}
.icon-abort:hover {
background: #a31f1f;
transform: scale(1.05);
}
.icon-abort:disabled {
background: #d8a0a0;
cursor: not-allowed;
opacity: 0.5;
}
/* Hide */
.hide-form {
position: absolute;
top: 6px;
right: 6px;
}
.icon-hide {
background: #777;
}
.icon-hide:hover {
background: #555;
}
/* Statuscheck */
.statuscheck-btn {
background-color: #444;
color: #fff;
border: 1px solid #666;
margin-left: 4px;
padding: 4px 8px;
border-radius: 6px;
font-size: 12px;
cursor: pointer;
}
.statuscheck-btn:hover {
background-color: #333;
}
/* -----------------------------------------------------------------------
PROGRESS (FULL WIDTH)
----------------------------------------------------------------------- */
.book-progress {
grid-column: 1 / -1;
margin-top: 12px;
padding: 10px 12px;
background: #f6f6f6;
border-radius: 8px;
}
.progress-row {
margin-bottom: 4px;
}
.progress-label {
font-size: 12px;
margin-bottom: 4px;
color: #444;
}
/* BAR */
.progressbar {
position: relative;
width: 100%;
height: 14px;
background: #ddd;
border-radius: 7px;
overflow: hidden;
}
.progressbar-fill {
height: 100%;
transition: width 0.4s ease;
}
/* Download */
.progressbar-fill.download {
background: #2196f3;
}
/* Audio */
.progressbar-fill.audio {
background: #4caf50;
}
/* TEXT IN BAR */
.progressbar-text {
position: absolute;
inset: 0;
display: flex;
align-items: center;
justify-content: center;
font-size: 11px;
font-weight: 600;
color: #fff;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.6);
pointer-events: none;
}
/* -----------------------------------------------------------------------
STATUS BADGE
----------------------------------------------------------------------- */
.status-badge {
display: inline-block;
margin-bottom: 6px;
padding: 2px 8px;
font-size: 11px;
font-weight: 600;
border-radius: 10px;
text-transform: uppercase;
letter-spacing: 0.5px;
cursor: default;
}
/* DONE */
.status-badge.status-done {
background-color: #e6f4ea;
color: #2e7d32;
border: 1px solid #4caf50;
}
/* AUDIO */
.status-badge.status-audio {
background-color: #fff8e1;
color: #8d6e00;
border: 1px solid #ffca28;
}
/* DOWNLOADING */
.status-badge.status-downloading {
background-color: #e3f2fd;
color: #1565c0;
border: 1px solid #42a5f5;
}
/* Statuscheck */
.icon-statuscheck {
background: #444;
}
.icon-statuscheck:hover {
background: #333;
transform: scale(1.05);
}

@ -0,0 +1,312 @@
/* =======================================================================
File: static/css/dashboard.css
Purpose:
Clean full-width vertical dashboard layout with large log viewer.
Book-card CSS is now moved to bookcard.css
======================================================================= */
/* -----------------------------------------------------------------------
1) GENERAL PAGE LAYOUT
----------------------------------------------------------------------- */
.dashboard-container {
display: flex;
flex-direction: column;
width: 100%;
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
gap: 18px;
}
.dashboard-section {
background: #ffffff;
padding: 16px;
border-radius: 6px;
border: 1px solid #ddd;
}
.page-title {
font-size: 22px;
margin-bottom: 15px;
}
/* -----------------------------------------------------------------------
2) ACTIVE BOOK LIST (dashboard left panel)
----------------------------------------------------------------------- */
.book-list {
display: flex;
flex-direction: column;
gap: 12px;
}
.book-list-empty {
padding: 18px;
text-align: center;
color: #777;
}
.book-list-item {
padding: 12px 16px;
background: #f7f7f7;
border-radius: 6px;
border: 1px solid #ccc;
cursor: pointer;
display: flex;
flex-direction: column;
gap: 6px;
transition: background 0.2s, border-color 0.2s;
}
.book-list-item:hover,
.book-list-item.active {
background: #eaf3ff;
border-color: #1e88e5;
}
.book-title {
font-size: 16px;
font-weight: 600;
}
.book-meta {
font-size: 12px;
color: #555;
}
/* -----------------------------------------------------------------------
3) PROGRESS BOX
----------------------------------------------------------------------- */
.progress-box {
background: #fafafa;
border: 1px solid #ddd;
padding: 8px;
border-radius: 6px;
}
.progress-header h2 {
margin-bottom: 5px;
}
.progress-subtitle {
font-size: 14px;
color: #333;
font-weight: 600;
}
.progress-bookid {
font-size: 12px;
color: #777;
margin-bottom: 15px;
}
.progress-bar {
height: 14px;
background: #ddd;
border-radius: 6px;
overflow: hidden;
margin-bottom: 6px;
}
.progress-bar-fill {
height: 100%;
background: #1e88e5;
}
.progress-bar-fill.audio-fill {
background: #e65100;
}
.progress-stats {
display: flex;
justify-content: space-between;
font-size: 12px;
color: #444;
margin-top: 4px;
}
.book-abort-area {
margin-top: 10px;
text-align: right;
}
.abort-btn {
padding: 6px 12px;
border-radius: 4px;
border: 1px solid #cc0000;
background: #ff4444;
color: white;
font-size: 12px;
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.abort-btn:hover {
background: #ff2222;
border-color: #aa0000;
}
/* -----------------------------------------------------------------------
4) LOG VIEWER
----------------------------------------------------------------------- */
.log-viewer {
width: 100%;
max-width: 100%;
overflow: hidden;
}
.log-header {
display: flex;
justify-content: space-between;
align-items: center;
}
.log-filters {
display: flex;
align-items: center;
gap: 8px;
}
.log-output {
flex: 1;
width: 100%;
max-width: 100%;
min-height: 60vh;
max-height: 75vh;
overflow-y: auto;
overflow-x: hidden;
background: #000;
color: #00ff66;
border: 1px solid #0f0;
border-radius: 6px;
padding: 12px;
font-family: "SF Mono", "Consolas", "Courier New", monospace;
font-size: 13px;
line-height: 1.35;
white-space: pre-wrap;
word-break: break-word;
}
.log-line {
white-space: pre-wrap;
padding: 2px 0;
}
.log-line.default {
color: #00ff66;
}
.log-line.dl {
color: #00ccff;
}
.log-line.parse {
color: #ffaa00;
}
.log-line.save {
color: #ffdd33;
}
.log-line.audio {
color: #ff66ff;
}
.log-line.ctrl {
color: #66aaff;
}
.log-line.error {
color: #ff3333;
}
/* -----------------------------------------------------------------------
5) PLACEHOLDER / FOOTER
----------------------------------------------------------------------- */
.dashboard-placeholder {
font-size: 15px;
padding: 20px;
text-align: center;
color: #777;
}
.footer {
text-align: center;
padding: 12px;
color: #666;
margin-top: 25px;
font-size: 12px;
border-top: 1px solid #ddd;
}
/* -----------------------------
DROPDOWN NAVIGATION
------------------------------ */
/* Container for dropdown */
.nav-dropdown {
position: relative;
}
/* The clickable label ("Tools ▾") */
.nav-dropdown > .nav-item {
cursor: pointer;
}
/* Hide dropdown by default */
.dropdown-menu {
display: none;
position: absolute;
top: 100%;
right: 0;
background: #fff; /* same background as the navbar */
border: 1px solid #ddd;
padding: 8px 0;
margin: 0;
list-style: none; /* remove list bullets */
border-radius: 4px;
min-width: 160px;
z-index: 1000;
}
/* Show dropdown when hovering over parent */
.nav-dropdown:hover .dropdown-menu {
display: block;
}
/* Menu item styling */
.dropdown-menu li {
padding: 0;
margin: 0;
}
.dropdown-menu li a {
display: block;
padding: 8px 16px;
white-space: nowrap;
color: #333;
text-decoration: none;
}
/* Hover state */
.dropdown-menu li a:hover {
background: #f0f0f0;
}
table.kv {
border-collapse: collapse;
margin-bottom: 16px;
}
table.kv th {
text-align: left;
padding-right: 12px;
color: #777;
font-weight: normal;
}
table.kv td {
font-weight: 500;
}

@ -0,0 +1,160 @@
/* =======================================================================
File: static/css/style.css
Purpose:
Global base styling for all pages.
Includes typography, buttons, forms, layout primitives.
======================================================================= */
/* ------------------------------
RESET / BASE
------------------------------ */
html,
body {
margin: 0;
padding: 0;
font-family: Arial, Helvetica, sans-serif;
background: #f5f6fa;
color: #222;
}
.container {
max-width: 1100px;
margin: 0 auto;
padding: 20px;
}
h1,
h2,
h3 {
margin: 0 0 15px 0;
font-weight: 600;
}
a {
color: #1e88e5;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
/* ------------------------------
BUTTONS
------------------------------ */
.btn-primary {
background: #1e88e5;
color: #fff;
padding: 10px 18px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 15px;
transition: background 0.2s ease;
}
.btn-primary:hover {
background: #1669b9;
}
.btn-small {
padding: 5px 10px;
background: #ccc;
border-radius: 4px;
border: none;
font-size: 13px;
}
.btn-small:hover {
background: #bbb;
}
/* ------------------------------
FORM ELEMENTS
------------------------------ */
.url-form {
display: flex;
gap: 10px;
flex-direction: column;
max-width: 550px;
}
.url-label {
font-weight: 600;
}
.url-input {
padding: 10px;
font-size: 15px;
border: 1px solid #bbb;
border-radius: 4px;
}
.url-submit {
align-self: flex-start;
}
/* ------------------------------
NAVBAR
------------------------------ */
.navbar {
background: #ffffff;
border-bottom: 1px solid #ddd;
padding: 12px 20px;
}
.nav-inner {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
}
.nav-brand a {
font-size: 20px;
font-weight: bold;
color: #1e88e5;
}
.nav-links {
list-style: none;
display: flex;
gap: 25px;
margin: 0;
padding: 0;
}
.nav-item {
font-size: 15px;
color: #333;
}
.nav-item:hover {
color: #1e88e5;
}
/* ------------------------------
LANDING PAGE
------------------------------ */
.landing-container {
max-width: 600px;
margin: 40px auto;
background: #fff;
padding: 25px;
border-radius: 6px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.landing-title {
margin-bottom: 20px;
}
.landing-links {
margin-top: 20px;
}

@ -0,0 +1,33 @@
/* =======================================================================
File: static/js/app.js
Purpose:
Global utility functions shared across all scripts.
No page-specific logic here.
======================================================================= */
// Shortcuts
const $ = (sel, parent = document) => parent.querySelector(sel);
const $$ = (sel, parent = document) => parent.querySelectorAll(sel);
// Safe log
function dbg(...args) {
console.log("[APP]", ...args);
}
// AJAX helper
async function apiGet(url) {
try {
const res = await fetch(url, { cache: "no-store" });
if (!res.ok) throw new Error(`HTTP ${res.status}`);
return await res.json();
} catch (err) {
console.error("API GET Error:", url, err);
return null;
}
}
// Auto-scroll utility
function autoScroll(el) {
if (!el) return;
el.scrollTop = el.scrollHeight;
}

@ -0,0 +1,145 @@
/* ============================================================
File: static/js/bookcard_controller.js
Purpose:
Single owner for updating book-card DOM from merged state
(would_merge_to)
============================================================ */
console.log("[BOOKCARD] controller loaded");
/* ============================================================
ENTRY POINT (called by state_updater.js)
============================================================ */
function updateBookCardsFromState(stateList) {
console.log("[BOOKCARD] updateBookCardsFromState called");
if (!Array.isArray(stateList)) {
console.warn("[BOOKCARD] Invalid stateList", stateList);
return;
}
const stateById = {};
stateList.forEach((entry) => {
const merged = entry.would_merge_to;
if (!merged || merged.book_idx == null) {
console.warn("[BOOKCARD] entry without merged/book_idx", entry);
return;
}
stateById[String(merged.book_idx)] = merged;
});
document.querySelectorAll(".book-card").forEach((card) => {
const bookIdx = card.dataset.bookIdx;
const state = stateById[bookIdx];
if (!state) {
console.debug("[BOOKCARD] No state for book_idx", bookIdx);
return;
}
console.log("[BOOKCARD] Updating card", bookIdx, state.status);
updateSingleBookCard(card, state);
});
}
/* ============================================================
SINGLE CARD UPDATE
============================================================ */
function updateSingleBookCard(card, state) {
console.log("[BOOKCARD] updateSingleBookCard", state.book_idx);
updateStatus(card, state);
updateStatusBadge(card, state);
updateButtons(card, state);
updateProgress(card, state);
}
/* ============================================================
STATUS
============================================================ */
function updateStatus(card, state) {
console.log("[BOOKCARD][STATUS]", state.book_idx, "→", state.status);
card.className = `book-card ${state.status || ""}`;
}
function updateStatusBadge(card, state) {
const badge = card.querySelector(".status-badge");
if (!badge) return;
const status = (state.status || "").toLowerCase();
badge.textContent = status.toUpperCase();
badge.className = `status-badge status-${status}`;
badge.title =
{
downloading: "Bezig met downloaden",
audio: "Downloads compleet, audio wordt gegenereerd",
done: "Alle chapters en audio zijn compleet",
}[status] || "";
}
/* ============================================================
BUTTONS
============================================================ */
function updateButtons(card, state) {
const startBtn = card.querySelector(".icon-start");
const abortBtn = card.querySelector(".icon-abort");
const busy = ["starting", "downloading", "parsing", "audio"];
console.log("[BOOKCARD][BUTTONS]", state.book_idx, "status:", state.status);
if (startBtn) {
// startBtn.disabled = busy.includes(state.status);
}
if (abortBtn) {
abortBtn.disabled = !busy.includes(state.status);
}
}
/* ============================================================
PROGRESS (DOWNLOAD + AUDIO)
============================================================ */
function updateProgress(card, s) {
const total = Number(s.chapters_total || 0);
// const downloadDone =
// Number(s.chapters_download_done || 0) +
// Number(s.chapters_download_skipped || 0);
const downloadDone = Number(s.downloaded || 0);
const audioDone = Number(s.audio_done || 0) + Number(s.audio_skipped || 0);
const downloadPct =
total > 0 ? Math.min((downloadDone / total) * 100, 100) : 0;
const audioPct = total > 0 ? Math.min((audioDone / total) * 100, 100) : 0;
console.log("[BOOKCARD][PROGRESS]", s.book_idx, {
total,
downloadDone,
audioDone,
downloadPct,
audioPct,
});
/* ---- DOWNLOAD ---- */
const dlBar = card.querySelector('[data-field="download_pct"]');
const dlText = card.querySelector('[data-field="download_text"]');
if (dlBar) dlBar.style.width = `${downloadPct}%`;
if (dlText) dlText.textContent = `${downloadDone} / ${total}`;
/* ---- AUDIO ---- */
const auBar = card.querySelector('[data-field="audio_pct"]');
const auText = card.querySelector('[data-field="audio_text"]');
if (auBar) auBar.style.width = `${audioPct}%`;
if (auText) auText.textContent = `${audioDone} / ${total}`;
}

@ -0,0 +1,178 @@
/* =======================================================================
  File: static/js/dashboard.js
  Purpose:
    - Sidebar selection
    - Start / Abort actions
    - UI status updates
  NOTE:
    - NO polling here
    - state_updater.js is the single driver of state updates
======================================================================= */
console.log("[DASHBOARD] loaded");
/* ---------------------------------------------------------
Helpers
--------------------------------------------------------- */
async function apiGet(url) {
console.log("[DASHBOARD][API] GET", url);
try {
const r = await fetch(url, { cache: "no-store" });
if (!r.ok) {
console.warn("[DASHBOARD][API] GET failed", url, r.status);
return null;
}
return await r.json();
} catch (e) {
console.error("[DASHBOARD][API] GET error", url, e);
return null;
}
}
function safeUpdateLogs(data) {
if (typeof window.updateLogs === "function") {
console.log("[DASHBOARD] updateLogs()");
window.updateLogs(data);
}
}
/* ---------------------------------------------------------
State
--------------------------------------------------------- */
let ACTIVE_BOOK_IDX = null;
/* ---------------------------------------------------------
DOM READY
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log("[DASHBOARD] DOMContentLoaded");
bindSidebar();
bindBookCardButtons();
const first = document.querySelector(".book-list-item");
if (first) {
console.log("[DASHBOARD] auto-select", first.dataset.bookIdx);
selectBook(first.dataset.bookIdx);
}
});
/* ---------------------------------------------------------
Sidebar
--------------------------------------------------------- */
function bindSidebar() {
console.log("[DASHBOARD] bindSidebar()");
document.querySelectorAll(".book-list-item").forEach((item) => {
item.onclick = () => selectBook(item.dataset.bookIdx);
});
}
function selectBook(bookIdx) {
if (!bookIdx || bookIdx === ACTIVE_BOOK_IDX) return;
ACTIVE_BOOK_IDX = bookIdx;
console.log("[DASHBOARD] selectBook", bookIdx);
document.querySelectorAll(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookIdx === bookIdx);
});
refreshBook(bookIdx);
}
/* ---------------------------------------------------------
Book refresh (NO POLLING)
--------------------------------------------------------- */
async function refreshBook(bookIdx) {
console.log("[DASHBOARD] refreshBook", bookIdx);
const logs = await apiGet(`/api/book/${bookIdx}/logs`);
if (logs) safeUpdateLogs(logs);
refreshBookCards();
}
/* ---------------------------------------------------------
Bookcard buttons
--------------------------------------------------------- */
function bindBookCardButtons() {
console.log("[DASHBOARD] bindBookCardButtons()");
document.querySelectorAll(".icon-start").forEach((btn) => {
if (btn.dataset.bound) return;
btn.dataset.bound = "1";
btn.onclick = (e) => {
e.preventDefault();
const card = btn.closest(".book-card");
if (!card) return;
startBook(card.dataset.bookIdx);
};
});
document.querySelectorAll(".icon-abort").forEach((btn) => {
if (btn.dataset.bound) return;
btn.dataset.bound = "1";
btn.onclick = (e) => {
e.preventDefault();
const card = btn.closest(".book-card");
if (!card) return;
abortBook(card.dataset.bookIdx);
};
});
}
/* ---------------------------------------------------------
START
--------------------------------------------------------- */
function startBook(bookIdx) {
console.log("[DASHBOARD] START", bookIdx);
fetch("/start", {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_idx=${bookIdx}`,
}).then(() => refreshBook(bookIdx));
}
/* ---------------------------------------------------------
ABORT
--------------------------------------------------------- */
function abortBook(bookIdx) {
if (!confirm(`Abort book ${bookIdx}?`)) return;
console.log("[DASHBOARD] ABORT", bookIdx);
fetch(`/abort/${bookIdx}`, { method: "POST" }).then(() =>
refreshBook(bookIdx)
);
}
/* ---------------------------------------------------------
Bookcard UI refresh (non-progress)
--------------------------------------------------------- */
async function refreshBookCards() {
console.log("[DASHBOARD] refreshBookCards()");
const books = await apiGet("/api/books");
if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => {
const idx = card.dataset.bookIdx;
    const info = books.find((b) => String(b.book_idx) === idx); // book_idx may be numeric; dataset idx is a string
if (!info) return;
console.log("[DASHBOARD] card status", idx, info.status);
card.className = `book-card ${info.status}`;
const abortBtn = card.querySelector(".icon-abort");
if (abortBtn) {
abortBtn.disabled = ![
"processing",
"downloading",
"parsing",
"audio",
].includes(info.status);
}
});
}
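abortBook() above POSTs to /abort/<book_idx>; the Flask side is not in this diff, but app.py imports set_abort, so the handler presumably looks something like this (a sketch under that assumption, not the actual route):

```python
# Hypothetical abort endpoint matching the fetch() call in dashboard.js.
@app.route("/abort/<book_idx>", methods=["POST"])
def abort_book(book_idx):
    set_abort(book_idx)  # flag checked by the download/parse/audio workers
    return jsonify({"ok": True, "book_idx": book_idx})
```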

@ -0,0 +1,13 @@
/* =======================================================================
File: static/js/helpers.js
Purpose:
Shared DOM helpers for all JS files.
======================================================================= */
window.$ = (sel) => document.querySelector(sel);
window.$$ = (sel) => document.querySelectorAll(sel);
window.autoScroll = function (el) {
if (!el) return;
el.scrollTop = el.scrollHeight;
};

@ -0,0 +1,101 @@
/* ============================================================
File: static/js/inspect_state.js
Purpose:
- Receive merged state via state_updater.js
- Update ONLY the right-side state tables
- NO polling, NO fetch
============================================================ */
console.log("[inspect_state] JS loaded (subscriber mode)");
/* ------------------------------------------------------------
State subscription
------------------------------------------------------------ */
window.addEventListener("state:update", (e) => {
const entries = e.detail;
if (!Array.isArray(entries)) {
console.warn("[inspect_state] state:update payload is not array", entries);
return;
}
console.log("[inspect_state] state:update received entries:", entries.length);
updateInspectTables(entries);
});
/* ------------------------------------------------------------
Update tables
------------------------------------------------------------ */
function updateInspectTables(entries) {
console.log("[inspect_state] updating tables");
entries.forEach((entry) => {
const bookIdx = entry.book_idx;
if (bookIdx == null) {
console.warn("[inspect_state] entry without book_idx", entry);
return;
}
const block = document.querySelector(
`.state-block[data-book-idx="${bookIdx}"]`
);
if (!block) {
console.warn("[inspect_state] no state-block for book_idx", bookIdx);
return;
}
const table = block.querySelector(".state-table");
if (!table) {
console.warn("[inspect_state] no state-table for book_idx", bookIdx);
return;
}
console.log("[inspect_state] updating table for book_idx", bookIdx);
const sql = entry.sqlite || {};
const redis = entry.redis || {};
const merged = entry.would_merge_to || {};
table.innerHTML = `
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
${row("status", sql, redis, merged)}
${row("chapters_total", sql, redis, merged)}
${row("downloaded", sql, redis, merged)}
${row("chapters_download_done", sql, redis, merged)}
${row("chapters_download_skipped", sql, redis, merged)}
${row("parsed", sql, redis, merged)}
${row("chapters_parsed_done", sql, redis, merged)}
${row("audio_done", sql, redis, merged)}
${row("audio_skipped", sql, redis, merged)}
${row("last_update", sql, redis, merged)}
`;
});
}
/* ------------------------------------------------------------
Row helper
------------------------------------------------------------ */
function row(field, sql, redis, merged) {
const s = sql[field] ?? "";
const r = redis[field] ?? "";
const m = merged[field] ?? "";
const cls = String(s) === String(r) ? "same" : "diff";
return `
<tr>
<th>${field}</th>
<td class="${cls}">${s}</td>
<td class="${cls}">${r}</td>
<td>${m}</td>
</tr>
`;
}

@ -0,0 +1,130 @@
/* =======================================================================
File: static/js/log_view.js
Purpose:
High-performance rolling log viewer
- efficient delta polling
- append-only mode (no DOM reset)
- rolling limit (prevents memory freeze)
- supports both global logs and per-book logs
======================================================================= */
console.log(">>> log_view.js LOADING…");
/* ---------------------------------------------------------
Global log viewer state
--------------------------------------------------------- */
let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // delta offset
const MAX_LOG_LINES = 600;
/* ---------------------------------------------------------
Apply filter on existing log lines
--------------------------------------------------------- */
function applyLogFilter() {
const lines = $$(".log-line");
lines.forEach((line) => {
const text = line.innerText;
const show = LOG_FILTER === "ALL" || (text && text.includes(LOG_FILTER));
line.style.display = show ? "block" : "none";
});
}
/* ---------------------------------------------------------
DOM Ready bind clear/filter
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded");
const clearBtn = $("#log-clear");
const output = $("#log-output");
if (!output) {
console.log(">>> log_view.js: No #log-output → viewer disabled");
return;
}
if (clearBtn) {
clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js: Clear log viewer");
output.innerHTML = "";
LAST_LOG_INDEX = -1;
});
}
});
/* ---------------------------------------------------------
Append ONE line
--------------------------------------------------------- */
function rollingAppend(lineText) {
const output = $("#log-output");
if (!output) return;
const div = document.createElement("div");
div.classList.add("log-line");
// Type detection
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]"))
div.classList.add("dl");
else if (lineText.includes("[PARSE]")) div.classList.add("parse");
else if (lineText.includes("[SAVE]")) div.classList.add("save");
else if (lineText.includes("[AUDIO]")) div.classList.add("audio");
else if (lineText.includes("[CTRL]")) div.classList.add("ctrl");
else if (lineText.includes("[ERROR]")) div.classList.add("error");
else div.classList.add("default");
div.textContent = lineText;
output.appendChild(div);
// Rolling limit
while (output.childNodes.length > MAX_LOG_LINES) {
output.removeChild(output.firstChild);
}
}
/* ---------------------------------------------------------
Primary entry: updateLogs()
Accepts:
{ logs:[...], last:N }
OR legacy:
{ lines:[...], last:N }
--------------------------------------------------------- */
function updateLogs(packet) {
const output = $("#log-output");
if (!output || !packet) return;
let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return;
lines.forEach((line) => rollingAppend(line));
// Correct unified delta index handling
if (packet.last !== undefined) {
LAST_LOG_INDEX = packet.last;
}
applyLogFilter();
autoScroll(output);
}
/* ---------------------------------------------------------
Delta polling global logs ONLY
(dashboard.js overrides logs per-book)
--------------------------------------------------------- */
function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
.then((r) => r.json())
.then((data) => {
const lines = data.lines || [];
if (lines.length > 0) {
lines.forEach((line) => rollingAppend(line));
LAST_LOG_INDEX = data.last;
}
})
.catch((err) => {
console.warn(">>> log_view.js pollLogs() error:", err);
});
}
setInterval(pollLogs, 2800);
console.log(">>> log_view.js LOADED");

@ -0,0 +1,98 @@
/* ========================================================
File: static/js/state_updater.js
Purpose:
- Poll /api/state/all
- Dispatch merged state to subscribers
(bookcard_controller, inspect_state, others)
- Pause polling when tab inactive
======================================================== */
console.log("[STATE-UPDATER] loaded");
const STATE_POLL_INTERVAL_MS = 2500;
const STATE_ENDPOINT = "/api/state/all";
let STATE_TIMER = null;
/* ========================================================
INIT
======================================================== */
document.addEventListener("DOMContentLoaded", () => {
initStateUpdater();
});
function initStateUpdater() {
const cards = document.querySelectorAll(".book-card");
if (cards.length === 0) {
console.log("[STATE-UPDATER] No bookcards found — skipping");
return;
}
console.log(`[STATE-UPDATER] Starting updater for ${cards.length} bookcards`);
startPolling(true);
document.addEventListener("visibilitychange", () => {
document.hidden ? stopPolling() : startPolling(true);
});
}
/* ========================================================
DISPATCH
======================================================== */
function dispatchState(entries) {
console.debug("[STATE] dispatch", entries.length);
// 1. Bookcards
if (typeof window.updateBookCardsFromState === "function") {
window.updateBookCardsFromState(entries);
}
// 2. Inspect state tables / other subscribers
window.dispatchEvent(new CustomEvent("state:update", { detail: entries }));
}
/* ========================================================
POLLING CONTROL
======================================================== */
function startPolling(immediate = false) {
if (STATE_TIMER) return;
console.log("[STATE-UPDATER] Start polling");
if (immediate) pollState();
STATE_TIMER = setInterval(pollState, STATE_POLL_INTERVAL_MS);
}
function stopPolling() {
if (!STATE_TIMER) return;
console.log("[STATE-UPDATER] Stop polling (tab inactive)");
clearInterval(STATE_TIMER);
STATE_TIMER = null;
}
/* ========================================================
POLL API
======================================================== */
async function pollState() {
if (document.hidden) return;
try {
const resp = await fetch(STATE_ENDPOINT, { cache: "no-store" });
if (!resp.ok) return;
const entries = await resp.json();
if (!Array.isArray(entries)) return;
dispatchState(entries);
} catch (e) {
console.error("[STATE-UPDATER] poll error", e);
}
}
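The poller expects /api/state/all to return the same entry list that inspect_books_state() builds (book_idx, sqlite, redis, would_merge_to, card). A sketch of that endpoint — the wiring is assumed, as the route is not shown in this diff:

```python
# Hypothetical endpoint feeding bookcard_controller.js and inspect_state.js.
@app.route("/api/state/all")
def api_state_all():
    return jsonify(inspect_books_state())
```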

@ -0,0 +1,35 @@
<!-- File: templates/base.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>BookScraper</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" />
</head>
<body>
<!-- Global Navigation -->
{% include "components/nav.html" %}
<!-- Main Content Area -->
<main class="container">{% block content %}{% endblock %}</main>
<!-- JS -->
<script src="/static/js/app.js"></script>
<script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script>
<!-- GLOBAL STATE UPDATER -->
<script src="/static/js/state_updater.js"></script>
<script>
document.addEventListener("DOMContentLoaded", () => {
if (typeof initStateUpdater === "function") {
initStateUpdater();
}
});
</script>
</body>
</html>

@ -0,0 +1,66 @@
<!-- =======================================================================
  File: templates/components/book_list_item.html
  Purpose:
    Dashboard rendering of a single book in the list.
    Variables come in as:
      book.<field>
    → The book now uses book_idx exclusively as its primary key
======================================================================= -->
<div class="book-list-item" data-book-idx="{{ book.book_idx }}">
<!-- Left area: title + metadata -->
<div class="book-info">
<div class="book-title">{{ book.title }}</div>
<div class="book-meta">
<span class="meta-label">IDX:</span> {{ book.book_idx }} {% if
book.last_update %}
<span class="meta-separator"></span>
<span class="meta-label">Updated:</span> {{ book.last_update }} {% endif
%}
</div>
</div>
<!-- Center area: Status -->
<div class="book-status">
<span class="status-badge status-{{ book.status|lower }}">
{{ book.status|capitalize }}
</span>
</div>
<!-- Right area: progress mini-bars -->
<div class="book-progress-mini">
<!-- Download progress -->
<div class="progress-mini-row">
<span class="mini-label">DL:</span>
      {% set pct_dl = 0 %}
      {% if book.download_total > 0 %}
      {% set pct_dl = (100 * book.download_done / book.download_total) | round(0) %}
      {% endif %}
<div class="progress-mini-bar">
<div class="fill" style="width: {{ pct_dl }}%;"></div>
</div>
<span class="mini-value">{{ pct_dl }}%</span>
</div>
<!-- Audio progress -->
<div class="progress-mini-row">
<span class="mini-label">AU:</span>
      {% set pct_au = 0 %}
      {% if book.audio_total > 0 %}
      {% set pct_au = (100 * book.audio_done / book.audio_total) | round(0) %}
      {% endif %}
<div class="progress-mini-bar audio">
<div class="fill audio-fill" style="width: {{ pct_au }}%;"></div>
</div>
<span class="mini-value">{{ pct_au }}%</span>
</div>
</div>
<!-- Abort button -->
<div class="book-abort-area">
<button class="abort-btn" onclick="abortBookAjax('{{ book.book_idx }}')">
Abort
</button>
</div>
</div>

@ -0,0 +1,90 @@
{# ============================================================
   File: templates/components/bookcard.html
   Purpose: A single book card (dumb component)
============================================================ #}
<div class="book-card {{ b.status }}" data-book-idx="{{ b.book_idx }}">
<!-- HIDE -->
<form
action="/hide/{{ b.book_idx }}"
method="POST"
class="hide-form"
onsubmit="return confirm('Dit boek verbergen?')"
>
<button class="icon-btn icon-hide" title="Verbergen">
<i class="fa-solid fa-xmark"></i>
</button>
</form>
<!-- COVER -->
<div class="book-cover">
{% if b.cover_path %}
<img
src="/{{ b.cover_path }}"
class="book-img"
data-field="cover"
alt="cover"
/>
{% else %}
<div class="book-img placeholder" data-field="cover">?</div>
{% endif %}
</div>
<!-- META -->
<div class="book-meta">
<!-- STATUS BADGE -->
{% if b.status %}
<span
class="status-badge status-{{ b.status }}"
title="
{% if b.status == 'done' %}Alle chapters en audio zijn compleet{% endif %}
{% if b.status == 'audio' %}Downloads compleet, audio wordt nog gegenereerd{% endif %}
{% if b.status == 'downloading' %}Bezig met downloaden{% endif %}
"
>
{{ b.status | upper }}
</span>
{% endif %}
<div class="book-title" data-field="title">{{ b.title }}</div>
<div class="book-author" data-field="author">{{ b.author }}</div>
<div class="book-created">
Geregistreerd: <span data-field="created_at">{{ b.created_at }}</span>
</div>
<!-- ACTIONS -->
<div class="book-actions">
<!-- START -->
<form action="/start" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-start" title="Start" data-action="start">
<i class="fa-solid fa-play"></i>
</button>
</form>
<!-- ABORT -->
<form action="/abort/{{ b.book_idx }}" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-abort" title="Abort" data-action="abort">
<i class="fa-solid fa-stop"></i>
</button>
</form>
<form
method="post"
action="/inspect/statuscheck/{{ b.book_idx }}"
style="display: inline-block"
>
<button
type="submit"
class="icon-btn icon-statuscheck"
title="Herbereken status op basis van bestanden"
>
<i class="fa-solid fa-magnifying-glass-chart"></i>
</button>
</form>
</div>
</div>
<!-- PROGRESS -->
<div class="book-progress">{% include "components/progress_box.html" %}</div>
</div>

@ -0,0 +1,44 @@
<!-- =======================================================================
File: templates/components/log_view.html
Purpose: Reusable log viewer component for any page (dashboard/start/book)
Notes:
- Requires JS: /static/js/log_view.js
- Supports filtering by tag (e.g. [DL], [PARSE], [AUDIO], [CTRL], ...)
- Template expects optional variable `logs` (list[str])
======================================================================= -->
<div id="log-viewer" class="log-viewer">
<!-- ========================== HEADER ========================== -->
<div class="log-header">
<h2>Live Log</h2>
<div class="log-filters">
<label for="log-filter">Filter:</label>
<select id="log-filter">
<option value="ALL">All</option>
<option value="[DL]">Download</option>
<option value="[PARSE]">Parse</option>
<option value="[SAVE]">Save</option>
<option value="[AUDIO]">Audio</option>
<option value="[CTRL]">Controller</option>
<option value="[SCRAPING]">Scraping</option>
<option value="[BOOK]">Book</option>
<option value="[ERROR]">Errors</option>
</select>
<button id="log-clear" class="btn-small">Clear</button>
</div>
</div>
<!-- ========================== OUTPUT ========================== -->
<div id="log-output" class="log-output">
{% if logs and logs|length > 0 %} {% for line in logs %}
<div class="log-line">{{ line }}</div>
{% endfor %} {% else %}
<div class="log-empty">No logs yet…</div>
{% endif %}
</div>
</div>
<script src="/static/js/log_view.js"></script>

@ -0,0 +1,40 @@
<!-- =======================================================================
File: templates/components/nav.html
Purpose: Global navigation bar for BookScraper UI (improved version)
======================================================================= -->
<nav class="navbar">
<div class="nav-inner">
<!-- Branding / Home -->
<div class="nav-brand">
<a href="/">BookScraper</a>
</div>
<!-- Main navigation -->
<ul class="nav-links">
<li>
<a href="/dashboard" class="nav-item"> Dashboard </a>
</li>
<li>
<a href="/api/books" class="nav-item"> Active Books </a>
</li>
<li>
<a href="/debug/inspect_state" class="nav-item"> State overview </a>
</li>
<!-- Tools dropdown -->
<li class="nav-dropdown">
<span class="nav-item">Tools ▾</span>
<ul class="dropdown-menu">
<li><a href="/api/db/books">DB Viewer</a></li>
<li><a href="/debug/inspect_state">Inspect State</a></li>
<li><a href="/debug/sync_state">Sync State</a></li>
<li><a href="/debug/redis-keys">Redis Keys</a></li>
<li><a href="/debug/queues">queues</a></li>
</ul>
</li>
</ul>
</div>
</nav>

@ -0,0 +1,34 @@
<!-- =======================================================================
File: templates/components/progress_box.html
Purpose:
Dumb progress UI for a book card.
Initial values via Jinja, live updates via state_updater.js
======================================================================= -->
<div class="progress-box">
<!-- DOWNLOAD -->
<div class="progress-row">
<div class="progress-label">Download</div>
<div class="progressbar">
<div
class="progressbar-fill download"
data-field="download_pct"
style="width: 0%"
></div>
<div class="progressbar-text" data-field="download_text">0 / 0</div>
</div>
</div>
<!-- AUDIO -->
<div class="progress-row">
<div class="progress-label">Audio</div>
<div class="progressbar">
<div
class="progressbar-fill audio"
data-field="audio_pct"
style="width: 0%"
></div>
<div class="progressbar-text" data-field="audio_text">0 / 0</div>
</div>
</div>
</div>

@ -0,0 +1,21 @@
{# ============================================================
   File: templates/components/registered_books.html
   Purpose: Show a grid of registered books.
            Each card is rendered via bookcard.html.
============================================================ #}
<section class="dashboard-section">
<h2>Geregistreerde boeken</h2>
{% if registered and registered|length > 0 %}
<div class="registered-grid">
    {% for b in registered %}
    {% include "components/bookcard.html" %}
    {% endfor %}
</div>
{% else %}
<p>Geen geregistreerde boeken.</p>
{% endif %}
</section>

@ -0,0 +1,22 @@
<!-- =======================================================================
File: templates/components/url_input.html
Purpose:
Reusable component for entering a book URL.
Used on landing pages or detail pages.
======================================================================= -->
<form method="POST" action="/init" class="url-form">
<label for="urls" class="url-label"> Book URL(s) one per line: </label>
<textarea
id="urls"
name="urls"
class="url-input"
rows="5"
placeholder="https://www.piaotia.com/bookinfo/6/6072.html
https://www.piaotia.com/bookinfo/3/3785.html"
required
></textarea>
<button type="submit" class="btn-primary url-submit">Register book(s)</button>
</form>

@ -0,0 +1,38 @@
<!-- =======================================================================
File: templates/dashboard/book_detail.html
Purpose:
    Detail page for a single book_idx.
    Shows progress (download/audio) + filters + live logs.
======================================================================= -->
{% extends "layout.html" %} {% block content %}
<div class="dashboard-detail">
<h1 class="page-title">{{ title }}</h1>
<p class="breadcrumb">
<a href="/dashboard">← Terug naar dashboard</a>
</p>
<!-- Progress box -->
<section id="progressSection">
    {% with book_idx=book_idx, title=title,
       download_total=download_total, download_done=download_done,
       audio_total=audio_total, audio_done=audio_done %}
    {% include "components/progress_box.html" %}
    {% endwith %}
</section>
<!-- Log view -->
<section class="log-section">
<h2>Live Log</h2>
{% include "components/log_view.html" %}
</section>
</div>
<!-- PAGE-SPECIFIC JS -->
<script>
const BOOK_IDX = "{{ book_idx }}";
</script>
<script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script>
{% endblock %}

@ -0,0 +1,46 @@
{% extends "layout.html" %} {% block content %}
<div class="dashboard-container">
<!-- =======================================================================
File: templates/dashboard/dashboard.html
Purpose:
      Functional dashboard:
      • Start a new scrape via the URL input component
      • Shows the list of active books (active state model)
      • Shows global live logs
    Requires:
      - books: list of active books
      - logs: list of global log lines (optional)
======================================================================= -->
<!-- ===========================================================
    URL INPUT — Start a new scrape
=========================================================== -->
<section class="dashboard-section">
<h2>Start nieuwe scrape</h2>
{% include "components/url_input.html" %}
</section>
<hr />
<!-- ===========================================================
BOOK LIST
=========================================================== -->
{% include "components/registered_books.html" %}
<hr />
<!-- ===========================================================
GLOBAL LIVE LOG VIEW
=========================================================== -->
<section class="dashboard-section">
<h2>Live log (globaal)</h2>
    {# log_view always expects 'logs' — guarantee a list #}
    {% set logs = logs or [] %}
    {% include "components/log_view.html" %}
</section>
</div>
<!-- dashboard.js is loaded globally by layout.html; no extra JS needed here -->
{% endblock %}

@ -0,0 +1,95 @@
{# ============================================================
   File: templates/debug/inspect_state.html
   Purpose: Inspect SQLite vs Redis state per book_idx
     - Initial render via Jinja
     - Live updates via inspect_state.js
     - BookCard is server-rendered and NEVER replaced
     - Only the right-side state table is updated dynamically
============================================================ #}
{% extends "layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1>
<style>
.state-block {
display: grid;
grid-template-columns: 380px 1fr;
gap: 20px;
margin-bottom: 35px;
padding: 18px;
border: 1px solid #444;
background: #222;
border-radius: 8px;
}
.state-table {
width: 100%;
border-collapse: collapse;
}
.state-table th,
.state-table td {
border: 1px solid #555;
padding: 6px 10px;
}
.state-table th {
background: #333;
color: #fff;
}
.state-table td {
background: #2a2a2a;
color: #ddd;
}
.same {
color: #9f9 !important;
}
.diff {
color: #ff7b7b !important;
font-weight: bold;
}
</style>
<div id="state-container">
{% for entry in results %}
<div class="state-block" data-book-idx="{{ entry.book_idx }}">
<!-- LEFT: BookCard (server-rendered, NEVER replaced) -->
<div>
      {% if entry.card %}
      {% with b = entry.card %}
      {% include "components/bookcard.html" %}
      {% endwith %}
      {% else %}
<strong>{{ entry.book_idx }}</strong>
{% endif %}
</div>
<!-- RIGHT: State table (updated by JS) -->
<div>
<table class="state-table">
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
        {% set sql = entry.sqlite %}
        {% set redis = entry.redis %}
        {% set merged = entry.would_merge_to %}
        {% for field in [
          "status", "chapters_total", "downloaded",
          "chapters_download_done", "chapters_download_skipped",
          "parsed", "chapters_parsed_done",
          "audio_done", "audio_skipped", "last_update"
        ] %}
<tr>
<th>{{ field }}</th>
<td>{{ sql.get(field, "") }}</td>
<td>{{ redis.get(field, "") }}</td>
<td>{{ merged.get(field, "") }}</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endfor %}
</div>
{% endblock %} {% block scripts %}
<script src="/static/js/inspect_state.js"></script>
{% endblock %}

@ -0,0 +1,91 @@
{% extends "layout.html" %} {% block content %}
<h1>Celery Queue Debug</h1>
<style>
.debug-section {
margin-bottom: 40px;
}
.debug-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}
.debug-table th,
.debug-table td {
border: 1px solid #444;
padding: 6px 10px;
}
.debug-table th {
background: #333;
color: #fff;
}
pre {
background: #1e1e1e;
color: #ddd;
padding: 10px;
overflow-x: auto;
}
code {
color: #9cf;
}
</style>
<div class="debug-section">
<h2>Workers</h2>
<h3>Active Tasks</h3>
<pre>{{ workers_active | tojson(indent=2) }}</pre>
<h3>Reserved</h3>
<pre>{{ workers_reserved | tojson(indent=2) }}</pre>
<h3>Scheduled</h3>
<pre>{{ workers_scheduled | tojson(indent=2) }}</pre>
</div>
<hr />
<div class="debug-section">
<h2>Queues</h2>
{% for q in queues %}
<div class="debug-queue">
<h3>{{ q.name }} ({{ q.length }} items)</h3>
<table class="debug-table">
<tr>
<th>Redis Key</th>
<td>{{ q.redis_key }}</td>
</tr>
<tr>
<th>Length</th>
<td>{{ q.length }}</td>
</tr>
<tr>
<th>Items (first 30)</th>
<td>
{% if q["items"] %}
<ul style="margin: 0; padding-left: 20px">
{% for item in q["items"] %}
<li><code>{{ item | e }}</code></li>
{% endfor %}
</ul>
{% else %}
<i>No items</i>
{% endif %}
</td>
</tr>
</table>
</div>
{% endfor %}
</div>
<script>
setInterval(() => {
window.location.reload();
}, 5000);
</script>
{% endblock %}

@ -0,0 +1,23 @@
<!-- =======================================================================
File: templates/home.html
Purpose:
New landing page for starting a scrape.
Does NOT replace existing index.html.
Uses reusable components (url_input).
Redirects to /start?url=...
======================================================================= -->
{% extends "layout.html" %} {% block content %}
<div class="landing-container">
<h1 class="landing-title">Start a New Book Scrape</h1>
<!-- Reusable URL input component -->
{% include "components/url_input.html" %}
<div class="landing-links">
<a href="/dashboard">→ Go to Dashboard</a>
</div>
</div>
{% endblock %}

@ -1,34 +1,53 @@
<!DOCTYPE html>
<html lang="nl">
  <head>
    <meta charset="UTF-8" />
    <title>BookScraper</title>
    <style>
      body {
        font-family: Arial, sans-serif;
        padding: 40px;
        max-width: 600px;
        margin: auto;
      }
      h1 {
        margin-bottom: 20px;
      }
      input[type="text"] {
        width: 100%;
        padding: 12px;
        font-size: 16px;
        border: 1px solid #ccc;
        border-radius: 6px;
      }
      button {
        margin-top: 20px;
        padding: 12px 20px;
        background: #007bff;
        color: white;
        border: none;
        border-radius: 6px;
        font-size: 16px;
        cursor: pointer;
      }
      button:hover {
        background: #0056b3;
      }
    </style>
  </head>
  <body>
    <h1>BookScraper WebGUI</h1>
    <form action="/init" method="POST">
      <label for="url">Geef een boek-URL op:</label><br /><br />
      <input
        type="text"
        id="url"
        name="url"
        placeholder="https://example.com/book/12345"
        required
      />
      <button type="submit">Start Scraping</button>
    </form>
  </body>
</html>
@ -0,0 +1,115 @@
{% extends "layout.html" %} {% block content %}
<h2>Statuscheck Inspect</h2>
{% if error %}
<div class="error"><strong>Fout:</strong> {{ error }}</div>
{% else %}
<!-- ===================================================== -->
<!-- BOOK -->
<!-- ===================================================== -->
<h3>Boek</h3>
<table class="kv">
<tr>
<th>Book idx</th>
<td>{{ result.book_idx }}</td>
</tr>
<tr>
<th>Pad</th>
<td>{{ result.filesystem.book_dir }}</td>
</tr>
<tr>
<th>Bestaat</th>
<td>{{ result.filesystem.exists }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- FILESYSTEM -->
<!-- ===================================================== -->
<h3>Filesystem (source of truth)</h3>
<table class="kv">
<tr>
<th>Volumes</th>
<td>{{ result.filesystem.volumes }}</td>
</tr>
<tr>
<th>Chapters (.txt)</th>
<td>{{ result.filesystem.chapters_txt }}</td>
</tr>
<tr>
<th>Audio (.m4b)</th>
<td>{{ result.filesystem.audio_files }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- SQL -->
<!-- ===================================================== -->
<h3>SQL snapshot</h3>
<h4>Voor</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_before.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_before.audio_done }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ result.sql_before.status }}</td>
</tr>
</table>
<h4>Na</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_after.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_after.audio_done }}</td>
</tr>
<tr>
<th>Last update</th>
<td>{{ result.sql_after.last_update }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- REPOSITORY -->
<!-- ===================================================== -->
<h3>Repository merged state (UI input)</h3>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ repo_state.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ repo_state.audio_done }}</td>
</tr>
<tr>
<th>Chapters total</th>
<td>{{ repo_state.chapters_total }}</td>
</tr>
</table>
<details>
<summary>Raw repository state</summary>
<pre>{{ repo_state | tojson(indent=2) }}</pre>
</details>
{% endif %}
<hr />
<a href="/dashboard">← Terug naar dashboard</a>
{% endblock %}

@ -0,0 +1,44 @@
<!DOCTYPE html>
<html lang="nl">
<head>
<!-- =======================================================================
File: templates/layout.html
Purpose:
    Global layout for all pages
======================================================================= -->
<meta charset="UTF-8" />
<title>BookScraper</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" />
<link rel="stylesheet" href="/static/css/bookcard.css" />
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css"
/>
    <!-- GLOBAL HELPERS (must ALWAYS be loaded before everything else) -->
<script src="/static/js/helpers.js"></script>
</head>
<body>
{% include "components/nav.html" %}
<main class="container">{% block content %}{% endblock %}</main>
<footer class="footer">
BookScraper © 2025 — Powered by Celery + Redis
</footer>
{% block scripts %}{% endblock %}
<script src="/static/js/bookcard_controller.js"></script>
<script src="/static/js/state_updater.js"></script>
<script src="/static/js/dashboard.js"></script>
    <!-- GLOBAL APP LOGIC (always loaded last) -->
<script src="/static/js/app.js"></script>
</body>
</html>

@ -66,6 +66,7 @@
      display: none;
    }
  </style>
</head>
<body>
@ -57,7 +57,9 @@ Copyright ©=
本站立场无关=
均由网友发表或上传=
感谢各位书友的支持,您的支持就是我们最大的动力
飘天文学www.piaotia.com
感谢各位书友的支持
您的支持就是我们最大的动力
# ---------- COMMON NOISE ----------
广告=
广告位=
@ -0,0 +1,13 @@
#!/bin/sh
# mp4info shim for m4b-tool (ffprobe-based)
if [ -z "$1" ]; then
echo "Usage: mp4info <file>" >&2
exit 1
fi
# ffprobe outputs float seconds; m4b-tool expects an integer
ffprobe -v error \
-show_entries format=duration \
-of default=noprint_wrappers=1:nokey=1 \
"$1" | awk '{ printf "%d\n", ($1 + 0.5) }'

@ -5,7 +5,7 @@ import requests
from io import BytesIO
from bs4 import BeautifulSoup
from scraper.logger import log_debug
from scraper.utils.utils import clean_text
from urllib.parse import urljoin

@ -103,8 +103,11 @@ class ChapterDownloader:
                collecting = True
                continue
            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

@ -121,6 +124,7 @@ class ChapterDownloader:
        vdir = f"{output_base}/v{volume}"
        import os
        os.makedirs(vdir, exist_ok=True)
        fname = f"{number:05d}_{title}.txt"
File diff suppressed because it is too large