Compare commits


No commits in common. 'main' and 'feat/download-progress-abort' have entirely different histories.

.gitignore

@@ -11,6 +11,4 @@
# Ignore all .env files
.env
**/.env
log.txt
**/static/covers/

@@ -1,4 +1 @@
output/
venv/
*.log
__pycache__/

@@ -123,43 +123,7 @@ docker run \
bookscraper
```
docker compose down --remove-orphans
docker image prune -f
docker builder prune -af
docker volume prune -f
docker compose build --no-cache
docker compose up
docker compose down
docker compose build
docker compose build --no-cache
docker compose up
```
docker compose up -d
docker compose build --no-cache web && docker compose up web
docker compose build worker_download && docker compose up worker_download
docker compose down --remove-orphans
docker compose build --no-cache worker_m4b
docker compose up -d worker_m4b
docker compose up web
docker compose build web
docker compose restart web
tar \
--exclude="**pycache**" \
--exclude="_/**pycache**/_" \
--exclude="\*.pyc" \
--exclude=".venv" \
--exclude="venv" \
-czvf project.tar.gz .
docker compose down
docker image rm bookscraper-worker_m4b || true
docker builder prune -af
docker compose build --no-cache worker_m4b
docker compose up -d worker_m4b

@@ -1,529 +1,110 @@
# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
from dotenv import load_dotenv
load_dotenv()
import os
from flask import (
Flask,
render_template,
request,
jsonify,
send_from_directory,
redirect,
url_for,
)
print(">>> [WEB] Importing celery_app …") print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app from celery_app import celery_app
from celery.result import AsyncResult
from db.db import init_db
from db.repository import (
get_registered_books,
fetch_book,
fetch_all_books,
get_progress,
)
from logbus.publisher import log
from flask import Flask, render_template, request, jsonify
from scraper.logger import log_debug
# Abort + Progress (per book_id)
from scraper.abort import set_abort
from scraper.ui_log import get_ui_logs, reset_ui_logs
from scraper.progress import get_progress
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis
from scraper.services.init_service import InitService
# UI LOGS (GLOBAL — no book_id)
from scraper.ui_log import get_ui_logs
from celery.result import AsyncResult
# INIT DB
init_db()
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
# =====================================================
# SECTION 1 — NAVIGATION / HTML PAGES
# HOME PAGE
# =====================================================
@app.route("/", methods=["GET"])
@logcall
def index():
return redirect(url_for("dashboard"))
return render_template("index.html")
@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
logs_list = get_ui_logs() or []
registered_books = get_registered_books()
log(f"[WEB] Registered books: {registered_books}")
from db.repository import fetch_all_books
from pprint import pprint
pprint(fetch_all_books())
pprint(get_registered_books())
# reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
books=list_active_books(),
registered=registered_books,
logs=logs_list,
)
@app.route("/book/<book_idx>")
@logcall
def book_detail(book_idx):
title = r.get(f"book:{book_idx}:title") or book_idx
return render_template(
"dashboard/book_detail.html",
book_id=book_idx,
title=title,
logs=get_ui_logs(),
)
# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
# START SCRAPING (async via Celery)
# =====================================================
@app.route("/init", methods=["POST"])
@logcall
def init_book():
# -------------------------------------------------
# Accept single URL (legacy) OR multi-line URLs
# -------------------------------------------------
raw_urls = request.form.get("urls") or request.form.get("url") or ""
urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]
if not urls:
return render_template(
"dashboard/dashboard.html",
error="Geen URL(s) opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
# -------------------------------------------------
# Duplicate check: existing book_ids
# -------------------------------------------------
existing_books = {b["book_idx"] for b in fetch_all_books()}
results = []
# -------------------------------------------------
# Process each URL independently
# -------------------------------------------------
for url in urls:
try:
book_id = InitService.derive_book_id(url)
if book_id in existing_books:
results.append(
{
"url": url,
"status": "skipped",
"book_id": book_id,
"message": "Al geregistreerd",
}
)
continue
result = InitService.execute(url)
results.append(
{
"url": url,
"status": "registered",
"book_id": result.get("book_id"),
"title": result.get("title"),
}
)
except Exception as e:
log_debug(f"[INIT] ERROR for url={url}: {e}")
results.append(
{
"url": url,
"status": "error",
"error": str(e),
}
)
# -------------------------------------------------
# Summary message
# -------------------------------------------------
ok = sum(1 for r in results if r["status"] == "registered")
skipped = sum(1 for r in results if r["status"] == "skipped")
failed = sum(1 for r in results if r["status"] == "error")
message = f"Geregistreerd: {ok}, overgeslagen: {skipped}, fouten: {failed}"
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
message=message,
init_results=results,  # optional for UI display
books=list_active_books(),
registered=reg,
logs=get_ui_logs(),
)
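For reference, a minimal sketch of how a client could exercise the /init route above (the host, port, and URLs are assumptions, not part of this codebase):

```python
import requests

# The route accepts a single "url" field or a multi-line "urls" field;
# one book is registered per non-empty line, duplicates are skipped.
resp = requests.post(
    "http://localhost:5000/init",
    data={"urls": "https://example.com/book/101\nhttps://example.com/book/102"},
)
print(resp.status_code)
```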
@app.route("/hide/<book_idx>", methods=["POST"])
@logcall
def hide_registered_book(book_idx):
# intentionally left disabled
pass
@app.route("/start", methods=["POST"]) @app.route("/start", methods=["POST"])
@logcall
def start_scraping(): def start_scraping():
# 1) Form field: book_idx url = request.form.get("url", "").strip()
book_idx = request.form.get("book_idx")
log(f"[WEB][START] Received start request for book_idx={book_idx}")
if not book_idx:
msg = "book_idx ontbreekt in formulier"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 400
# 2) Fetch book from SQLite
try:
book = fetch_book(book_idx)
log(f"[WEB][START] Fetched book from DB: {book}")
except Exception as e:
log(f"[WEB][START] DB ERROR: {e}")
return jsonify({"status": "error", "message": "DB fout"}), 500
if not book:
msg = f"Boek '{book_idx}' niet gevonden in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 404
# 3) The book must have a URL
url = book.get("book_url")
if not url:
return render_template("result.html", error="Geen URL opgegeven.")
msg = f"Boek '{book_idx}' heeft geen book_url in DB"
log(f"[WEB][START] ERROR: {msg}")
return jsonify({"status": "error", "message": msg}), 500
# 4) Reset UI logs
reset_ui_logs()
# 5) Logging
log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
log_debug(f"[WEB][START] DEBUG: book data = {book}")
log_debug(f"[WEB] Scraping via Celery: {url}")
# 6) Start the Celery controller task
try:
async_result = celery_app.send_task(
"scraper.tasks.controller_tasks.start_full_scrape",
args=[book_idx],
queue="controller",
)
except Exception as e:
log(f"[WEB][START] Celery ERROR: {e}")
return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500
# 7) Successfully dispatched task
log(f"[WEB][START] Task dispatched: {async_result.id}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
books=list_active_books(),
registered=reg,
logs=get_ui_logs(),
)
async_result = celery_app.send_task(
"scraper.tasks.scraping.start_scrape_book",
args=[url],
queue="scraping",
)
return render_template(
"result.html",
scraping_task_id=async_result.id,
)
@app.route("/abort/<book_idx>", methods=["POST"])
@logcall
def abort_download(book_idx):
log_debug(f"[WEB] Abort requested for book: {book_idx}")
set_abort(book_idx)
return jsonify({"status": "ok", "aborted": book_idx})
# =====================================================
# ABORT (per book_id)
# =====================================================
@app.route("/abort/<book_id>", methods=["POST"])
def abort_download(book_id):
log_debug(f"[WEB] Abort requested for book: {book_id}")
set_abort(book_id)
return jsonify({"status": "ok", "aborted": book_id})
# =====================================================
# SECTION 3 — API ROUTES (JSON)
# =====================================================
@app.route("/api/state/all", methods=["GET"])
@logcall
def api_state_all():
"""
Returns the merged SQL + Redis state for all books
(same logic as /debug/inspect_state but JSON-only).
"""
from scraper.utils.state_sync import inspect_books_state
return jsonify(inspect_books_state())
@app.route("/api/books")
@logcall
def api_books():
return jsonify(list_active_books())
@app.route("/api/book/<book_idx>/status")
@logcall
def api_book_status(book_idx):
return jsonify(getStatus(book_idx))
@app.route("/api/book/<book_idx>/logs")
@logcall
def api_book_logs(book_idx):
logs = r.lrange(f"logs:{book_idx}", 0, -1) or []
return jsonify(logs)
@app.route("/progress/<book_idx>")
@logcall
def progress(book_idx):
return jsonify(get_progress(book_idx))
@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
return jsonify({"ready": False})
@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok"})
@app.route("/logs", methods=["GET"])
@logcall
def logs():
try:
last_index = int(request.args.get("last_index", -1))
except:
last_index = -1
all_logs = get_ui_logs() or []
new_lines = []
new_last = last_index
for idx, line in enumerate(all_logs):
if idx > last_index:
new_lines.append(line)
new_last = idx
return jsonify({"lines": new_lines, "last": new_last})
from flask import render_template
from scraper.services.status_check_service import StatusCheckService
from logbus.publisher import log
from db.repository import get_book_state
@app.route("/inspect/statuscheck/<book_idx>", methods=["POST"])
@logcall
def inspect_statuscheck(book_idx):
try:
StatusCheckService.run(book_idx)
return ("", 204) # background action, geen UI
except Exception as e:
log(f"[STATUSCHECK] ERROR book_idx={book_idx}: {e}")
return jsonify({"error": str(e)}), 500
# =====================================================
# PROGRESS (per book_id)
# =====================================================
@app.route("/progress/<book_id>", methods=["GET"])
def progress(book_id):
return jsonify(get_progress(book_id))
# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
results = sync_books_from_redis()
return {"status": "ok", "synced": results}
from scraper.utils.state_sync import inspect_books_state
@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
results = inspect_books_state()
return render_template("debug/inspect_state.html", results=results)
@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
cursor = 0
results = {}
while True:
cursor, keys = r.scan(cursor, match="*", count=200)
for k in keys:
try:
results[k] = r.get(k)
except:
results[k] = "<non-string value>"
if cursor == 0:
break
return jsonify(results)
# =====================================================
# LOGS — GLOBAL UI LOGS
# =====================================================
@app.route("/logs", methods=["GET"])
def logs():
return jsonify({"logs": get_ui_logs()})
# =====================================================
# DB DEBUG
# =====================================================
@app.route("/api/db/books")
@logcall
def api_db_books():
try:
books = fetch_all_books()
return jsonify({"status": "ok", "books": books})
except Exception as e:
return jsonify({"status": "error", "message": str(e)}), 500
# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
from flask import render_template
from urllib.parse import urlparse
import redis
from celery_app import celery_app
@app.route("/debug/queues")
def debug_queues():
insp = celery_app.control.inspect()
workers_active = insp.active() or {}
workers_scheduled = insp.scheduled() or {}
workers_reserved = insp.reserved() or {}
redis_url = os.getenv("REDIS_BROKER")
parsed = urlparse(redis_url)
r2 = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/") or 0),
decode_responses=True,
)
queue_names = ["scraping", "controller", "download", "parse", "save", "audio"]
queues = []
for q in queue_names:
key = f"celery:{q}"
try:
queues.append(
{
"name": q,
"redis_key": key,
"length": r2.llen(key),
"items": r2.lrange(key, 0, 30),
}
)
except Exception as e:
queues.append(
{
"name": q,
"redis_key": key,
"length": "ERR",
"items": [str(e)],
}
)
return render_template(
"debug/queues.html",
queues=queues,
workers_active=workers_active,
workers_reserved=workers_reserved,
workers_scheduled=workers_scheduled,
)
# =====================================================
# CELERY RESULT → return book_id when scraping finishes
# =====================================================
@app.route("/celery-result/<task_id>", methods=["GET"])
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
@logcall
def getStatus(book_idx):
state = r.hgetall(f"book:{book_idx}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_idx
return {
"book_id": book_idx,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": dl_total,
}
return jsonify({"ready": False})
@logcall
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_idx = key[first + 1 : second]
books.append(getStatus(book_idx))
return books
# =====================================================
# SECTION 6 — FLASK RUNNER
# RUN FLASK
# =====================================================
if __name__ == "__main__":
import os
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "5000"))

@@ -1,66 +0,0 @@
#!/usr/bin/env python3
"""
Local macOS Audio Worker runs outside Docker so macOS 'say' works.
"""
import os
import subprocess
from dotenv import load_dotenv
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ENV_FILE = os.path.join(BASE_DIR, ".env")
# Load .env if present
if os.path.exists(ENV_FILE):
load_dotenv(ENV_FILE)
print(f"[AUDIO-LOCAL] Loaded .env from {ENV_FILE}")
else:
print("[AUDIO-LOCAL] WARNING: no .env found")
def main():
print("=====================================================")
print(" LOCAL macOS AUDIO WORKER")
print(" Queue : audio")
print(" Voice :", os.getenv("AUDIO_VOICE"))
print(" Rate :", os.getenv("AUDIO_RATE"))
print("=====================================================")
# ----------------------------------------------------------
# OVERRIDES: Local Redis instead of Docker internal hostname
# ----------------------------------------------------------
broker = os.getenv("REDIS_BROKER_LOCAL", "redis://127.0.0.1:6379/0")
backend = os.getenv("REDIS_BACKEND_LOCAL", "redis://127.0.0.1:6379/1")
os.environ["CELERY_BROKER_URL"] = broker
os.environ["CELERY_RESULT_BACKEND"] = backend
print(f"[AUDIO-LOCAL] Using Redis broker : {broker}")
print(f"[AUDIO-LOCAL] Using Redis backend: {backend}")
# ----------------------------------------------------------
# Celery command
# macOS requires prefork pool, and we use a single-line list.
# ----------------------------------------------------------
cmd = [
"celery",
"-A",
"celery_app",
"worker",
"-Q",
"audio",
"-n",
"audio_local@%h",
"-l",
"INFO",
"--pool=prefork",
"--concurrency=2",
]
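# For reference, the list above is equivalent to running:
#   celery -A celery_app worker -Q audio -n audio_local@%h -l INFO --pool=prefork --concurrency=2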
print("[AUDIO-LOCAL] Launching Celery via subprocess…")
subprocess.run(cmd, check=False)
if __name__ == "__main__":
main()

@@ -5,9 +5,6 @@ from dotenv import load_dotenv
print(">>> [celery_app] Loading .env BEFORE initializing Celery...")
load_dotenv()
from db.db import init_db
init_db() # ensures DB exists for all workers
BROKER = os.getenv("REDIS_BROKER")
BACKEND = os.getenv("REDIS_BACKEND")
@@ -25,34 +22,15 @@ celery_app = Celery(
"scraper.tasks.download_tasks",
"scraper.tasks.parse_tasks",
"scraper.tasks.save_tasks",
# --------------------------------------------------------
# AUDIO TASKS (NEW)
# --------------------------------------------------------
"scraper.tasks.audio_tasks",
],
)
# >>>>> PLACE THIS HERE <<<<<
celery_app.conf.update(
worker_redirect_stdouts_level="WARNING",
task_send_sent_event=False,
resultrepr_maxsize=0,
worker_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
)
# >>>>> UP TO HERE <<<<<
celery_app.conf.task_routes = {
"scraper.tasks.scraping.*": {"queue": "scraping"},
"scraper.tasks.controller_tasks.*": {"queue": "controller"},
"scraper.tasks.download_tasks.*": {"queue": "download"},
"scraper.tasks.parse_tasks.*": {"queue": "parse"},
"scraper.tasks.save_tasks.*": {"queue": "save"},
# ------------------------------------------------------------
# AUDIO ROUTING (NEW)
# ------------------------------------------------------------
"scraper.tasks.audio_tasks.*": {"queue": "audio"},
}
# ------------------------------------------------------------
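As a sketch of how these routes are exercised, the task name below is the controller task dispatched by the web app; the book id is a placeholder:

```python
from celery_app import celery_app

# With task_routes above, this lands on the "controller" queue; passing
# queue= explicitly (as the web app does) has the same effect.
result = celery_app.send_task(
    "scraper.tasks.controller_tasks.start_full_scrape",
    args=["some-book-idx"],
    queue="controller",
)
print(result.id)
```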

@@ -1,160 +0,0 @@
# ============================================================
# File: db/db.py (UPDATED for book_idx-only architecture)
# Purpose:
# Raw SQLite engine for BookScraper.
# - Connection management
# - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write (now uses book_idx)
# - raw fetch helpers
# ============================================================
import os
import sqlite3
from threading import Lock
DB_PATH = os.environ.get("BOOKSCRAPER_DB", "/app/data/books.db")
# Ensure directory exists
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
# Per-process connection cache
_connection_cache = {}
_connection_lock = Lock()
# ------------------------------------------------------------
# Connection handling
# ------------------------------------------------------------
def get_db():
pid = os.getpid()
if pid not in _connection_cache:
with _connection_lock:
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.row_factory = sqlite3.Row
enable_wal_mode(conn)
_connection_cache[pid] = conn
return _connection_cache[pid]
def enable_wal_mode(conn):
conn.execute("PRAGMA journal_mode=DELETE;")
conn.execute("PRAGMA synchronous=NORMAL;")
conn.commit()
# ------------------------------------------------------------
# Schema creation + SAFE schema upgrades
# ------------------------------------------------------------
def init_db():
conn = get_db()
# --------------------------------------------------------
# BASE SCHEMA — book_idx is now PRIMARY KEY
# --------------------------------------------------------
conn.execute(
"""
CREATE TABLE IF NOT EXISTS books (
book_idx INTEGER PRIMARY KEY,
title TEXT,
author TEXT,
description TEXT,
cover_url TEXT,
cover_path TEXT,
book_url TEXT,
chapters_total INTEGER,
status TEXT,
downloaded INTEGER DEFAULT 0,
parsed INTEGER DEFAULT 0,
audio_done INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
processdate DATETIME,
last_update DATETIME
);
"""
)
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE UTILITY
# --------------------------------------------------------
def add_column(name, type_):
try:
conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};")
except:
pass # column already exists
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
# --------------------------------------------------------
# UPGRADE NEW FIELDS — future-proof, matched with Redis state model
# --------------------------------------------------------
# (book_idx already exists as PRIMARY KEY — no need to add)
add_column("description", "TEXT")
add_column("cover_path", "TEXT")
add_column("book_url", "TEXT")
# Download counters
add_column("chapters_download_done", "INTEGER DEFAULT 0")
add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
# Audio counters
add_column("audio_skipped", "INTEGER DEFAULT 0")
# Optional future fields
add_column("audio_total", "INTEGER DEFAULT 0")
conn.commit()
# ------------------------------------------------------------
# WRITE OPERATIONS (book_idx-based UPSERT)
# ------------------------------------------------------------
def upsert_book(book_idx, **fields):
"""
UPSERT by book_idx.
Replaces old upsert that used book_id.
"""
conn = get_db()
keys = ["book_idx"] + list(fields.keys())
values = [book_idx] + list(fields.values())
placeholders = ",".join(["?"] * len(values))
updates = ", ".join([f"{k} = excluded.{k}" for k in fields.keys()])
sql = f"""
INSERT INTO books ({','.join(keys)})
VALUES ({placeholders})
ON CONFLICT(book_idx)
DO UPDATE SET {updates},
last_update = CURRENT_TIMESTAMP;
"""
conn.execute(sql, values)
conn.commit()
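A worked example of the UPSERT above (values are illustrative):

```python
upsert_book(1234, title="Example", status="active")
# builds and executes roughly:
#   INSERT INTO books (book_idx,title,status)
#   VALUES (?,?,?)
#   ON CONFLICT(book_idx)
#   DO UPDATE SET title = excluded.title, status = excluded.status,
#       last_update = CURRENT_TIMESTAMP;
# with parameters [1234, "Example", "active"]
```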
# ------------------------------------------------------------
# RAW READ OPERATIONS
# ------------------------------------------------------------
def _raw_get_book(book_idx):
conn = get_db()
row = conn.execute(
"SELECT * FROM books WHERE book_idx = ?;", (book_idx,)
).fetchone()
return dict(row) if row else None
def _raw_get_all_books():
conn = get_db()
cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;")
return [dict(row) for row in cur.fetchall()]

@@ -1,320 +0,0 @@
# ============================================================
# File: db/repository.py
# Purpose:
# Unified façade for BookScraper database state.
#
# Responsibilities:
# - Route metadata → SQLite
# - Route counters → Redis (live) + SQLite (snapshot)
# - Provide a clean API for tasks and Flask UI
# ============================================================
# ============================================================
# UPDATED — canonical read model via get_book_state
# ============================================================
from scraper.logger_decorators import logcall
from logbus.publisher import log
import redis
import os
# ============================================================
# SQL low-level engines (snapshot storage)
# ============================================================
from db.state_sql import (
sql_fetch_book,
sql_fetch_all_books,
sql_set_status,
sql_set_chapters_total,
sql_register_book,
sql_update_book,
)
# ============================================================
# REDIS low-level engines (live counters)
# ============================================================
from db.state_redis import (
redis_set_status,
redis_set_chapters_total,
redis_inc_download_done,
redis_inc_download_skipped,
redis_inc_parsed_done,
redis_inc_audio_done,
redis_inc_audio_skipped,
)
# ============================================================
# Redis client (read-only for legacy + guards)
# ============================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
_r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================
# LEGACY PROGRESS (UI only, unchanged)
# ============================================================
def _legacy_get_progress(book_idx):
return {
"book_idx": book_idx,
"total": int(_r.get(f"progress:{book_idx}:total") or 0),
"completed": int(_r.get(f"progress:{book_idx}:completed") or 0),
"skipped": int(_r.get(f"progress:{book_idx}:skipped") or 0),
"failed": int(_r.get(f"progress:{book_idx}:failed") or 0),
"abort": _r.exists(f"abort:{book_idx}") == 1,
"failed_list": _r.lrange(f"progress:{book_idx}:failed_list", 0, -1),
}
@logcall
def get_progress(book_idx):
return _legacy_get_progress(book_idx)
# ============================================================
# FETCH (SQLite snapshot)
# ============================================================
@logcall
def fetch_book(book_idx):
return sql_fetch_book(book_idx)
@logcall
def fetch_all_books():
return sql_fetch_all_books()
# ============================================================
# INIT / UPDATE METADATA
# ============================================================
@logcall
def register_book(
book_idx,
title,
author=None,
description=None,
cover_url=None,
cover_path=None,
book_url=None,
):
sql_register_book(
book_idx,
{
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"book_url": book_url,
"chapters_total": 0,
"status": "registered",
},
)
@logcall
def update_book_after_full_scrape(
book_idx,
title=None,
author=None,
description=None,
cover_url=None,
chapters_total=None,
):
fields = {}
if title is not None:
fields["title"] = title
if author is not None:
fields["author"] = author
if description is not None:
fields["description"] = description
if cover_url is not None:
fields["cover_url"] = cover_url
if chapters_total is not None:
fields["chapters_total"] = chapters_total
fields["status"] = "active"
sql_update_book(book_idx, fields)
# ============================================================
# STATUS
# ============================================================
@logcall
def set_status(book_idx, status):
redis_set_status(book_idx, status)
sql_set_status(book_idx, status)
# ============================================================
# TOTALS
# ============================================================
@logcall
def set_chapters_total(book_idx, total):
redis_set_chapters_total(book_idx, total)
sql_set_chapters_total(book_idx, total)
# ============================================================
# COUNTERS — WRITE ONLY
# ============================================================
@logcall
def inc_download_done(book_idx, amount=1):
redis_inc_download_done(book_idx, amount)
@logcall
def inc_download_skipped(book_idx, amount=1):
redis_inc_download_skipped(book_idx, amount)
@logcall
def inc_parsed_done(book_idx, amount=1):
redis_inc_parsed_done(book_idx, amount)
@logcall
def inc_audio_done(book_idx, amount=1):
redis_inc_audio_done(book_idx, amount)
@logcall
def inc_audio_skipped(book_idx, amount=1):
redis_inc_audio_skipped(book_idx, amount)
# ============================================================
# CANONICAL READ MODEL
# ============================================================
@logcall
def get_book_state(book_idx):
"""
Canonical merged read model.
Rules:
- SQL = snapshot baseline
- Redis = live counters
- merged = max(sql, redis)
- capped at chapters_total
"""
sqlite_row = sql_fetch_book(book_idx) or {}
redis_state = _r.hgetall(f"book:{book_idx}:state") or {}
def _int(v):
try:
return int(v)
except Exception:
return 0
chapters_total = _int(sqlite_row.get("chapters_total"))
# SQL snapshot
sql_downloaded = _int(sqlite_row.get("downloaded"))
sql_audio_done = _int(sqlite_row.get("audio_done"))
sql_audio_skipped = _int(sqlite_row.get("audio_skipped"))
# Redis live
redis_downloaded = _int(redis_state.get("chapters_download_done")) + _int(
redis_state.get("chapters_download_skipped")
)
redis_audio_done = _int(redis_state.get("audio_done"))
redis_audio_skipped = _int(redis_state.get("audio_skipped"))
# Merge
merged_downloaded = max(sql_downloaded, redis_downloaded)
merged_audio_done = max(sql_audio_done, redis_audio_done)
merged_audio_skipped = max(sql_audio_skipped, redis_audio_skipped)
if chapters_total > 0:
merged_downloaded = min(merged_downloaded, chapters_total)
merged_audio_done = min(merged_audio_done, chapters_total)
merged_audio_skipped = min(merged_audio_skipped, chapters_total)
audio_completed = merged_audio_done + merged_audio_skipped
# Build state
state = dict(sqlite_row)
state.update(
{
"downloaded": merged_downloaded,
"audio_done": merged_audio_done,
"audio_skipped": merged_audio_skipped,
"chapters_total": chapters_total,
}
)
# Derived status
status = sqlite_row.get("status") or "unknown"
if chapters_total > 0:
if merged_downloaded < chapters_total:
status = "downloading"
elif merged_downloaded == chapters_total and audio_completed < chapters_total:
status = "audio"
elif audio_completed >= chapters_total:
status = "done"
state["status"] = status
return state
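A small worked example of the merge rule above (numbers are illustrative):

```python
chapters_total = 100
sql_downloaded = 40                  # SQLite snapshot
redis_downloaded = 35 + 10           # Redis live: done + skipped
merged = max(sql_downloaded, redis_downloaded)   # -> 45
merged = min(merged, chapters_total)             # capped at the total -> 45
```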
# ============================================================
# READ HELPERS (VIA get_book_state ONLY)
# ============================================================
@logcall
def get_chapters_total(book_idx):
return int(get_book_state(book_idx).get("chapters_total", 0))
@logcall
def get_audio_done(book_idx):
return int(get_book_state(book_idx).get("audio_done", 0))
@logcall
def get_audio_completed_total(book_idx):
state = get_book_state(book_idx)
return int(state.get("audio_done", 0)) + int(state.get("audio_skipped", 0))
# ============================================================
# STATUSCHECK GUARD (INTENTIONAL DIRECT REDIS)
# ============================================================
@logcall
def try_trigger_statuscheck(book_idx):
return bool(_r.set(f"book:{book_idx}:statuscheck:triggered", "1", nx=True))
# ============================================================
# ACTIVE / REGISTERED BOOK LISTS (UI API)
# ============================================================
@logcall
def get_registered_books():
"""
Books visible in the 'registered' list in the UI.
"""
all_books = sql_fetch_all_books()
HIDDEN_STATES = {"hidden"}
return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
@logcall
def get_active_books():
"""
Books currently active in the dashboard.
"""
all_books = sql_fetch_all_books()
HIDDEN_STATES = {"hidden", "done"}
return [b for b in all_books if b.get("status") not in HIDDEN_STATES]
@logcall
def store_m4b_error(book_idx: str, volume: str, error_text: str):
"""
Passive storage of m4b errors.
No logic, no retries, no state transitions.
"""
key = f"book:{book_idx}:m4b:errors"
entry = f"{volume}: {error_text}"
_r.rpush(key, entry)

@@ -1,130 +0,0 @@
# ============================================================
# File: db/state_redis.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level Redis counters/state for BookScraper.
# Used ONLY by db.repository façade.
# ============================================================
import os
import time
import redis
from logbus.publisher import log
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# INTERNAL KEY BUILDER
# ------------------------------------------------------------
def _key(book_idx: str) -> str:
return f"book:{book_idx}:state"
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def redis_set_status(book_idx: str, status: str):
log(f"[DB-REDIS] Setting status for {book_idx} to {status}")
key = _key(book_idx)
r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# TOTAL CHAPTERS
# ------------------------------------------------------------
def redis_set_chapters_total(book_idx: str, total: int):
key = _key(book_idx)
r.hset(key, "chapters_total", total)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# DOWNLOAD COUNTERS
# ------------------------------------------------------------
def redis_inc_download_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_download_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_download_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# PARSE COUNTERS
# ------------------------------------------------------------
def redis_inc_parsed_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing parsed done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "chapters_parsed_done", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# AUDIO COUNTERS
# ------------------------------------------------------------
def redis_inc_audio_done(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "audio_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_idx: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_idx} by {amount}")
key = _key(book_idx)
r.hincrby(key, "audio_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# INITIALISE BOOK STATE
# ------------------------------------------------------------
def init_book_state(book_idx: str, title: str, url: str, chapters_total: int):
"""
Initializes the complete Redis state for a new book.
NOTE:
- If a key already exists, it is NOT reset (progress is preserved).
- Only missing fields are added.
"""
key = f"book:{book_idx}:state"
# Already exists? Then only missing fields are filled in.
exists = r.exists(key)
pipeline = r.pipeline()
# Basis metadata
pipeline.hsetnx(key, "book_id", book_idx)
pipeline.hsetnx(key, "title", title or "")
pipeline.hsetnx(key, "url", url or "")
# State
pipeline.hsetnx(key, "status", "registered")
# Counters
pipeline.hsetnx(key, "chapters_total", chapters_total)
pipeline.hsetnx(key, "chapters_download_done", 0)
pipeline.hsetnx(key, "chapters_download_skipped", 0)
pipeline.hsetnx(key, "chapters_parsed_done", 0)
pipeline.hsetnx(key, "audio_done", 0)
pipeline.hsetnx(key, "audio_skipped", 0)
# Timestamp
pipeline.hset(key, "last_update", int(time.time()))
pipeline.execute()
if exists:
log(f"[DB-REDIS] init_book_state(): UPDATED existing state for {book_idx}")
else:
log(f"[DB-REDIS] init_book_state(): CREATED new state for {book_idx}")

@@ -1,178 +0,0 @@
# ============================================================
# File: db/state_sql.py (UPDATED for book_idx-only architecture)
# Purpose:
# Low-level SQLite snapshot layer for BookScraper metadata.
# Used ONLY through db.repository façade.
# ============================================================
import sqlite3
import os
from logbus.publisher import log
# Must match db/db.py
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/data/books.db")
# ------------------------------------------------------------
# INTERNAL HELPERS
# ------------------------------------------------------------
def _connect():
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
return conn
# ------------------------------------------------------------
# FETCH
# ------------------------------------------------------------
def sql_fetch_book(book_idx):
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
row = cur.fetchone()
conn.close()
return dict(row) if row else None
def sql_fetch_all_books():
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books ORDER BY created_at DESC")
rows = cur.fetchall()
conn.close()
return [dict(r) for r in rows]
# ------------------------------------------------------------
# REGISTER / UPDATE
# ------------------------------------------------------------
def sql_register_book(book_idx, fields: dict):
"""
Insert or replace entire book record.
book_idx is the PRIMARY KEY.
"""
conn = _connect()
cur = conn.cursor()
cols = ", ".join(["book_idx"] + list(fields.keys()))
placeholders = ", ".join(["?"] * (1 + len(fields)))
values = [book_idx] + list(fields.values())
cur.execute(
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})",
values,
)
conn.commit()
conn.close()
def sql_update_book(book_idx, fields: dict):
if not fields:
return
conn = _connect()
cur = conn.cursor()
set_clause = ", ".join([f"{k} = ?" for k in fields])
params = list(fields.values()) + [book_idx]
cur.execute(
f"UPDATE books SET {set_clause} WHERE book_idx = ?",
params,
)
conn.commit()
conn.close()
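A quick example of the dynamic UPDATE above (values are illustrative):

```python
sql_update_book("1234", {"status": "active", "chapters_total": 50})
# executes roughly:
#   UPDATE books SET status = ?, chapters_total = ? WHERE book_idx = ?
# with parameters ["active", 50, "1234"]
```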
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def sql_set_status(book_idx, status: str):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET status = ? WHERE book_idx = ?",
(status, book_idx),
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------
def sql_set_chapters_total(book_idx, total: int):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET chapters_total = ? WHERE book_idx = ?",
(total, book_idx),
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------
def sql_inc_downloaded(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET downloaded = COALESCE(downloaded,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_parsed(book_idx, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET parsed = COALESCE(parsed,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_done(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_done for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_done = COALESCE(audio_done,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()
def sql_inc_audio_skipped(book_idx, amount=1):
log(f"[DB-SQL] Incrementing audio_skipped for {book_idx} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_skipped = COALESCE(audio_skipped,0) + ?
WHERE book_idx = ?
""",
(amount, book_idx),
)
conn.commit()
conn.close()

@@ -1,22 +1,10 @@
services:
# ----------------------------------------------------------
# Redis broker & backend (NO SNAPSHOTS, NO AOF)
# Redis broker & backend
# ----------------------------------------------------------
redis:
image: redis:7
container_name: bookscraper_redis
command:
[
"redis-server",
"--save",
"",
"--appendonly",
"no",
"--stop-writes-on-bgsave-error",
"no",
]
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 2s
@@ -42,8 +30,7 @@ services:
- PYTHONUNBUFFERED=1
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
restart: "no"
# ----------------------------------------------------------
@@ -56,8 +43,7 @@ services:
container_name: bookscraper_web
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on:
redis:
condition: service_healthy
@@ -80,8 +66,7 @@ services:
container_name: worker_download
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on:
redis:
condition: service_healthy
@@ -100,8 +85,7 @@ services:
container_name: worker_parse
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on:
redis:
condition: service_healthy
@@ -120,8 +104,7 @@ services:
container_name: worker_save
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on:
redis:
condition: service_healthy
@@ -131,40 +114,39 @@
restart: "no"
# ----------------------------------------------------------
# Scraping Worker
# Audio Worker (macOS only)
# ----------------------------------------------------------
worker_scraping:
worker_audio:
build:
context: .
dockerfile: docker/Dockerfile.scraper
dockerfile: docker/Dockerfile.audio
container_name: worker_scraping
container_name: worker_audio
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/Desktop/books:/app/output
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on:
redis:
condition: service_healthy
env_file:
- .env
command: celery -A celery_app worker -Q scraping -n scraping@%h -l INFO
command: celery -A celery_app worker -Q audio -n audio@%h -l INFO
restart: "no"
# ----------------------------------------------------------
# M4B Worker (Finalization)
# Scraping Worker
# ----------------------------------------------------------
worker_m4b:
worker_scraping:
build:
context: .
dockerfile: docker/Dockerfile.m4b
dockerfile: docker/Dockerfile.scraper
container_name: worker_m4b
container_name: worker_scraping
command: celery -A celery_app worker -Q m4b -n m4b@%h -l INFO
volumes:
- .:/app
- /Users/peter/Desktop/books:/app/output
depends_on:
redis:
condition: service_healthy
env_file:
- .env
volumes:
- .:/app
- /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
command: celery -A celery_app worker -Q scraping -n scraping@%h -l INFO
restart: "no"

@@ -1,15 +1,9 @@
FROM python:3.12-slim
WORKDIR /app
# Install audio worker dependencies
COPY requirements.audio.txt /app/requirements.audio.txt
RUN pip install --no-cache-dir -r /app/requirements.audio.txt
# Celery is required for the worker
RUN pip install --no-cache-dir celery
# Copy project
COPY . /app
# Start the AUDIO Celery worker
CMD ["python3", "-c", "print('audio worker ready')"]
CMD ["celery", "-A", "celery_app", "worker", "-Q", "audio", "-n", "audio@%h", "-l", "INFO"]

@@ -1,70 +0,0 @@
FROM debian:12
ENV DEBIAN_FRONTEND=noninteractive
# ----------------------------------------------------------
# System + PHP (PHP 8.2 native)
# ----------------------------------------------------------
RUN apt-get update && apt-get install -y \
ffmpeg \
curl \
ca-certificates \
bash \
php-cli \
php-intl \
php-json \
php-mbstring \
php-xml \
php-curl \
php-zip \
python3 \
python3-pip \
python3-venv \
\
# build deps for mp4v2
git \
build-essential \
autoconf \
automake \
libtool \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# ----------------------------------------------------------
# Python venv (PEP 668 compliant)
# ----------------------------------------------------------
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:/usr/local/bin:$PATH"
# ----------------------------------------------------------
# Build & install mp4v2 (mp4info)
# ----------------------------------------------------------
WORKDIR /tmp
RUN git clone https://github.com/sandreas/mp4v2 \
&& cd mp4v2 \
&& ./configure \
&& make -j$(nproc) \
&& make install \
&& echo "/usr/local/lib" > /etc/ld.so.conf.d/mp4v2.conf \
&& ldconfig \
&& cd / \
&& rm -rf /tmp/mp4v2
# ----------------------------------------------------------
# Install m4b-tool
# ----------------------------------------------------------
RUN curl -L https://github.com/sandreas/m4b-tool/releases/latest/download/m4b-tool.phar \
-o /usr/local/bin/m4b-tool \
&& chmod +x /usr/local/bin/m4b-tool
# ----------------------------------------------------------
# App
# ----------------------------------------------------------
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
CMD ["bash"]

@@ -1,31 +1,9 @@
# logbus/publisher.py
import logging
import os
logger = logging.getLogger("logbus")
logger.setLevel(logging.WARNING)
# ============================================================
# FILE LOGGER — log.txt in BOOKSCRAPER_OUTPUT_DIR
# ============================================================
try:
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
os.makedirs(root, exist_ok=True)
file_path = os.path.join(root, "log.txt")
file_handler = logging.FileHandler(file_path, mode="a", encoding="utf-8")
file_formatter = logging.Formatter("%(message)s")  # exactly as input
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)
except Exception:
# Writing to the log file must never crash the app
pass
def log(message: str):
"""
@@ -49,32 +27,3 @@ def log(message: str):
push_ui(message)
except Exception:
pass
# ============================================================
# Delta-based log retrieval using Redis indexes
# ============================================================
def get_ui_logs_delta(last_index: int):
"""
Returns (new_lines, total_count)
Only returns log lines AFTER last_index.
Example:
last_index = 10 returns logs with Redis indexes 11..end
"""
# Determine total lines in buffer
total = r.llen(UI_LOG_KEY)
if total == 0:
return [], 0
# First load OR index invalid → send entire buffer
if last_index < 0 or last_index >= total:
logs = r.lrange(UI_LOG_KEY, 0, -1)
return logs, total
# Only new lines:
new_lines = r.lrange(UI_LOG_KEY, last_index + 1, -1)
return new_lines, total
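A quick worked example of the index handling above (buffer size is illustrative):

```python
# Buffer currently holds 8 lines (Redis indexes 0..7).
new_lines, total = get_ui_logs_delta(4)    # -> lines at indexes 5, 6, 7; total == 8
new_lines, total = get_ui_logs_delta(-1)   # first load -> the whole buffer, total == 8
```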

@@ -1 +0,0 @@
Subproject commit 480a73324f53d0d24bea4931c3902097f8e2a663

@@ -1,208 +0,0 @@
# ============================================================
# File: scraper/progress.py
# Purpose: Track chapter counters for WebGUI progress +
# Book State Model (Redis-backed).
# ============================================================
import os
import time
import redis
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================
# LEGACY PROGRESS FUNCTIONS (KEPT UNTOUCHED)
# ============================================================
# ------------------------------------------------------------
# SET TOTAL
# ------------------------------------------------------------
def set_total(book_id: str, total: int):
r.set(f"progress:{book_id}:total", total)
# ------------------------------------------------------------
# COUNTERS legacy
# ------------------------------------------------------------
def inc_completed(book_id: str):
r.incr(f"progress:{book_id}:completed")
def inc_skipped(book_id: str):
r.incr(f"progress:{book_id}:skipped")
def inc_failed(book_id: str):
r.incr(f"progress:{book_id}:failed")
# ------------------------------------------------------------
# FAILED CHAPTER LIST
# ------------------------------------------------------------
def add_failed_chapter(book_id: str, chapter: int, reason: str):
entry = f"Chapter {chapter}: {reason}"
r.rpush(f"progress:{book_id}:failed_list", entry)
def get_failed_list(book_id: str):
return r.lrange(f"progress:{book_id}:failed_list", 0, -1)
# ------------------------------------------------------------
# READ STRUCT FOR UI (legacy view)
# ------------------------------------------------------------
def get_progress(book_id: str):
total = int(r.get(f"progress:{book_id}:total") or 0)
completed = int(r.get(f"progress:{book_id}:completed") or 0)
skipped = int(r.get(f"progress:{book_id}:skipped") or 0)
failed = int(r.get(f"progress:{book_id}:failed") or 0)
abort = r.exists(f"abort:{book_id}") == 1
failed_list = get_failed_list(book_id)
return {
"book_id": book_id,
"total": total,
"completed": completed,
"skipped": skipped,
"failed": failed,
"failed_list": failed_list,
"abort": abort,
}
# ============================================================
# BOOK STATE MODEL (NEW FUNCTIONS — NO BREAKING CHANGES)
# ============================================================
# ------------------------------------------------------------
# Initialize book state at start of scrape
# ------------------------------------------------------------
def init_book_state(
book_id: str, title: str = "", url: str = "", chapters_total: int = 0
):
key = f"book:{book_id}:state"
now = int(time.time())
r.hset(
key,
mapping={
"book_id": book_id,
"title": title or "",
"url": url or "",
"status": "scraping",
"chapters_total": chapters_total,
"chapters_done": 0,
"chapters_download_skipped": 0,
"audio_total": 0,
"audio_done": 0,
"last_update": now,
},
)
# Track in library list
r.sadd("books", book_id)
# ------------------------------------------------------------
# Status + timestamps
# ------------------------------------------------------------
def set_status(book_id: str, status: str):
key = f"book:{book_id}:state"
r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time()))
def set_last_update(book_id: str):
r.hset(f"book:{book_id}:state", "last_update", int(time.time()))
# ------------------------------------------------------------
# Chapter counters new model
# ------------------------------------------------------------
def set_chapter_total(book_id: str, total: int):
key = f"book:{book_id}:state"
r.hset(key, "chapters_total", total)
set_last_update(book_id)
def inc_chapter_download_skipped(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_download_skipped", 1)
set_last_update(book_id)
def inc_chapter_done(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_download_done", 1)
set_last_update(book_id)
# ------------------------------------------------------------
# Audio counters
# ------------------------------------------------------------
def set_audio_total(book_id: str, total: int):
key = f"book:{book_id}:state"
r.hset(key, "audio_total", total)
set_last_update(book_id)
def inc_audio_done(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "audio_done", 1)
set_last_update(book_id)
def inc_audio_skipped(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "audio_skipped", 1)
set_last_update(book_id)
# ------------------------------------------------------------
# Skip reasons
# ------------------------------------------------------------
def save_skip_reason(book_id: str, chapter: int, reason: str):
"""
Store explicit skip reason for transparency in UI.
"""
r.hset(f"book:{book_id}:skip_reasons", chapter, reason)
set_last_update(book_id)
# ------------------------------------------------------------
# Full state readout
# ------------------------------------------------------------
def get_state(book_id: str):
"""
Read global Book State Model + legacy progress, merged but not mixed.
"""
key = f"book:{book_id}:state"
state = r.hgetall(key) or {}
# Numeric conversions
numeric_fields = [
"chapters_total",
"chapters_download_done",
"chapters_download_skipped",
"audio_total",
"audio_skipped",
"audio_done",
]
for field in numeric_fields:
if field in state:
try:
state[field] = int(state[field])
except ValueError:
pass
# Skip reasons
state["skip_reasons"] = r.hgetall(f"book:{book_id}:skip_reasons") or {}
# Attach legacy progress separately
state["legacy_progress"] = get_progress(book_id)
return state

@@ -1,124 +1,82 @@
import os
import redis
from scraper.logger_decorators import logcall
from scraper.ui_log import push_ui
# ---------------------------------------------------------
# Default Redis connection (Docker workers)
# Redis connection
# ---------------------------------------------------------
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# Debug mode (optional)
ABORT_DEBUG = os.getenv("ABORT_DEBUG", "1") == "1"
# Avoid duplicate spam
_seen_debug_keys = set()
# =========================================================
# INTERNAL DEBUGGING
# =========================================================
def _debug(msg: str):
print(msg)
push_ui(msg)
# ========================================================= # =========================================================
# ABORT FLAG — unified book_idx # ABORT FLAG
# ========================================================= # =========================================================
def set_abort(book_idx: str): def set_abort(book_id: str):
"""Enable abort mode for book_idx."""
key = f"abort:{book_idx}"
r.set(key, "1")
if ABORT_DEBUG:
_debug(f"[ABORT] SET {key}")
def clear_abort(book_idx: str):
"""Clear abort flag."""
key = f"abort:{book_idx}"
r.delete(key)
if ABORT_DEBUG:
_debug(f"[ABORT] CLEAR {key}")
def abort_requested(book_idx: str, redis_client=None) -> bool:
""" """
Check whether abort flag is active for book_idx. Enable abort mode for this book.
All download tasks that haven't started yet will immediately exit.
redis_client:
- Docker workers None use default Redis (r)
- Local macOS audio worker passes Redis(host=127.0.0.1)
""" """
client = redis_client or r r.set(f"abort:{book_id}", "1")
key = f"abort:{book_idx}"
try:
exists = client.exists(key)
if ABORT_DEBUG:
# Log only once per book def clear_abort(book_id: str):
if key not in _seen_debug_keys: """
try: Clear abort flag so future runs are unaffected.
conn = client.connection_pool.connection_kwargs """
host = conn.get("host") r.delete(f"abort:{book_id}")
port = conn.get("port")
db = conn.get("db")
_debug(
# f"[ABORT_DEBUG] first check book_idx={book_idx} "
f"redis={host}:{port} db={db}"
)
except Exception:
_debug(f"[ABORT_DEBUG] first check book_idx={book_idx}")
_seen_debug_keys.add(key)
# Log ACTIVE state
if exists == 1:
_debug(f"[ABORT] ACTIVE for {book_idx}")
return exists == 1
except Exception as e: def abort_requested(book_id: str) -> bool:
if ABORT_DEBUG: """
_debug(f"[ABORT_DEBUG] ERROR checking {key}: {e}") True if abort flag is set for this book.
return False """
return r.exists(f"abort:{book_id}") == 1
# ========================================================= # =========================================================
# PER-CHAPTER STATE — unified book_idx # PER-CHAPTER STATE
# ========================================================= # =========================================================
# We mark a chapter "started" once its download task begins.
# If abort is activated AFTER download start:
# → download must complete
# → parse must complete
# → save must complete
# All subsequent chapters will skip.
def mark_chapter_started(book_idx: str, chapter_num: int): def mark_chapter_started(book_id: str, chapter_num: int):
key = f"started:{book_idx}:{chapter_num}" """
Mark this chapter as started. Parse/save will always run after this,
even if abort has been activated afterwards.
"""
key = f"started:{book_id}:{chapter_num}"
r.set(key, "1") r.set(key, "1")
def chapter_started(book_idx: str, chapter_num: int) -> bool: def chapter_started(book_id: str, chapter_num: int) -> bool:
key = f"started:{book_idx}:{chapter_num}" """
Return True if this chapter has already started downloading.
"""
key = f"started:{book_id}:{chapter_num}"
return r.exists(key) == 1 return r.exists(key) == 1
# ========================================================= # =========================================================
# RESET STATE FOR BOOK_IDX # UTILITY: RESET FOR A BOOK
# ========================================================= # =========================================================
def reset_book_state(book_idx: str): def reset_book_state(book_id: str):
""" """
Remove abort flag and all per-chapter started markers. Optional utility: remove abort flag and all started-chapter markers.
Useful during testing or manual cleanup.
""" """
# abort flag # Remove abort flag
r.delete(f"abort:{book_idx}") r.delete(f"abort:{book_id}")
# chapter markers # Remove all "started:*" keys for this book
pattern = f"started:{book_idx}:*" pattern = f"started:{book_id}:*"
for k in r.scan_iter(pattern): for key in r.scan_iter(pattern):
r.delete(k) r.delete(key)

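A short sketch of the intended abort flow, using the simpler book_id signatures from the feat branch; the book id and chapter number are illustrative only.
```
from scraper.abort import (
    set_abort,
    clear_abort,
    abort_requested,
    mark_chapter_started,
    chapter_started,
)

book_id = "12345"        # illustrative
chapter = 42             # illustrative

set_abort(book_id)       # e.g. triggered from the web UI

# Inside a download task:
if abort_requested(book_id) and not chapter_started(book_id, chapter):
    print("chapter skipped: abort requested before download started")
else:
    mark_chapter_started(book_id, chapter)
    # download, parse and save still run to completion for this chapter

clear_abort(book_id)     # cleanup so the next run is unaffected
```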
@@ -1,55 +1,151 @@

# ---- version on 'main' ----
# ============================================================
# File: scraper/book_scraper.py
# Purpose:
#   Backwards-compatible wrapper giving the SAME public API
#   as the old BookScraper, but internally uses ScrapeEngine.
#
#   execute() → full metadata + chapterlist (NO book_idx creation)
#
#   ID management is now handled exclusively by InitService.
# ============================================================
from scraper.logger_decorators import logcall
from scraper.services.scrape_engine import ScrapeEngine
class BookScraper:
"""
Backwards-compatible BookScraper façade.
Old responsibilities (metadata, chapters, covers, downloads)
are now split:
ScrapeEngine → metadata + chapterlist
Download tasks → handle download/parse/save
InitService → determines book_idx (single source of truth)
This wrapper intentionally does NOT generate a book_idx or book_id.
It only returns metadata/chapters in legacy-compatible dict format.
"""
@logcall
def __init__(self, site_scraper, url: str):
self.site = site_scraper
self.url = url
@logcall
def execute(self):
"""
Legacy public API:
Return metadata + chapter list EXACTLY as before,
but without generating any book_id.
"""
data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)
# Legacy structure preserved, unchanged:
return {
"title": data.get("title"),
"author": data.get("author"),
"description": data.get("description"),
"cover_url": data.get("cover_url"),
"chapters": data.get("chapters", []),
"chapters_total": data.get("chapters_total", 0),
"book_url": data.get("book_url"),  # used later by parse/save tasks
}

# ---- version on 'feat/download-progress-abort' ----
# scraper/book_scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
from scraper.models.book_state import Chapter
class BookScraper:
"""
Minimal scraper: only metadata + chapter list.
The DownloadController handles Celery pipelines for:
- download
- parse
- save
"""
def __init__(self, site, url):
self.site = site
self.url = url
self.book_title = ""
self.book_author = ""
self.book_description = ""
self.cover_url = ""
self.chapter_base = None
self.chapters = []
# Load custom replacements
extra = load_replacements("replacements.txt")
self.site.replacements.update(extra)
# ------------------------------------------------------------
def execute(self):
"""Main entry point. Returns metadata + chapter URLs."""
soup = self._fetch(self.url)
self._parse_title(soup)
self._parse_author(soup)
self._parse_description(soup)
self._parse_cover(soup)
chapter_page = self.get_chapter_page(soup)
self.parse_chapter_links(chapter_page)
log_debug(f"[BookScraper] Completed metadata parse")
return {
"title": self.book_title,
"author": self.book_author,
"description": self.book_description,
"cover_url": self.cover_url,
"book_url": self.url,
"chapters": [
{"num": ch.number, "title": ch.title, "url": ch.url}
for ch in self.chapters
],
}
# ------------------------------------------------------------
def _fetch(self, url):
log_debug(f"[BookScraper] Fetch: {url}")
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.encoding = self.site.encoding
return BeautifulSoup(resp.text, "lxml")
# ------------------------------------------------------------
def _parse_title(self, soup):
h1 = soup.find("h1")
self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
log_debug(f"[BookScraper] Title = {self.book_title}")
def _parse_author(self, soup):
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
self.book_author = raw.split("")[1] if "" in raw else "UnknownAuthor"
log_debug(f"[BookScraper] Author = {self.book_author}")
def _parse_description(self, soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
self.book_description = ""
log_debug("[BookScraper] Description not found")
return
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
self.book_description = clean_text("\n".join(parts), self.site.replacements)
log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
# ------------------------------------------------------------
def _parse_cover(self, soup):
img = soup.find("img", src=lambda v: v and "files/article/image" in v)
if not img:
log_debug("[BookScraper] No cover found")
return
self.cover_url = urljoin(self.site.root, img.get("src"))
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------
def get_chapter_page(self, soup):
"""Return BeautifulSoup of the main chapter list page."""
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
chapter_url = urljoin(self.site.root, href)
# base for chapter links
parts = chapter_url.rsplit("/", 1)
self.chapter_base = parts[0] + "/"
return self._fetch(chapter_url)
# ------------------------------------------------------------
def parse_chapter_links(self, soup):
cont = soup.select_one(self.site.chapter_list_selector)
items = cont.select("ul li a[href]")
self.chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(self.chapter_base, href)
self.chapters.append(Chapter(idx, title, full))
idx += 1
log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")

@@ -1,54 +1,44 @@

# ---- version on 'main' ----
# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters using book_idx
#   Handles:
#     • volume assignment
#     • cover download + replication
#     • script generation
#     • Redis Book State Model init
#     • abort tracking
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen
from logbus.publisher import log
import os
import requests
import shutil
from scraper.abort import abort_requested
from db.state_redis import init_book_state
from db.repository import set_status, set_chapters_total
class DownloadController:
"""
Coordinates all chapter pipelines (download → parse → save).
"""
def __init__(self, book_idx: str, scrape_result: dict):
self.book_idx = str(book_idx)
self.scrape_result = scrape_result
# Metadata
self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url")
# Output folder
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
self.book_base = os.path.join(root, self.title)
os.makedirs(self.book_base, exist_ok=True)
# Meta passed downstream
self.meta = {
"title": self.title,
"author": scrape_result.get("author"),

# ---- version on 'feat/download-progress-abort' ----
# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from logbus.publisher import log
import os
class DownloadController:
"""
Coordinates all chapter pipelines (download → parse → save),
including:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
"""
def __init__(self, book_id: str, scrape_result: dict):
self.book_id = book_id
self.scrape_result = scrape_result
# Core metadata
self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or []
# Output base dir
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
# Volume size
self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
# Base folder for the whole book
self.book_base = os.path.join(root, self.title)
os.makedirs(self.book_base, exist_ok=True)
# Meta passed to parse/save stage
self.meta = {
"title": self.title,
"author": scrape_result.get("author"),

@@ -56,121 +46,52 @@ class DownloadController:

# ---- version on 'main' ----
"book_url": scrape_result.get("book_url"),
}
log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")
# Init Redis Book State Model
try:
init_book_state(
book_id=self.book_idx,
title=self.title,
url=self.meta["book_url"],
chapters_total=len(self.chapters),
)
except Exception as e:
log(f"[CTRL_STATE] init_book_state FAILED: {e}")
# ---------------------------------------------------------
def download_cover(self):
if not self.cover_url:
return log(f"[CTRL] No cover URL for '{self.title}'")
cover_path = os.path.join(self.book_base, "cover.jpg")
headers = {
"User-Agent": "Mozilla/5.0",
"Referer": self.scrape_result.get("book_url") or "",
}
try:
log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status()
with open(cover_path, "wb") as f:
f.write(resp.content)
log(f"[CTRL] Cover saved: {cover_path}")
except Exception as e:
log(f"[CTRL] Cover download failed: {e}")
# ---------------------------------------------------------
def replicate_cover_to_volumes(self):
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
return
for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"):
dst = os.path.join(self.book_base, entry, "cover.jpg")
try:
shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated → {dst}")
except Exception as e:
log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
def store_cover_in_static(self):
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
return
os.makedirs("static/covers", exist_ok=True)
dst = os.path.join("static/covers", f"{self.book_idx}.jpg")
try:
shutil.copyfile(src, dst)
log(f"[CTRL] Cover stored for UI: {dst}")
except Exception as e:
log(f"[CTRL] Failed storing cover: {e}")
# ---------------------------------------------------------
def get_volume_path(self, chapter_num: int) -> str:
vol_index = (chapter_num - 1) // self.max_vol + 1
vol_name = f"Volume_{vol_index:03d}"
vol_path = os.path.join(self.book_base, vol_name)
os.makedirs(vol_path, exist_ok=True)
return vol_path
# ---------------------------------------------------------
# Pipeline launcher
# ---------------------------------------------------------
def start(self):
total = len(self.chapters)
log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")
# Update Redis/SQLite state
try:
set_status(self.book_idx, "downloading")
set_chapters_total(self.book_idx, total)
except Exception as e:
log(f"[CTRL_STATE] Unable to set state: {e}")
# Download cover
self.download_cover()
# Build pipeline tasks
tasks = []
for ch in self.chapters:
num = ch["num"]
chapter_info = {
"num": num,
"url": ch["url"],
"title": ch.get("title"),
"volume_path": self.get_volume_path(num),
}
tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))
async_result = group(tasks).apply_async()
# Replicate cover + place in static
self.replicate_cover_to_volumes()
self.store_cover_in_static()
# Generate scripts (LATE IMPORT to avoid circular)
try:
scriptgen.generate_all_scripts(
self.book_base, self.title, self.meta["author"]
)
log("[CTRL] Scripts generated")
except Exception as e:
log(f"[CTRL] Script generation failed: {e}")
return async_result

# ---- version on 'feat/download-progress-abort' ----
"book_url": scrape_result.get("book_url"),
}
# ---------------------------------------------------------
# Volume isolation
# ---------------------------------------------------------
def get_volume_path(self, chapter_num: int) -> str:
"""Returns the correct volume directory for a chapter."""
vol_index = (chapter_num - 1) // self.max_vol + 1
vol_name = f"Volume_{vol_index:03d}"
vol_path = os.path.join(self.book_base, vol_name)
os.makedirs(vol_path, exist_ok=True)
return vol_path
# ---------------------------------------------------------
def start(self):
total = len(self.chapters)
log(
f"[CTRL] Initialising pipeline for '{self.title}' "
f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
)
log(f"[CTRL] Output root: {self.book_base}")
tasks = []
for ch in self.chapters:
chapter_num = ch["num"]
chapter_url = ch["url"]
volume_path = self.get_volume_path(chapter_num)
tasks.append(
build_chapter_pipeline(
self.book_id,  # ← UUID from scraping.py
chapter_num,
chapter_url,
volume_path,
self.meta,
)
)
async_result = group(tasks).apply_async()
log(
f"[CTRL] Pipelines dispatched for '{self.title}' "
f"(book_id={self.book_id}, group_id={async_result.id})"
)
return async_result

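A minimal sketch of launching the feat-branch controller. The scrape_result dict mirrors what BookScraper.execute() returns on that branch; the title, author and URLs are made up, and calling start() assumes the Redis broker and Celery workers are running.
```
from scraper.download_controller import DownloadController

scrape_result = {
    "title": "Example Book",
    "author": "Example Author",
    "book_url": "https://www.ptwxz.com/bookinfo/1/12345.html",
    "chapters": [
        {"num": 1, "title": "Chapter 1", "url": "https://www.ptwxz.com/html/1/12345/1.html"},
        {"num": 2, "title": "Chapter 2", "url": "https://www.ptwxz.com/html/1/12345/2.html"},
    ],
}

controller = DownloadController("12345", scrape_result)
async_result = controller.start()   # dispatches one download→parse→save pipeline per chapter
print("group id:", async_result.id)
```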
@ -1,27 +0,0 @@
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
# Low-level HTML fetch utility shared by all site scrapers.
# Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
)
}
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
"""
Fetch HTML with a consistent user-agent and encoding.
Returns BeautifulSoup(lxml).
"""
resp = requests.get(url, headers=HEADERS, timeout=timeout)
resp.encoding = encoding
return BeautifulSoup(resp.text, "lxml")
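Example use of this main-branch helper; the URL is illustrative, and GB18030 matches the encoding declared by PiaotianScraper later in this diff.
```
from scraper.engine.fetcher import fetch_html

soup = fetch_html("https://www.ptwxz.com/bookinfo/1/12345.html", encoding="GB18030")
h1 = soup.find("h1")
print(h1.get_text(strip=True) if h1 else "no title found")
```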

@ -1,65 +0,0 @@
# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
"""
Extract ONLY lightweight metadata:
- title
- author
- description
- cover_url
- chapters_total = 0
"""
soup = fetch_html(url, site_scraper.encoding)
title = site_scraper.parse_title(soup)
author = site_scraper.parse_author(soup)
description = site_scraper.parse_description(soup)
cover_url = site_scraper.parse_cover(soup, url)
return {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"book_url": url,
}
def extract_metadata_full(url: str, site_scraper):
"""
Full scraping (metadata + chapterlist).
Used by the scraping Celery pipeline.
"""
soup = fetch_html(url, site_scraper.encoding)
# metadata
meta = extract_metadata_only(url, site_scraper)
# chapter list
chapter_page_url = site_scraper.extract_chapter_page_url(soup)
chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
chapters = site_scraper.parse_chapter_list(chapter_page_soup)
meta["chapters"] = chapters
return meta
def build_book_id(title: str) -> str:
"""
Canonical book_id generator.
SCRAPE currently uses the title as the ID; preserve that behavior.
"""
return title

@ -1,33 +0,0 @@
# ============================================================
# File: scraper/logger_decorators.py
# Purpose: Function-call logging decorator
# ============================================================
from functools import wraps
from scraper.logger import log_debug
def logcall(func):
"""
Decorator: log function name + arguments every time it's called.
Usage: @logcall above any function.
"""
@wraps(func)
def wrapper(*args, **kwargs):
# Name of the function
name = func.__qualname__
# First log line, before execution
# log_debug(f"[CALL] {name} args={args} kwargs={kwargs}")
log_debug(f"[CALL] {name} args={args}")
# log_debug(f"[CALL] {name}")
result = func(*args, **kwargs)
# Log after execution
# log_debug(f"[RETURN] {name} → {result}")
return result
return wrapper
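Example of decorating a function with logcall; build_output_path is a made-up helper, shown only to illustrate what gets logged.
```
from scraper.logger_decorators import logcall

@logcall
def build_output_path(title: str, chapter: int) -> str:
    # hypothetical helper, only here to demonstrate the decorator
    return f"output/{title}/{chapter:04d}.txt"

build_output_path("Example Book", 7)   # writes "[CALL] build_output_path args=(...)" to the debug log
```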

@ -0,0 +1,66 @@
# ============================================================
# File: scraper/progress.py
# Purpose: Track chapter counters for WebGUI progress.
# ============================================================
import os
import redis
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# SET TOTAL
# ------------------------------------------------------------
def set_total(book_id: str, total: int):
r.set(f"progress:{book_id}:total", total)
# ------------------------------------------------------------
# COUNTERS
# ------------------------------------------------------------
def inc_completed(book_id: str):
r.incr(f"progress:{book_id}:completed")
def inc_skipped(book_id: str):
r.incr(f"progress:{book_id}:skipped")
def inc_failed(book_id: str):
r.incr(f"progress:{book_id}:failed")
# ------------------------------------------------------------
# FAILED CHAPTER LIST
# ------------------------------------------------------------
def add_failed_chapter(book_id: str, chapter: int, reason: str):
entry = f"Chapter {chapter}: {reason}"
r.rpush(f"progress:{book_id}:failed_list", entry)
def get_failed_list(book_id: str):
return r.lrange(f"progress:{book_id}:failed_list", 0, -1)
# ------------------------------------------------------------
# READ STRUCT FOR UI
# ------------------------------------------------------------
def get_progress(book_id: str):
total = int(r.get(f"progress:{book_id}:total") or 0)
completed = int(r.get(f"progress:{book_id}:completed") or 0)
skipped = int(r.get(f"progress:{book_id}:skipped") or 0)
failed = int(r.get(f"progress:{book_id}:failed") or 0)
abort = r.exists(f"abort:{book_id}") == 1
failed_list = get_failed_list(book_id)
return {
"book_id": book_id,
"total": total,
"completed": completed,
"skipped": skipped,
"failed": failed,
"failed_list": failed_list,
"abort": abort,
}
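A sketch of how the web UI could turn this structure into a percentage. The book id is illustrative, and counting completed + skipped + failed as "done" is one possible choice, not something the source prescribes.
```
from scraper.progress import get_progress

p = get_progress("12345")                                   # illustrative book id
done = p["completed"] + p["skipped"] + p["failed"]
pct = round(100 * done / p["total"], 1) if p["total"] else 0.0
print(f"{pct}% ({done}/{p['total']}), abort={p['abort']}")
for entry in p["failed_list"]:
    print(" ", entry)
```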

@ -1,37 +0,0 @@
#scraper/replacements/encoding.txt
# --- fix common encoding artifacts ---
\u3000= # IDEOGRAPHIC SPACE → empty
\u00A0= # non-breaking space → empty
# full-width punctuation
=,
。=.
=!
=?
=;
=:
=(
=)
【=[
】=]
《=<
》=>
# hyphen variants
=-
—=-
―=-
\u3000=
\u00A0=
 =
 =
=
—=—
“="
”="
’='
…=…
•=*
▁=
▲=
 =

@ -1,27 +0,0 @@
#scraper/replacements/html.txt
<br>=\n
<br/>=\n
<br />=\n
&nbsp;=
&nbsp&nbsp=
&nbsp&nbsp&nbsp=
&emsp;=
&ensp;=
&thinsp;=
&ldquo;="
&rdquo;="
&lsquo;='
&rsquo;='
&lt;=<
&gt;=>
&copy;=
&reg;=
&trade;=
fontbigbigbig=
fontbigbig=
font1=
font2=
font3=
strongstrong=
divdiv=
spanspan=

@ -1,95 +0,0 @@
#scraper/replacements/junk.txt
# --- Navigation ---
上一章=
下一章=
上一頁=
下一頁=
返回顶部=
返回目录=
返回书页=
章节目录=
章节列表=
快捷键=
(快捷键 ←)=
(快捷键 →)=
(快捷键)=
(快捷键 ←)=
(快捷键 →)=
上一页=
下一页=
手机阅读=
返回=
上一页阅读=
下一页阅读=
# --- Booksite footer disclaimers ---
重要声明=
所有的文字=
均由网友发表=
均由网友上传=
本站立场无关=
阅读更多小说=
返回飘天文学网=
小说阅读网=
最新章节请返回=
永久地址=
All rights reserved=
Copyright=
飘天文学=
=
…=
# --- Piaotia specific ---
请记住本书域名=
请收藏本书=
加入书签=
加入书架=
收藏本书=
推荐本书=
本章未完=
请稍后=
最新网址=
小说网=
小说阅读=
将本书加入书架=
章节出错=
点此举报=
举报原因=
求收藏=
推荐票=
www.piaotia.com=
www.piaotian.com=
www.=
www=
.com=
piaotia=
.net=
piaotian=
www.piaotia.com=
# --- Ads / QR / watermark ---
关注公众号=
微信扫一扫=
扫码阅读=
二维码=
QQ交流群=
加QQ群=
广告=
广告位=
sponsor=
sponsored=
ADVERTISEMENT=
Advertisment=
Adblock=
bookid=
bookname=
# --- Mode / UI related ---
选择背景颜色=
选择字体大小=
繁體中文=
模式选择=
阅读模式=
冲榜=
求票=
诸神学徒=
感谢各位书友的支持=
您的支持就是我们最大的动力=
感谢各位书友的支持,您的支持就是我们最大的动力=

@ -1,147 +0,0 @@
# scraper/scriptgen.py
# Generates scripts (allinone.txt, makebook.txt, say.txt)
# using external templates + dynamic merge generation.
import os
import stat
from logbus.publisher import log
from scraper.logger_decorators import logcall
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
# ------------------------------------------------------------
# Load a template file from scraper/templates/
# ------------------------------------------------------------
def load_template(name: str) -> str:
path = os.path.join(TEMPLATE_DIR, name)
if not os.path.exists(path):
log(f"[SCRIPTGEN] Template missing: {path}")
return ""
with open(path, "r", encoding="utf-8") as f:
return f.read()
# ------------------------------------------------------------
# Detect volumes (Volume_001, Volume_002, ...)
# ------------------------------------------------------------
def detect_volumes(book_base: str):
vols = []
for name in os.listdir(book_base):
p = os.path.join(book_base, name)
if os.path.isdir(p) and name.lower().startswith("volume_"):
try:
num = int(name.split("_")[1])
vols.append((num, name))
except Exception:
continue
vols.sort()
return vols
# ------------------------------------------------------------
# Build the dynamic merge block
# ------------------------------------------------------------
def build_merge_block(title: str, author: str, volumes):
lines = []
# --------------------------------------------------------
# Normalize input (defensive)
# --------------------------------------------------------
title = (title or "").strip()
author = (author or "").strip()
total_vols = len(volumes)
# Padding rule:
# - always at least 2 digits (01, 02)
# - 3 digits when there are >= 100 volumes
if total_vols >= 100:
pad = 3
else:
pad = 2
for num, dirname in volumes:
vol_num = f"{num:0{pad}d}" # for the file name
series_part = f"{num:0{pad}d}" # for the series part (string!)
line = (
f"m4b-tool merge --jobs=4 "
f'--writer="{author}" '
f'--sortalbum="{title}" '
f'--albumartist="{author}" '
f'--album="{title}" '
f'--name="{title}" '
f'--series="{title}" '
f'--series-part="{series_part}" '
f'--output-file="{title}-{vol_num}.m4b" '
f'"{dirname}" -vvv'
)
lines.append(line)
if not lines:
return ""
return " \\\n&& ".join(lines) + "\n"
# ------------------------------------------------------------
# Main generator
# ------------------------------------------------------------
@logcall
def generate_all_scripts(book_base: str, title: str, author: str):
# --------------------------------------------------------
# Defensive normalize
# --------------------------------------------------------
title = (title or "").strip()
author = (author or "").strip()
log(f"[SCRIPTGEN] Generating scripts in {book_base}")
# Load templates
say_template = load_template("say.template")
cleanup_template = load_template("cleanup.template")
volumes = detect_volumes(book_base)
log(f"[SCRIPTGEN] Volumes detected: {volumes}")
merge_block = build_merge_block(title, author, volumes)
# --------------------------------------------------------
# allinone.txt = say + cleanup + merge
# --------------------------------------------------------
outfile = os.path.join(book_base, "allinone.txt")
with open(outfile, "w", encoding="utf-8") as f:
f.write(say_template)
f.write("\n")
f.write(cleanup_template)
f.write("\n")
f.write(merge_block)
os.chmod(outfile, os.stat(outfile).st_mode | stat.S_IEXEC)
log(f"[SCRIPTGEN] Created {outfile}")
# --------------------------------------------------------
# makebook.txt = merge only
# --------------------------------------------------------
outfile2 = os.path.join(book_base, "makebook.txt")
with open(outfile2, "w", encoding="utf-8") as f:
f.write(merge_block)
os.chmod(outfile2, os.stat(outfile2).st_mode | stat.S_IEXEC)
log(f"[SCRIPTGEN] Created {outfile2}")
# --------------------------------------------------------
# say.txt = say + cleanup
# --------------------------------------------------------
outfile3 = os.path.join(book_base, "say.txt")
with open(outfile3, "w", encoding="utf-8") as f:
f.write(say_template)
f.write("\n")
f.write(cleanup_template)
os.chmod(outfile3, os.stat(outfile3).st_mode | stat.S_IEXEC)
log(f"[SCRIPTGEN] Created {outfile3}")
log(f"[SCRIPTGEN] All scripts generated successfully for '{title}'")
__all__ = ["generate_all_scripts"]
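A quick check of what build_merge_block produces for two volumes; the title and author are made up. Each command ends with a trailing backslash and the next begins with "&&", so the block can be pasted into a shell as one chained invocation.
```
from scraper.scriptgen import build_merge_block

volumes = [(1, "Volume_001"), (2, "Volume_002")]
print(build_merge_block("Example Book", "Example Author", volumes))
# m4b-tool merge --jobs=4 --writer="Example Author" ... --output-file="Example Book-01.m4b" "Volume_001" -vvv \
# && m4b-tool merge --jobs=4 --writer="Example Author" ... --output-file="Example Book-02.m4b" "Volume_002" -vvv
```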

@ -1,94 +0,0 @@
# ============================================================
# File: scraper/services/audio_completion.py
# Purpose:
# Orchestration hook after audio completion.
#
# Rules (STRICT):
# - ALWAYS read via get_book_state()
# - Use ONLY merged counters from repository
# - NO usage of derived status field
# - Completion rule:
# audio_completed < chapters_total → NOT DONE
# ============================================================
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_book_state,
try_trigger_statuscheck,
)
from scraper.services.status_check_service import StatusCheckService
from scraper.tasks.m4b_tasks import queue_m4b_for_book
@logcall
def trigger_audio_completion_check(book_idx: str):
"""
Called after inc_audio_done() OR inc_audio_skipped().
Flow:
1. Fetch canonical merged state from repository
2. Evaluate completion via merged counters ONLY
3. Run filesystem validation (authoritative)
4. Apply idempotency guard
5. Queue m4b exactly once
"""
try:
# ----------------------------------------------------
# STEP 1 — CANONICAL MERGED STATE
# ----------------------------------------------------
state = get_book_state(book_idx)
chapters_total = int(state.get("chapters_total", 0))
audio_done = int(state.get("audio_done", 0))
audio_skipped = int(state.get("audio_skipped", 0))
audio_completed = audio_done + audio_skipped
log(
f"[AUDIO-COMPLETION] book={book_idx} "
f"audio_completed={audio_completed} chapters_total={chapters_total}"
)
# ----------------------------------------------------
# STEP 2 — FAST REJECT (MERGED COUNTERS ONLY)
# ----------------------------------------------------
if chapters_total <= 0 or audio_completed < chapters_total:
log(f"[AUDIO-COMPLETION] not yet complete for book={book_idx}")
return
# ----------------------------------------------------
# STEP 3 — FILESYSTEM VALIDATION (AUTHORITATIVE)
# ----------------------------------------------------
result = StatusCheckService.run(book_idx)
fs = result.get("filesystem", {})
audio_files = fs.get("audio_files", 0)
chapters_txt = fs.get("chapters_txt", 0)
effective_audio = audio_files + audio_skipped
if effective_audio < chapters_txt:
log(
f"[AUDIO-COMPLETION] FS validation failed "
f"(audio_files={audio_files}, skipped={audio_skipped}, txt={chapters_txt})"
)
return
# ----------------------------------------------------
# STEP 4 — IDEMPOTENCY GUARD (AFTER FS CONFIRMATION)
# ----------------------------------------------------
if not try_trigger_statuscheck(book_idx):
log(f"[AUDIO-COMPLETION] statuscheck already triggered for {book_idx}")
return
# ----------------------------------------------------
# STEP 5 — FINAL ACTION
# ----------------------------------------------------
log(f"[AUDIO-COMPLETION] DONE → queue m4b for book={book_idx}")
queue_m4b_for_book(book_idx)
except Exception as exc:
# MUST NEVER break audio workers
log(f"[AUDIO-COMPLETION][ERROR] book={book_idx} error={exc}")

@ -1,45 +0,0 @@
# ============================================================
# File: scraper/services/cover_service.py
# ============================================================
import os
import requests
from logbus.publisher import log
from typing import Optional
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> Optional[str]:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.
"""
if not cover_url:
log(f"[COVER] No cover URL for book={book_id}")
return None
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst_path = os.path.join(static_dir, f"{book_id}.jpg")
try:
log(f"[COVER] Downloading: {cover_url}")
resp = requests.get(
cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
)
resp.raise_for_status()
with open(dst_path, "wb") as f:
f.write(resp.content)
log(f"[COVER] Stored: {dst_path}")
return dst_path
except Exception as e:
log(f"[COVER] FAILED ({cover_url}) → {e}")
return None

@ -1,95 +0,0 @@
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_idx
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
from scraper.logger_decorators import logcall
class InitService:
# ------------------------------------------------------------
# BOOK IDX DERIVATION
# ------------------------------------------------------------
@staticmethod
@logcall
def derive_book_id(url: str) -> str:
"""
PTWXZ URL format ends with /{id}.html.
If no match fallback to sanitized URL.
Returns:
book_idx (string)
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
# Fallback — ensures deterministic ID for unknown formats
return url.replace("/", "_").replace(":", "_")
# ------------------------------------------------------------
# MAIN INIT FLOW
# ------------------------------------------------------------
@staticmethod
@logcall
def execute(url: str) -> dict:
"""
INIT entry point.
Returns complete metadata + registration result.
"""
# 1) Resolve site handler
site = SiteResolver.resolve(url)
# 2) Create unified book_idx
book_idx = InitService.derive_book_id(url)
# Some site objects historically expect .book_id — we support it but DO NOT rely on it.
site.book_id = book_idx
# 3) Fetch initial metadata (title/author/description/cover)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
author = meta.get("author")
description = meta.get("description")
cover_url = meta.get("cover_url")
# 4) Download & store main cover for UI
cover_path = CoverService.download_main_cover(cover_url, book_idx)
# 5) Register in SQLite (book_idx is the SOLE primary ID)
register_book(
book_idx=book_idx,
title=title,
author=author,
description=description,
cover_url=cover_url,
cover_path=cover_path,
book_url=url,
)
# 6) Return metadata for UI / API
return {
"book_idx": book_idx,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"status": "registered",
}
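The ID derivation above, applied to a typical PTWXZ URL and to a non-matching URL (both URLs are illustrative).
```
from scraper.services.init_service import InitService

print(InitService.derive_book_id("https://www.ptwxz.com/bookinfo/12/12345.html"))
# → 12345
print(InitService.derive_book_id("https://example.com/book?id=9"))
# → https___example.com_book?id=9   (fallback: "/" and ":" replaced by "_")
```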

@ -1,286 +0,0 @@
# ============================================================
# File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose:
# Unified scraping engine for INIT-flow and Celery tasks.
# ScrapeEngine does NOT determine book_idx itself.
# ============================================================
import os
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from logbus.publisher import log
from scraper.logger import log_debug
from scraper.logger_decorators import logcall
from scraper.utils.utils import load_replacements
class ScrapeEngine:
"""
Central scraping engine.
Metadata + chapterlist scraping.
All methods logged with @logcall.
IMPORTANT:
- ScrapeEngine NEVER decides book_idx.
- No dependency on InitService (prevents circular import).
"""
# ------------------------------------------------------------
# REPLACEMENTS LOADER
# ------------------------------------------------------------
@staticmethod
@logcall
def _apply_replacements(site):
fp = os.path.join(os.getcwd(), "replacements.txt")
extra = load_replacements(fp)
if not hasattr(site, "replacements"):
site.replacements = {}
site.replacements.update(extra)
return True
# ------------------------------------------------------------
# RATE LIMITER
# ------------------------------------------------------------
MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
@staticmethod
@logcall
def _throttle(last_time=[0]):
now = time.time()
elapsed = now - last_time[0]
if elapsed < ScrapeEngine.MIN_DELAY:
time.sleep(ScrapeEngine.MIN_DELAY - elapsed)
last_time[0] = time.time()
return True
# ------------------------------------------------------------
# HTTP GET
# ------------------------------------------------------------
@staticmethod
@logcall
def _get_doc(url: str, site):
attempt = 1
while True:
ScrapeEngine._throttle()
log_debug(f"[SCRAPER] GET {url} (attempt {attempt})")
try:
resp = requests.get(
url,
headers={"User-Agent": "Mozilla/5.0"},
timeout=10,
)
except Exception as e:
log_debug(f"Network error {e} → retry {attempt + 1}s")
time.sleep(attempt + 1)
attempt += 1
continue
code = resp.status_code
if code == 200:
resp.encoding = getattr(site, "encoding", "utf-8")
return BeautifulSoup(resp.text, "lxml")
if code == 429:
cooldown = 60
log_debug("429 detected — cooldown 60s")
for i in range(cooldown, 0, -1):
log_debug(f" cooldown {i}s…")
time.sleep(1)
attempt += 1
continue
if code in (403, 500):
wait = min(5 * attempt, 30)
log_debug(f"HTTP {code} → retry in {wait}s")
time.sleep(wait)
attempt += 1
continue
wait = attempt + 1
log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
time.sleep(wait)
attempt += 1
# ------------------------------------------------------------
# PARSER HELPERS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_title(soup):
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownTitle"
@staticmethod
@logcall
def _parse_author(soup):
td = soup.find("td", string=lambda t: t and "" in t)
if td and "" in td.get_text():
return td.get_text(strip=True).split("")[1]
return "UnknownAuthor"
@staticmethod
@logcall
def _parse_description(soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
txt = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if txt:
parts.append(txt)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSER (NO InitService dependency)
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_cover(soup, site):
"""
Extract book index from URL heuristically instead of InitService
(prevents circular import).
"""
# Typical Chinese novel sites embed numeric ID in URL path
try:
parsed = urlparse(site.url)
digits = re.findall(r"\d+", parsed.path)
book_idx = digits[-1] if digits else None
except Exception:
book_idx = None
imgs = soup.find_all("img", src=True)
candidates = []
for img in imgs:
src = img["src"].strip()
filename = os.path.basename(src)
if book_idx and book_idx in filename:
candidates.append((filename, src))
if not candidates:
return None
candidates.sort(key=lambda t: len(t[0])) # smallest filename
return urljoin(site.root, candidates[0][1])
# ------------------------------------------------------------
# RESOLVE CHAPTER PAGE
# ------------------------------------------------------------
@staticmethod
@logcall
def _resolve_chapter_page(soup, site):
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
if not node:
raise ValueError("Could not locate chapter list base node")
href = node.select_one("a").get("href")
url = urljoin(site.root, href)
parsed = urlparse(url)
basepath = parsed.path.rsplit("/", 1)[0] + "/"
chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}"
return url, chapter_base
# ------------------------------------------------------------
# PARSE CHAPTER LINKS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_chapter_links(soup, chapter_base, selector):
cont = soup.select_one(selector)
if not cont:
return []
items = cont.select("ul li a[href]")
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(chapter_base, href)
chapters.append({"num": idx, "title": title, "url": full})
idx += 1
return chapters
# ============================================================
# PUBLIC APIS
# ============================================================
@staticmethod
@logcall
def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url # needed for cover parsing
return {
"title": ScrapeEngine._parse_title(soup),
"author": ScrapeEngine._parse_author(soup),
"description": ScrapeEngine._parse_description(soup),
"cover_url": ScrapeEngine._parse_cover(soup, site),
"book_url": url,
}
@staticmethod
@logcall
def fetch_metadata_and_chapters(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url
title = ScrapeEngine._parse_title(soup)
author = ScrapeEngine._parse_author(soup)
desc = ScrapeEngine._parse_description(soup)
cover = ScrapeEngine._parse_cover(soup, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
chapters = ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
return {
"title": title,
"author": author,
"description": desc,
"cover_url": cover,
"chapters": chapters,
"chapters_total": len(chapters),
"book_url": url,
}
@staticmethod
@logcall
def fetch_chapterlist(site, url: str):
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
return ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
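A sketch of a full metadata + chapter-list scrape through the engine, resolving the site the same way InitService does. The URL is illustrative, and the call performs real, throttled HTTP requests.
```
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine

url = "https://www.ptwxz.com/bookinfo/12/12345.html"   # illustrative
site = SiteResolver.resolve(url)
data = ScrapeEngine.fetch_metadata_and_chapters(site, url)
print(data["title"], data["author"], data["chapters_total"])
```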

@ -1,20 +0,0 @@
# ============================================================
# File: scraper/services/site_resolver.py
# Purpose:
# Determine which BookSite implementation applies for a given URL.
# This keeps INIT-flow and SCRAPE-flow site-agnostic.
# ============================================================
from scraper.sites import BookSite # current PTWXZ implementation
class SiteResolver:
"""
Resolves the correct BookSite class based on URL.
Currently only PTWXZ/Piaotian is supported.
"""
@staticmethod
def resolve(url: str):
# Later: add more domain rules for other sources
return BookSite()

@ -1,135 +0,0 @@
# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
#   Manual, idempotent status check per book.
#
#   Based on the filesystem it determines:
#     - the number of downloaded chapters (.txt)
#     - the number of generated audio files (.m4b)
#
#   and writes this validated state to SQL.
#
#   NOTE:
#   - No Redis
#   - No Celery
#   - No status transitions
#   - No pipeline logic
# ============================================================
import os
from datetime import datetime
from typing import Dict, Any
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.state_sql import sql_fetch_book, sql_update_book
class StatusCheckService:
"""
Statuscheck op basis van filesystem.
Single source of truth = disk.
"""
@staticmethod
@logcall
def run(book_idx: str) -> Dict[str, Any]:
"""
Run the status check for a single book.
Returns an inspectable dict with:
- filesystem counts
- SQL before / after snapshot
"""
# ----------------------------------------------------
# 1. SQL fetch (does the book exist?)
# ----------------------------------------------------
sql_before = sql_fetch_book(book_idx)
if not sql_before:
raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")
# ----------------------------------------------------
# 2. Determine the filesystem root
# ----------------------------------------------------
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
title = sql_before.get("title")
book_dir = os.path.join(output_root, title)
if not os.path.isdir(book_dir):
log(
f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}')"
)
chapters_txt = 0
audio_files = 0
volumes = 0
else:
chapters_txt = 0
audio_files = 0
volumes = 0
# ------------------------------------------------
# 3. Scan volumes
# ------------------------------------------------
for entry in os.listdir(book_dir):
if not entry.lower().startswith("volume_"):
continue
volumes += 1
volume_path = os.path.join(book_dir, entry)
if not os.path.isdir(volume_path):
continue
# ---- TXT chapters ----
for fname in os.listdir(volume_path):
if fname.lower().endswith(".txt"):
chapters_txt += 1
# ---- Audio ----
audio_dir = os.path.join(volume_path, "Audio")
if os.path.isdir(audio_dir):
for fname in os.listdir(audio_dir):
if fname.lower().endswith(".m4b"):
audio_files += 1
# ----------------------------------------------------
# 4. SQL update (snapshot)
# ----------------------------------------------------
now = datetime.utcnow().isoformat(timespec="seconds")
update_fields = {
"downloaded": chapters_txt,
"audio_done": audio_files,
"last_update": now,
}
sql_update_book(book_idx, update_fields)
sql_after = sql_fetch_book(book_idx)
# ----------------------------------------------------
# 5. Result for inspect/debug
# ----------------------------------------------------
result = {
"book_idx": book_idx,
"filesystem": {
"book_dir": book_dir,
"exists": os.path.isdir(book_dir),
"volumes": volumes,
"chapters_txt": chapters_txt,
"audio_files": audio_files,
},
"sql_before": sql_before,
"sql_after": sql_after,
"notes": [],
}
log(
f"[STATUSCHECK] book_idx={book_idx} "
f"chapters={chapters_txt} audio={audio_files}"
)
return result
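A sketch of running the manual status check; the book_idx is illustrative and the call assumes the book is already registered in SQLite.
```
from scraper.services.status_check_service import StatusCheckService

result = StatusCheckService.run("12345")          # illustrative book_idx
fs = result["filesystem"]
print(f"{fs['volumes']} volumes, {fs['chapters_txt']} txt, {fs['audio_files']} m4b")
print("SQL after update:", result["sql_after"])
```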

@ -1,28 +0,0 @@
# ============================================================
# File: scraper/sites/__init__.py
# Purpose:
# Site autodetection based on URL.
# ============================================================
from scraper.sites.piaotian import PiaotianScraper
def get_scraper_for_url(url: str):
"""
Return the correct scraper instance for a given URL.
Later: add more site implementations.
"""
if "ptwxz" in url or "piaotian" in url:
return PiaotianScraper()
raise ValueError(f"No scraper available for URL: {url}")
# ============================================================
# Backwards-compatibility export for legacy BookScraper
# ============================================================
# Old code expects:
# from scraper.sites import BookSite
# We map that to our new PiaotianScraper implementation.
BookSite = PiaotianScraper

@ -1,52 +0,0 @@
# ============================================================
# File: scraper/sites/base.py
# Purpose:
# Abstract interface that every site-specific scraper must implement.
# ============================================================
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from typing import Optional
class SiteScraper(ABC):
"""
Defines the interface for site-specific scrapers.
Each concrete scraper (Piaotian, Biquge, etc.) must implement these.
"""
@property
@abstractmethod
def root(self) -> str: ...
@property
@abstractmethod
def encoding(self) -> str: ...
@property
@abstractmethod
def chapter_list_selector(self) -> str: ...
# --------------------------
# Metadata extraction
# --------------------------
@abstractmethod
def parse_title(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_author(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: ...
# --------------------------
# Chapter extraction
# --------------------------
@abstractmethod
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_chapter_list(self, soup: BeautifulSoup) -> list: ...

@ -1,121 +0,0 @@
# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import Optional
class PiaotianScraper(SiteScraper):
root = "https://www.ptwxz.com"
encoding = "GB18030"
chapter_list_selector = "div.centent"
# ------------------------------------------------------------
# METADATA PARSING
# ------------------------------------------------------------
def parse_title(self, soup: BeautifulSoup) -> str:
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownBook"
def parse_author(self, soup: BeautifulSoup) -> str:
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
return raw.split("")[1] if "" in raw else "UnknownAuthor"
def parse_description(self, soup: BeautifulSoup) -> str:
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
# stop when next <span> reappears
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSING
# (exactly your BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:
return None
book_id = m.group(1)
# Extract vol (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", url)
volume = m2.group(1) if m2 else None
imgs = soup.find_all("img", src=True)
chosen = None
# Priority 1: match "/files/article/image/{vol}/{book_id}/"
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
break
# Priority 2: endswith "/{book_id}s.jpg"
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
break
if not chosen:
return None
return urljoin(self.root, chosen)
# ------------------------------------------------------------
# CHAPTER EXTRACTION
# ------------------------------------------------------------
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
return urljoin(self.root, href)
def parse_chapter_list(self, soup: BeautifulSoup) -> list:
cont = soup.select_one(self.chapter_list_selector)
items = cont.select("ul li a[href]") if cont else []
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full_url = urljoin(self.root, href)
chapters.append({"num": idx, "title": title, "url": full_url})
idx += 1
return chapters

@ -1,7 +0,0 @@
# scraper/state.py
import os
import redis
REDIS_STATE_URL = os.getenv("REDIS_STATE", "redis://redis:6379/2")
state = redis.Redis.from_url(REDIS_STATE_URL, decode_responses=True)

@ -0,0 +1,10 @@
# tasks/audio.py
from celery import shared_task
from logbus.publisher import log
@shared_task(bind=True, queue="audio")
def text_to_audio(self, text_file):
log(f"[AUDIO] converting: {text_file}")
# placeholder for macOS "say"
return True

@ -1,220 +0,0 @@
# ============================================================
# File: scraper/tasks/audio_tasks.py
# Purpose: Convert chapter text files into audio using macOS
# “say”, with Redis-based slot control.
# Updated: now uses db.repository for audio counters.
# ============================================================
from celery_app import celery_app
from logbus.publisher import log
import os
import subprocess
import time
import socket
from scraper.abort import abort_requested
from scraper.logger_decorators import logcall
from redis import Redis
from urllib.parse import urlparse
from scraper.services.audio_completion import trigger_audio_completion_check
# NEW — unified repository façade
from db.repository import (
inc_audio_done,
inc_audio_skipped,
)
HOST = socket.gethostname()
# ------------------------------------------------------------
# REDIS CLIENT SETUP
# ------------------------------------------------------------
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# Slot locking Redis client
redis_client = Redis(
host=parsed.hostname,
port=parsed.port,
db=parsed.path.strip("/"),
)
# Abort + global progress flags always live in DB 0
backend_client = Redis(
host=parsed.hostname,
port=parsed.port,
db=0,
)
# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300"))
AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi")
AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output")
CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
# ============================================================
# CELERY TASK
# ============================================================
@celery_app.task(bind=True, queue="audio", ignore_result=True)
@logcall
def generate_audio(
self, book_id, volume_name, chapter_number, chapter_title, chapter_path
):
"""
chapter_path: absolute container path to chapter text file.
"""
log(f"[AUDIO]({HOST}) CH{chapter_number}: START → {chapter_title}")
# ------------------------------------------------------------
# ABORT CHECK
# ------------------------------------------------------------
if abort_requested(book_id, backend_client):
inc_audio_skipped(book_id)
log(f"[AUDIO]({HOST}) ABORT detected → skip CH{chapter_number}")
return
# ------------------------------------------------------------
# ACQUIRE SLOT
# ------------------------------------------------------------
slot_key = None
ttl = AUDIO_TIMEOUT + 15
for i in range(1, AUDIO_SLOTS + 1):
key = f"audio_slot:{i}"
if redis_client.set(key, "1", nx=True, ex=ttl):
slot_key = key
log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}")
break
# Need to wait
if slot_key is None:
log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting…")
start_wait = time.time()
while slot_key is None:
# Try all slots again
for i in range(1, AUDIO_SLOTS + 1):
key = f"audio_slot:{i}"
if redis_client.set(key, "1", nx=True, ex=ttl):
slot_key = key
log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait")
break
# If still no slot
if not slot_key:
if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
inc_audio_skipped(book_id)
return
if time.time() - start_wait > ttl:
log(f"[AUDIO] CH{chapter_number}: Wait timeout → abort audio")
inc_audio_skipped(book_id)
return
time.sleep(0.25)
# ------------------------------------------------------------
# PATH NORMALISATION
# ------------------------------------------------------------
container_path = chapter_path
if not container_path:
log(f"[AUDIO] CH{chapter_number}: ERROR — no input file path provided")
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return
# Strip container prefix so that host path is resolvable
if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX):
relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/")
else:
relative_path = container_path
parts = relative_path.split("/")
if len(parts) < 3:
log(
f"[AUDIO] CH{chapter_number}: ERROR — cannot parse book/volume from {relative_path}"
)
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return
# book_from_path = parts[0] # volume_name passed explicitly anyway
# volume_from_path = parts[1]
host_path = os.path.join(HOST_PATH, relative_path)
# ------------------------------------------------------------
# OUTPUT DIRECTORY
# ------------------------------------------------------------
base_dir = os.path.join(HOST_PATH, parts[0], parts[1], "Audio")
os.makedirs(base_dir, exist_ok=True)
safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4b")
# Skip if audio already exists
if os.path.exists(audio_file):
log(f"[AUDIO] CH{chapter_number}: Already exists → skip")
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
trigger_audio_completion_check(book_id)
return
# ------------------------------------------------------------
# BUILD TTS COMMAND
# ------------------------------------------------------------
cmd = (
f"say --voice={AUDIO_VOICE} "
f"--input-file='{host_path}' "
f"--output-file='{audio_file}' "
f"--file-format=m4bf "
f"--quality=127 "
f"-r {AUDIO_RATE} "
f"--data-format=aac"
)
log(f"[AUDIO]({HOST}) CH{chapter_number} → output: {audio_file}")
# ------------------------------------------------------------
# EXECUTE
# ------------------------------------------------------------
try:
subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
# NEW — repository façade
inc_audio_done(book_id)
trigger_audio_completion_check(book_id)
log(f"trigger_audio_completion_check ")
log(f"[AUDIO]({HOST}) CH{chapter_number}: Completed")
except subprocess.TimeoutExpired:
log(f"[AUDIO]({HOST}) CH{chapter_number}: TIMEOUT → removing file")
if os.path.exists(audio_file):
try:
os.remove(audio_file)
except Exception:
pass
inc_audio_skipped(book_id)
except subprocess.CalledProcessError as e:
log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}")
inc_audio_skipped(book_id)
except Exception as e:
log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}")
inc_audio_skipped(book_id)
finally:
if slot_key:
redis_client.delete(slot_key)
log(f"[AUDIO] CH{chapter_number}: Released slot")

@ -1,167 +1,95 @@
# ============================================================ # ============================================================
# File: scraper/tasks/controller_tasks.py # File: scraper/tasks/controller_tasks.py
# Purpose: # Purpose:
# FULL scrape entrypoint + launching download/parse/save pipelines. # Start the download → parse → save pipeline for a scraped book,
# NO result.get() anywhere. Scraping is done inline. # including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from logbus.publisher import log from logbus.publisher import log
import os
import time
import redis
from urllib.parse import urlparse
from scraper.logger_decorators import logcall
from scraper.abort import abort_requested
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.site_resolver import SiteResolver
from db.repository import fetch_book, set_chapters_total
from scraper.download_controller import DownloadController from scraper.download_controller import DownloadController
from scraper.progress import (
set_total,
inc_completed,
inc_skipped,
inc_failed,
)
from scraper.abort import abort_requested
print(">>> [IMPORT] controller_tasks.py loaded") print(">>> [IMPORT] controller_tasks.py loaded")
# ============================================================= @celery_app.task(bind=True, queue="controller", ignore_result=False)
# 1) PUBLIC ENTRYPOINT — CALLED FROM /start def launch_downloads(self, book_id: str, scrape_result: dict):
# =============================================================
@celery_app.task(
bind=True,
queue="controller",
ignore_result=False,
name="scraper.tasks.controller_tasks.start_full_scrape",
)
@logcall
def start_full_scrape(self, book_idx: str):
"""
FULL SCRAPE ENTRYPOINT.
Scraping is done inline; no Celery .get() needed.
""" """
Launch the entire pipeline (download → parse → save),
AND maintain progress counters.
log(f"[CTRL] start_full_scrape(book_idx={book_idx})") EXPECTS:
book_id: ID generated in scraping.start_scrape_book
# Abort before doing anything scrape_result: dict with title, author, url, chapters[]
if abort_requested(book_idx):
log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
# --------------------------------------------------------
# 1) Load book metadata from SQLite
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
msg = f"[CTRL] Book '{book_idx}' not found in DB"
log(msg)
raise ValueError(msg)
url = book.get("book_url")
if not url:
msg = f"[CTRL] No book_url stored for {book_idx}"
log(msg)
raise ValueError(msg)
# --------------------------------------------------------
# 2) INLINE SCRAPE (fast, no Celery wait)
# --------------------------------------------------------
site = SiteResolver.resolve(url)
try:
scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
except Exception as e:
log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
raise
# --------------------------------------------------------
# 3) Continue → dispatch pipelines
# --------------------------------------------------------
return launch_downloads(book_idx, scrape_result)
# =============================================================
# 2) PIPELINE DISPATCH (NOT a Celery task)
# =============================================================
@logcall
def launch_downloads(book_idx: str, scrape_result: dict):
"""
Launches the entire processing pipeline:
- initialize Redis UI state
- initialize SQLite totals
- dispatch per-chapter pipelines via DownloadController
""" """
title = scrape_result.get("title", "UnknownBook") title = scrape_result.get("title", "UnknownBook")
chapters = scrape_result.get("chapters", []) or [] chapters = scrape_result.get("chapters", []) or []
total = len(chapters) total = len(chapters)
# ------------------------------------------------------------ log(f"[CTRL] Book '{title}' → {total} chapters (book_id={book_id})")
# INIT REDIS STATE
# ------------------------------------------------------------
broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
parsed = urlparse(broker_url)
r = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/")),
decode_responses=True,
)
base = f"book:{book_idx}:state"
r.hset(base, "title", title)
r.hset(base, "status", "starting")
r.hset(base, "chapters_total", total)
r.hset(base, "chapters_download_done", 0)
r.hset(base, "chapters_download_skipped", 0)
r.hset(base, "chapters_parsed_done", 0)
r.hset(base, "audio_done", 0)
r.hset(base, "audio_skipped", 0)
r.hset(base, "last_update", int(time.time()))
# ------------------------------------------------------------ # ------------------------------------------------------------
# INIT SQLITE SNAPSHOT # INIT PROGRESS
# ------------------------------------------------------------ # ------------------------------------------------------------
try: set_total(book_id, total)
set_chapters_total(book_idx, total) log(f"[CTRL] Progress initialized for {book_id}: total={total}")
except Exception as e:
log(f"[CTRL] ERROR updating SQLite totals: {e}")
raise
log(f"[CTRL] Initialized totals for {book_idx}: {total}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# ABORT CHECK BEFORE LAUNCHING JOBS # BUILD CONTROLLER
# ------------------------------------------------------------ # ------------------------------------------------------------
if abort_requested(book_idx): ctl = DownloadController(book_id, scrape_result)
log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
r.hset(base, "status", "aborted")
return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}
# ------------------------------------------------------------ # ------------------------------------------------------------
# BUILD + DISPATCH PER-CHAPTER PIPELINES # RUN PIPELINE IN SYNC LOOP
# (DownloadController.start() returns per-chapter generator)
# ------------------------------------------------------------ # ------------------------------------------------------------
controller = DownloadController(book_idx, scrape_result)
try: try:
group_result = controller.start() for result in ctl.start(): # new generator mode
gid = getattr(group_result, "id", None) ch = result.get("chapter")
log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
except Exception as e: if result.get("skipped"):
log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}") inc_skipped(book_id)
inc_completed(book_id)
log(f"[CTRL] SKIPPED chapter {ch}")
continue
if result.get("failed"):
inc_failed(book_id)
inc_completed(book_id)
log(f"[CTRL] FAILED chapter {ch}")
continue
# Normal success
inc_completed(book_id)
log(f"[CTRL] DONE chapter {ch}")
# Abort requested mid-run?
if abort_requested(book_id):
log(f"[CTRL] ABORT after chapter {ch}")
break
except Exception as exc:
log(f"[CTRL] ERROR while processing pipeline: {exc}")
inc_failed(book_id)
raise raise
# Update UI state to "downloading" # ------------------------------------------------------------
r.hset(base, "status", "downloading") # FINISHED
r.hset(base, "last_update", int(time.time())) # ------------------------------------------------------------
log(f"[CTRL] Pipeline finished for book_id={book_id}")
return { return {
"book_idx": book_idx, "book_id": book_id,
"total": total, "total": total,
"started": True, "completed": int(total), # For safety
"group_id": gid,
} }
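
Both controller variants keep per-book progress in Redis: the left-hand version writes a `book:<book_idx>:state` hash directly, the right-hand one goes through the `scraper.progress` counters. A small sketch of how a status endpoint could read the hash fields initialised above (the helper name and the percentage calculation are illustrative, not part of the diff):

```
import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
                         decode_responses=True)

def read_book_state(book_idx: str) -> dict:
    """Return a UI-friendly snapshot of one book's progress hash."""
    state = r.hgetall(f"book:{book_idx}:state")
    if not state:
        return {"status": "unknown"}
    total = int(state.get("chapters_total", 0))
    done = int(state.get("chapters_download_done", 0))
    return {
        "status": state.get("status", "unknown"),
        "chapters_total": total,
        "chapters_download_done": done,
        "percent": round(100 * done / total, 1) if total else 0.0,
    }
```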

@ -1,24 +1,22 @@
# ============================================================ # ============================================================
# File: scraper/tasks/download_tasks.py # File: scraper/tasks/download_tasks.py
# Purpose: # Purpose: Download chapter HTML with global concurrency,
# Download chapter HTML into payload["html"]. # retry/backoff logic, 429 support, and abort-awareness.
# Updated for book_idx unified ID model. #
# Logging:
# - timestamp + book_id in the message
# - message is sent to the console via publisher.py
# - message is pushed to the Redis GUI log buffer via ui_log.push_ui
#
# publisher.py and ui_log.py stay dumb (passive).
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from scraper.utils.utils import get_save_path from scraper.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started from scraper.abort import abort_requested, chapter_started, mark_chapter_started
# Unified repository façade from logbus.publisher import log # console logging (DOM)
from db.repository import ( from scraper.ui_log import push_ui # GUI logging (DOM)
set_status,
inc_download_done,
inc_download_skipped,
)
from logbus.publisher import log
from scraper.ui_log import push_ui
from scraper.logger_decorators import logcall
import requests import requests
import redis import redis
@ -33,34 +31,54 @@ print(">>> [IMPORT] download_tasks.py loaded")
# ----------------------------------------------------------- # -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER # TIMESTAMPED LOG WRAPPER
# ----------------------------------------------------------- # -----------------------------------------------------------
def log_msg(book_idx: str, message: str): def log_msg(book_id: str, message: str):
"""
Log with compact timestamp + book_id.
Pushes to:
- console (publisher.log)
- GUI Redis (push_ui)
"""
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
full = f"{ts} [{book_idx}] {message}" full = f"{ts} [{book_id}] {message}"
# console
log(full) log(full)
push_ui(full)
# GUI (Redis rolling list)
push_ui(full) # NO book_id param — ui_log stays dumb
# ----------------------------------------------------------- # -----------------------------------------------------------
# ENV CONFIG # Retry parameters (ENV)
# ----------------------------------------------------------- # -----------------------------------------------------------
MAX_RETRIES = int(os.getenv("DOWNLOAD_MAX_RETRIES", "7")) MAX_RETRIES = int(os.getenv("DOWNLOAD_MAX_RETRIES", "7"))
BASE_DELAY = int(os.getenv("DOWNLOAD_BASE_DELAY", "2")) BASE_DELAY = int(os.getenv("DOWNLOAD_BASE_DELAY", "2"))
BACKOFF = int(os.getenv("DOWNLOAD_BACKOFF_MULTIPLIER", "2")) BACKOFF = int(os.getenv("DOWNLOAD_BACKOFF_MULTIPLIER", "2"))
DELAY_429 = int(os.getenv("DOWNLOAD_429_DELAY", "10")) DELAY_429 = int(os.getenv("DOWNLOAD_429_DELAY", "10"))
# -----------------------------------------------------------
# Global concurrency
# -----------------------------------------------------------
MAX_CONCURRENCY = int(os.getenv("DOWNLOAD_MAX_GLOBAL_CONCURRENCY", "1")) MAX_CONCURRENCY = int(os.getenv("DOWNLOAD_MAX_GLOBAL_CONCURRENCY", "1"))
# -----------------------------------------------------------
# Global delay sync
# -----------------------------------------------------------
GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1")) GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1"))
DELAY_KEY = "download:delay_lock"
# -----------------------------------------------------------
# Redis connection
# -----------------------------------------------------------
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
redis_client = redis.Redis.from_url(REDIS_URL) redis_client = redis.Redis.from_url(REDIS_URL)
SEM_KEY = "download:active" SEM_KEY = "download:active" # semaphore counter
DELAY_KEY = "download:delay_lock"
# ----------------------------------------------------------- # ============================================================
# DELAY + CONCURRENCY HELPERS # GLOBAL DELAY FUNCTIONS
# ----------------------------------------------------------- # ============================================================
def wait_for_global_delay(): def wait_for_global_delay():
if GLOBAL_DELAY <= 0: if GLOBAL_DELAY <= 0:
return return
@ -74,6 +92,9 @@ def set_global_delay():
redis_client.set(DELAY_KEY, "1", nx=True, ex=GLOBAL_DELAY) redis_client.set(DELAY_KEY, "1", nx=True, ex=GLOBAL_DELAY)
# ============================================================
# GLOBAL CONCURRENCY FUNCTIONS
# ============================================================
def acquire_global_slot(max_slots: int, retry_delay: float = 0.5): def acquire_global_slot(max_slots: int, retry_delay: float = 0.5):
while True: while True:
current = redis_client.incr(SEM_KEY) current = redis_client.incr(SEM_KEY)
@ -87,93 +108,79 @@ def release_global_slot():
redis_client.decr(SEM_KEY) redis_client.decr(SEM_KEY)
print(f">>> [CONFIG] Global concurrency = {MAX_CONCURRENCY}")
print(f">>> [CONFIG] Global min delay = {GLOBAL_DELAY}s")
print(
f">>> [CONFIG] Retries: MAX={MAX_RETRIES}, base={BASE_DELAY}, "
f"backoff={BACKOFF}, 429={DELAY_429}"
)
# ============================================================ # ============================================================
# CELERY TASK — Payload v3 (book_idx model) # CELERY TASK: DOWNLOAD CHAPTER
# ============================================================ # ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False) @celery_app.task(bind=True, queue="download", ignore_result=False)
@logcall def download_chapter(
def download_chapter(self, payload: dict): self, book_id: str, chapter_num: int, chapter_url: str, base_path: str
):
""" """
Payload format: Download chapter HTML.
Abort logic:
{ - If abort active AND chapter not started SKIP
"book_idx": str, - If abort active BUT chapter already started Proceed normally
"chapter": {
"num": int,
"title": str,
"url": str,
"volume_path": str
},
"book_meta": dict,
# fields filled during pipeline:
"html": None | str,
"parsed": None | str,
"skipped": bool,
"path": None | str
}
""" """
if not payload:
raise ValueError("download_chapter received empty payload")
book_idx = payload["book_idx"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
chapter_num = chapter["num"]
chapter_url = chapter["url"]
chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
volume_path = chapter["volume_path"]
# -----------------------------------------------------------
# STATUS UPDATE (book is now in 'downloading')
# -----------------------------------------------------------
set_status(book_idx, "downloading")
# ----------------------------------------------------------- # -----------------------------------------------------------
# ABORT CHECK (skip if not yet started) # ABORT BEFORE START
# ----------------------------------------------------------- # -----------------------------------------------------------
if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num): if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}") msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
log_msg(book_id, msg)
inc_download_skipped(book_idx) return {
"chapter": chapter_num,
payload["html"] = None "url": chapter_url,
payload["skipped"] = True "html": None,
payload["path"] = None "skipped": True,
return payload "path": None,
"abort": True,
}
# Mark started — ensures parse/save must run
mark_chapter_started(book_id, chapter_num)
# Hard delay
if GLOBAL_DELAY > 0:
time.sleep(GLOBAL_DELAY)
mark_chapter_started(book_idx, chapter_num) save_path = get_save_path(chapter_num, base_path)
# ----------------------------------------------------------- # -----------------------------------------------------------
# SKIP IF FILE ALREADY EXISTS # SKIP existing
# ----------------------------------------------------------- # -----------------------------------------------------------
save_path = get_save_path(chapter_num, volume_path)
if os.path.exists(save_path): if os.path.exists(save_path):
log_msg(book_idx, f"[DL] SKIP {chapter_num}{save_path}") wait_for_global_delay()
inc_download_skipped(book_idx) set_global_delay()
log_msg(book_id, f"[DL] SKIP {chapter_num} (exists) → {save_path}")
payload["html"] = None return {
payload["skipped"] = True "chapter": chapter_num,
payload["path"] = save_path "url": chapter_url,
return payload "html": None,
"skipped": True,
# ----------------------------------------------------------- "path": save_path,
# GLOBAL DELAY + CONCURRENCY }
# -----------------------------------------------------------
if GLOBAL_DELAY > 0: # Sync delay
time.sleep(GLOBAL_DELAY)
wait_for_global_delay() wait_for_global_delay()
# Acquire concurrency slot
acquire_global_slot(MAX_CONCURRENCY) acquire_global_slot(MAX_CONCURRENCY)
log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# -----------------------------------------------------------
# HTTP DOWNLOAD
# -----------------------------------------------------------
try: try:
log_msg(book_idx, f"[DL] Downloading {chapter_num} ({chapter_title})") # -----------------------------------------------------------
# HTTP DOWNLOAD
# -----------------------------------------------------------
log_msg(book_id, f"[DL] Downloading chapter {chapter_num}: {chapter_url}")
resp = requests.get( resp = requests.get(
chapter_url, chapter_url,
@ -185,28 +192,44 @@ def download_chapter(self, payload: dict):
resp.encoding = resp.apparent_encoding or "gb2312" resp.encoding = resp.apparent_encoding or "gb2312"
html = resp.text html = resp.text
log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes") log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
payload["html"] = html return {
payload["skipped"] = False "chapter": chapter_num,
payload["path"] = save_path "url": chapter_url,
return payload "html": html,
"skipped": False,
"path": save_path,
}
except Exception as exc: except Exception as exc:
attempt = self.request.retries attempt = self.request.retries
delay = BASE_DELAY * (BACKOFF**attempt) delay = BASE_DELAY * (BACKOFF**attempt)
# Handle 429 # 429 hard block
if getattr(getattr(exc, "response", None), "status_code", None) == 429: if (
log_msg(book_idx, f"[DL] 429 → WAIT {DELAY_429}s") hasattr(exc, "response")
and getattr(exc.response, "status_code", None) == 429
):
log_msg(
book_id,
f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
time.sleep(DELAY_429) time.sleep(DELAY_429)
set_global_delay() set_global_delay()
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
# General retry with backoff # Normal error
log_msg(book_idx, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s") log_msg(
book_id,
f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES) raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
finally: finally:
set_global_delay() set_global_delay()
release_global_slot() release_global_slot()
log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")

@ -1,132 +0,0 @@
# ============================================================
# File: scraper/tasks/m4b_tasks.py
# ============================================================
import os
import subprocess
from typing import List
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import fetch_book, store_m4b_error
from scraper.scriptgen import build_merge_block
# ------------------------------------------------------------
# Helper: detect volumes (UNCHANGED)
# ------------------------------------------------------------
def detect_volumes(book_base: str) -> List[str]:
volumes = []
for name in os.listdir(book_base):
if name.lower().startswith("volume_"):
full = os.path.join(book_base, name)
if os.path.isdir(full):
volumes.append(name)
volumes.sort()
return volumes
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="m4b", ignore_result=True)
@logcall
def run_m4btool(self, book_idx: str):
log(f"[M4B] START book_idx={book_idx}")
book = fetch_book(book_idx)
if not book:
log(f"[M4B] Book not found in SQL: book_idx={book_idx}")
return
title = book.get("title", book_idx)
author = book.get("author", "Unknown")
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
book_base = os.path.join(output_root, title)
log(f"[M4B] Book base directory: {book_base}")
if not os.path.isdir(book_base):
log(f"[M4B] Book directory missing: {book_base}")
return
volumes = detect_volumes(book_base)
if not volumes:
log(f"[M4B] No volumes found for book_idx={book_idx}")
return
log(f"[M4B] Volumes detected: {volumes}")
# --------------------------------------------------------
# Build canonical commands via scriptgen
# --------------------------------------------------------
merge_block = build_merge_block(
title, author, [(i + 1, v) for i, v in enumerate(volumes)]
)
commands = [c.strip() for c in merge_block.split("&&") if c.strip()]
for volume, cmd in zip(volumes, commands):
audio_dir = os.path.join(book_base, volume, "Audio")
if not os.path.isdir(audio_dir):
log(f"[M4B] SKIP {volume}: no Audio directory")
continue
log(f"[M4B] Running for volume={volume}")
log(f"[M4B] CMD: {cmd}")
try:
result = subprocess.run(
cmd,
cwd=book_base,
shell=True,
capture_output=True,
text=True,
check=True,
)
if result.stdout:
log(f"[M4B][STDOUT] {result.stdout}")
except subprocess.CalledProcessError as exc:
log(f"[M4B][FAILED] volume={volume}")
if exc.stdout:
log(f"[M4B][STDOUT] {exc.stdout}")
if exc.stderr:
log(f"[M4B][STDERR] {exc.stderr}")
store_m4b_error(
book_idx=book_idx,
volume=volume,
error_text=exc.stderr or str(exc),
)
continue
except Exception as exc:
log(f"[M4B][UNEXPECTED ERROR] volume={volume}: {exc}")
store_m4b_error(
book_idx=book_idx,
volume=volume,
error_text=str(exc),
)
continue
log(f"[M4B] FINISHED book_idx={book_idx}")
# ------------------------------------------------------------
# Orchestration helper (UNCHANGED)
# ------------------------------------------------------------
@logcall
def queue_m4b_for_book(book_idx: str):
log(f"[M4B] Queuing m4b-tool for book_idx={book_idx}")
celery_app.send_task(
"scraper.tasks.m4b_tasks.run_m4btool",
args=[book_idx],
queue="m4b",
)

@ -1,132 +1,63 @@
# ============================================================ # =========================================================
# File: scraper/tasks/parse_tasks.py # File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text. # Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline. #
# Compatible with payload pipeline v3 + book_idx refactor. # Abort Behavior:
# ============================================================ # - parse MUST ALWAYS RUN once download has started
# - even if the user triggers abort afterwards
# - (abort only prevents new chapters from starting)
#
# Logging:
# - Same unified log_msg(book_id, message) as download_tasks
# - publisher.log → console
# - ui_log.push_ui → GUI
# =========================================================
from celery_app import celery_app from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment from bs4 import BeautifulSoup
from scraper.tasks.download_tasks import log_msg from scraper.utils import clean_text, load_replacements
from scraper.utils.utils import clean_text, load_all_replacements from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done
print(">>> [IMPORT] parse_tasks.py loaded")
print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
h1 = soup.find("h1")
if not h1:
return None
# Find first table after <h1>
table = None
for sib in h1.next_siblings:
if getattr(sib, "name", None) == "table":
table = sib
break
if not table:
return None
parts = []
for sib in table.next_siblings:
name = getattr(sib, "name", None)
text = None
if hasattr(sib, "get_text"):
text = sib.get_text(strip=True)
# Stop conditions
if isinstance(sib, Comment) and ("翻页" in sib):
break
if name == "div":
sid = sib.get("id", "")
cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break
if text and ("重要声明" in text or "Copyright" in text):
break
if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break
if name in ("script", "style"):
continue
if name == "center":
continue
# Accumulate
if isinstance(sib, NavigableString):
s = sib.strip()
if s:
parts.append(s)
elif hasattr(sib, "get_text"):
t = sib.get_text(separator="\n").strip()
if t:
parts.append(t)
return "\n".join(parts).strip()
# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False) @celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall def parse_chapter(self, download_result: dict, meta: dict):
def parse_chapter(self, payload: dict): """
Parse raw HTML returned by download_chapter into clean chapter text.
if not payload: """
return {"skipped": True, "reason": "empty_payload"}
# NEW MODEL
book_idx = payload["book_idx"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
num = chapter.get("num") # Extract book_id stored by download_tasks
title = chapter.get("title") or f"Chapter {num}" book_id = download_result.get("book_id", "NOBOOK")
html = payload.get("html")
# ------------------------------------------------------------ # ------------------------------------------------------------
# DOWNLOAD SKIPPED → PARSE SKIP # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
# ------------------------------------------------------------ # ------------------------------------------------------------
if payload.get("skipped"): if download_result.get("skipped"):
log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)") chapter = download_result.get("chapter")
return payload log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
return download_result
if not html: # ------------------------------------------------------------
log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP") # 2) Normal Parsing
payload["parsed"] = None # ------------------------------------------------------------
payload["skipped"] = True chapter_num = download_result["chapter"]
return payload chapter_url = download_result["url"]
html = download_result["html"]
log_msg(book_idx, f"[PARSE] Parsing chapter {num}") log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")
soup = BeautifulSoup(html, "lxml") soup = BeautifulSoup(html, "lxml")
# ------------------------------------------------------------
# STRICT SELECTORS
# ------------------------------------------------------------
selectors = [ selectors = [
"#content", "#content",
"div#content",
".content", ".content",
"div#content",
"div.content", "div.content",
"#chaptercontent",
"div#chaptercontent", "div#chaptercontent",
"#chapterContent", "#chapterContent",
".read-content", ".read-content",
"div.read-content",
] ]
node = None node = None
@ -136,70 +67,34 @@ def parse_chapter(self, payload: dict):
node = tmp node = tmp
break break
raw = None raw = node.get_text() if node else soup.get_text()
# strict selectors failed → piaotia extractor
if node is None:
raw = extract_piaotia_content(soup)
else:
raw = node.get_text(separator="\n")
# FINAL FALLBACK
if raw is None:
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
raw = soup.get_text(separator="\n")
# ------------------------------------------------------------ # ------------------------------------------------------------
# MULTIPASS CLEANING VIA replacement-block files # Apply global replacements
# ------------------------------------------------------------ # ------------------------------------------------------------
REPL = load_all_replacements() REPL = load_replacements()
text = clean_text(raw, REPL)
text = raw
for _ in range(5):
text = clean_text(text, REPL)
# ------------------------------------------------------------ # ------------------------------------------------------------
# Collapse double blank lines # Chapter 1 gets full header
# ------------------------------------------------------------ # ------------------------------------------------------------
cleaned = [] if chapter_num == 1:
prev_blank = False book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
for line in text.split("\n"):
s = line.rstrip()
if s == "":
if prev_blank:
continue
prev_blank = True
cleaned.append("")
else:
prev_blank = False
cleaned.append(s)
text = "\n".join(cleaned)
text = f"{title}\n{text}"
# ------------------------------------------------------------
# Header on chapter 1
# ------------------------------------------------------------
if num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN"
header = ( header = (
f"{book_meta.get('title','')}\n" f"{meta.get('title','')}\n"
f"Author: {book_meta.get('author','')}\n" f"Author: {meta.get('author','')}\n"
f"Description:\n{book_meta.get('description','')}\n" f"Description:\n{meta.get('description','')}\n"
f"Book URL: {book_url}\n" + "-" * 50 + "\n\n" f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
) )
text = header + text text = header + text
log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars") log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
# ------------------------------------------------------------
# OUTPUT PAYLOAD
# ------------------------------------------------------------
payload["parsed"] = text
payload["skipped"] = False
inc_parsed_done(book_idx)
return payload return {
"book_id": book_id,
"chapter": chapter_num,
"url": chapter_url,
"text": text,
"length": len(text),
}
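
Both parser versions walk a list of CSS selectors and fall back to whole-page text when nothing matches; the left-hand version adds a Piaotia-specific sibling walk in between. A minimal sketch of the selector-fallback idea on its own (selector list and URL are examples, not the full production set):

```
import requests
from bs4 import BeautifulSoup

SELECTORS = ["#content", "div#content", ".content", "#chaptercontent"]

def extract_chapter_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for sel in SELECTORS:
        node = soup.select_one(sel)
        if node and node.get_text(strip=True):
            return node.get_text(separator="\n")
    # Fallback: drop scripts/styles and take the whole page text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(separator="\n")

html = requests.get("https://example.com/chapter-1.html", timeout=30).text
print(len(extract_chapter_text(html)))
```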

@ -1,16 +1,10 @@
# ========================================================= # =========================================================
# File: scraper/tasks/pipeline.py # File: scraper/tasks/pipeline.py
# Purpose: # Purpose:
# Build Celery chains for chapter processing using payload dict. # Build Celery chains for chapter processing.
# #
# Pipeline v3: # download → parse → save → update_progress
# download_chapter(payload)
# → parse_chapter(payload)
# → save_chapter(payload)
# #
# NOTE:
# - book_idx is the single authoritative key for all tasks
# - payload travels unchanged through the entire pipeline
# ========================================================= # =========================================================
from celery import chain from celery import chain
@ -18,33 +12,28 @@ from celery import chain
from scraper.tasks.download_tasks import download_chapter from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter from scraper.tasks.save_tasks import save_chapter
from scraper.tasks.progress_tasks import update_progress # NEW
from scraper.logger_decorators import logcall
def build_chapter_pipeline(
@logcall book_id: str,
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict): chapter_number: int,
""" chapter_url: str,
Create a payload object passed through the pipeline. base_path: str,
Consistent with the chapter_dict-based task signature. meta: dict,
):
""" """
Chapter pipeline:
payload = { download_chapter(book_id, chapter_num, url, base_path)
"book_idx": book_idx, parse_chapter(download_result, meta)
"chapter": chapter_dict, save_chapter(parsed_result, base_path)
"book_meta": book_meta, update_progress(result, book_id)
# Will be filled by download_chapter """
"html": None,
# Will be filled by parse_chapter
"parsed": None,
# Set by download or parse on skip/404/etc
"skipped": False,
# Final path written by save_chapter
"path": None,
}
return chain( return chain(
download_chapter.s(payload), download_chapter.s(book_id, chapter_number, chapter_url, base_path),
parse_chapter.s(), parse_chapter.s(meta),
save_chapter.s(), save_chapter.s(base_path),
update_progress.s(book_id), # ← central progress update
) )
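
Either variant of `build_chapter_pipeline` only builds a Celery signature; something still has to apply it. A hedged usage sketch against the new signature, with placeholder chapter and metadata values:

```
from scraper.tasks.pipeline import build_chapter_pipeline

meta = {
    "title": "Example Book",
    "author": "Unknown",
    "book_url": "https://example.com/book",
}

sig = build_chapter_pipeline(
    book_id="demo-book",
    chapter_number=1,
    chapter_url="https://example.com/chapter-1.html",
    base_path="output/Example Book/Volume_001",
    meta=meta,
)

# Queues download → parse → save → update_progress as one chain.
async_result = sig.apply_async()
print(async_result.id)
```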

@ -0,0 +1,36 @@
# ============================================================
# File: scraper/tasks/progress_tasks.py
# Purpose: Central progress updater for chapter pipelines.
# ============================================================
from celery_app import celery_app
from scraper.progress import inc_completed, inc_skipped, inc_failed
from logbus.publisher import log
print(">>> [IMPORT] progress_tasks.py loaded")
@celery_app.task(bind=False, name="progress.update", queue="controller")
def update_progress(result: dict, book_id: str):
"""
Central progress logic:
- result: output of save_chapter
- book_id: explicitly passed by pipeline
"""
ch = result.get("chapter")
skipped = result.get("skipped", False)
failed = result.get("failed", False)
if failed:
inc_failed(book_id)
log(f"[PROG] FAILED chapter {ch}")
elif skipped:
inc_skipped(book_id)
inc_completed(book_id)
log(f"[PROG] SKIPPED chapter {ch}")
else:
inc_completed(book_id)
log(f"[PROG] DONE chapter {ch}")
return result
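
`scraper.progress` itself is not part of this diff, but `scraping.py` below already writes `progress:<book_id>:total` and `progress:<book_id>:done` keys, so the counters are plausibly plain Redis integers. A sketch under that assumption (function bodies are illustrative, not the actual module):

```
import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
                         decode_responses=True)

def set_total(book_id: str, total: int) -> None:
    r.set(f"progress:{book_id}:total", total)
    r.set(f"progress:{book_id}:done", 0)

def inc_completed(book_id: str) -> int:
    # INCR is atomic, so concurrent save/progress tasks cannot lose updates.
    return r.incr(f"progress:{book_id}:done")

def get_progress(book_id: str) -> dict:
    total = int(r.get(f"progress:{book_id}:total") or 0)
    done = int(r.get(f"progress:{book_id}:done") or 0)
    return {"total": total, "done": done, "finished": total > 0 and done >= total}
```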

@ -1,84 +1,81 @@
# ============================================================ # =========================================================
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC + book_idx) # File: scraper/tasks/save_tasks.py
# ============================================================ # Purpose: Save parsed chapter text to disk.
# =========================================================
print(">>> [IMPORT] save_tasks.py loaded") print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task from celery import shared_task
import os import os
from logbus.publisher import log from scraper.utils import get_save_path
from scraper.logger_decorators import logcall from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.utils.utils import get_save_path from scraper.progress import (
from scraper.tasks.download_tasks import log_msg inc_completed,
from scraper.tasks.audio_tasks import generate_audio inc_skipped,
inc_failed,
from db.repository import inc_download_done, inc_download_skipped add_failed_chapter, # <-- enige noodzakelijke aanvulling
)
@shared_task(bind=True, queue="save", ignore_result=False) @shared_task(bind=True, queue="save", ignore_result=False)
@logcall def save_chapter(self, parsed: dict, base_path: str):
def save_chapter(self, payload: dict): """
Save parsed chapter text to disk.
if not payload:
log("[SAVE] ERROR: payload is None") parsed = {
return {"error": True} "book_id": str,
"chapter": int,
# NEW unified ID "text": str,
book_idx = payload["book_idx"] "url": str,
"skipped": bool,
chapter = payload["chapter"] "path": optional str
parsed = payload.get("parsed") }
path = payload.get("path") """
skipped = payload.get("skipped")
book_id = parsed.get("book_id", "NOBOOK")
num = chapter["num"] chapter = parsed.get("chapter")
title = chapter.get("title") or f"Chapter {num}"
volume = chapter.get("volume_path") # ------------------------------------------------------------
volume_name = os.path.basename(volume.rstrip("/")) # SKIP CASE (from download or parse stage)
# ------------------------------------------------------------
# ============================================================ if parsed.get("skipped"):
# SKIPPED CASE (old behavior restored) path = parsed.get("path", "(no-path)")
# ============================================================ log_msg(book_id, f"[SAVE] SKIP chapter {chapter}{path}")
if skipped or not parsed:
log_msg(book_idx, f"[SAVE] SKIP chapter {num}") inc_skipped(book_id)
inc_download_skipped(book_idx) return {"chapter": chapter, "path": path, "skipped": True}
# OLD behavior: even skipped chapters still queue audio # ------------------------------------------------------------
if path and os.path.exists(path): # NORMAL SAVE
log_msg(book_idx, f"[AUDIO] Queueing audio for SKIPPED chapter {num}") # ------------------------------------------------------------
try:
generate_audio.delay(book_idx, volume_name, num, title, path)
except Exception as exc:
log_msg(book_idx, f"[AUDIO] ERROR queueing skipped audio: {exc}")
return payload
# ============================================================
# NORMAL SAVE CASE
# ============================================================
try: try:
os.makedirs(volume, exist_ok=True) text = parsed.get("text", "")
save_path = get_save_path(num, volume) url = parsed.get("url")
if chapter is None:
raise ValueError("Missing chapter number in parsed payload")
# Ensure folder exists
os.makedirs(base_path, exist_ok=True)
with open(save_path, "w", encoding="utf-8") as f: # Build file path
f.write(parsed) path = get_save_path(chapter, base_path)
log_msg(book_idx, f"[SAVE] Saved chapter {num}{save_path}") # Write chapter text
with open(path, "w", encoding="utf-8") as f:
f.write(text)
inc_download_done(book_idx) log_msg(book_id, f"[SAVE] Saved chapter {chapter} → {path}")
# OLD behavior: ALWAYS queue audio inc_completed(book_id)
try:
generate_audio.delay(book_idx, volume_name, num, title, save_path)
log_msg(book_idx, f"[AUDIO] Task queued for chapter {num}")
except Exception as exc:
log_msg(book_idx, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
payload["path"] = save_path return {"book_id": book_id, "chapter": chapter, "path": path}
payload["skipped"] = False
return payload
except Exception as exc: except Exception as exc:
log_msg(book_idx, f"[SAVE] ERROR saving chapter {num}: {exc}") log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter}: {exc}")
inc_failed(book_id)
add_failed_chapter(book_id, chapter, str(exc)) # <-- essential
raise raise
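
`add_failed_chapter` is imported from `scraper.progress`, whose implementation is not shown in this diff. One plausible shape, assuming the same Redis-backed store, is a per-book list of failure records so the UI can offer retries; the key name and JSON layout below are assumptions:

```
import json
import time
import redis

r = redis.Redis.from_url("redis://redis:6379/0", decode_responses=True)

def add_failed_chapter(book_id: str, chapter: int, error: str) -> None:
    """Append one failure record for later inspection or retry."""
    record = {"chapter": chapter, "error": error, "ts": int(time.time())}
    r.rpush(f"progress:{book_id}:failed", json.dumps(record))

def get_failed_chapters(book_id: str) -> list:
    return [json.loads(x) for x in r.lrange(f"progress:{book_id}:failed", 0, -1)]
```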

@ -1,101 +1,90 @@
# ============================================================ # ============================================================
# File: scraper/tasks/scraping.py # File: scraper/tasks/scraping.py
# Purpose: # Purpose: Scrape metadata + chapter list and initialise
# Scrape ONLY metadata + chapter list. # Redis progress tracking + launch download controller
# Does NOT launch download controller anymore.
# Controller decides when pipelines start.
# ============================================================ # ============================================================
from celery_app import celery_app from celery_app import celery_app
from logbus.publisher import log from logbus.publisher import log
import os import os
import uuid
import redis import redis
from scraper.logger_decorators import logcall
from scraper.sites import BookSite from scraper.sites import BookSite
from scraper.book_scraper import BookScraper from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs
from scraper.services.init_service import InitService
print(">>> [IMPORT] scraping.py loaded") print(">>> [IMPORT] scraping.py loaded")
# Redis connection (same DB as Celery broker) # Redis connection (same as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True) r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task( @celery_app.task(bind=True, queue="scraping", ignore_result=False)
bind=True,
queue="scraping",
ignore_result=False,
name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str): def start_scrape_book(self, url: str):
""" """Scrapes metadata + chapters and prepares download tracking."""
Scrapes metadata + chapters.
DOES NOT START download / pipeline controller.
The controller_tasks.start_full_scrape() task will call this one.
"""
# ------------------------------------------------------------
# CLEAR UI LOG BUFFER
# ------------------------------------------------------------
reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}") log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# SCRAPE (old engine) # Book scrape
# ------------------------------------------------------------ # ------------------------------------------------------------
site = BookSite() site = BookSite()
scraper = BookScraper(site, url) scraper = BookScraper(site, url)
result = scraper.execute() # → { title, author, chapters, cover_url, ... } result = scraper.execute() # returns dict with metadata + chapters
chapters = result.get("chapters", []) chapters = result.get("chapters", [])
full_count = len(chapters) full_count = len(chapters)
# ------------------------------------------------------------ # ------------------------------------------------------------
# Compute unified book_idx # DRY RUN
# ------------------------------------------------------------
book_idx = InitService.derive_book_id(url)
result["book_idx"] = book_idx
log(f"[SCRAPING] Assigned book_idx = {book_idx}")
# ------------------------------------------------------------
# DRY RUN TEST LIMIT
# ------------------------------------------------------------ # ------------------------------------------------------------
DRY_RUN = os.getenv("DRY_RUN", "0") == "1" DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5")) TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
if DRY_RUN: if DRY_RUN:
log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}") log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
result["chapters"] = chapters[:TEST_LIMIT] chapters = chapters[:TEST_LIMIT]
result["chapters"] = chapters
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# LOG RESULTS # BOOK RUN ID
# ------------------------------------------------------------ # ------------------------------------------------------------
log( book_id = str(uuid.uuid4())
f"[SCRAPING] Completed scrape: " result["book_id"] = book_id
f"{len(result['chapters'])}/{full_count} chapters"
) log(f"[SCRAPING] Assigned book_id = {book_id}")
# ------------------------------------------------------------ # ------------------------------------------------------------
# RESET ABORT + INITIALIZE LEGACY PROGRESS # RESET ABORT + INITIALISE PROGRESS
# ------------------------------------------------------------ # ------------------------------------------------------------
clear_abort(book_idx) clear_abort(book_id)
r.set(f"progress:{book_idx}:total", len(result["chapters"])) r.set(f"progress:{book_id}:total", len(chapters))
r.set(f"progress:{book_idx}:done", 0) r.set(f"progress:{book_id}:done", 0)
r.delete(f"logs:{book_id}") # clear old logs if any
r.delete(f"logs:{book_idx}") r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}") r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")
r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")
# ------------------------------------------------------------ # ------------------------------------------------------------
# IMPORTANT: DO NOT DISPATCH any pipelines here # DISPATCH DOWNLOAD CONTROLLER
# Controller will receive scrape_result and continue.
# ------------------------------------------------------------ # ------------------------------------------------------------
return result # controller task signature = launch_downloads(book_id, scrape_result)
celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],
queue="controller",
)
log(f"[SCRAPING] Dispatched download controller for {book_id}")
return {
"book_id": book_id,
"title": result.get("title"),
"author": result.get("author"),
"chapters": len(chapters),
}
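
In the new flow the web layer only needs to queue `start_scrape_book`; the task scrapes inline and then dispatches the controller itself via `send_task`. A hedged example of kicking it off and polling the result (the URL is a placeholder; route wiring is not shown here):

```
from celery.result import AsyncResult
from celery_app import celery_app
from scraper.tasks.scraping import start_scrape_book

# Queue the scrape; only the book URL is needed up front.
async_result = start_scrape_book.delay("https://www.example.com/book/12345/")

# Later, e.g. from a status endpoint, check on it by task id.
res = AsyncResult(async_result.id, app=celery_app)
if res.ready():
    print(res.result)  # {"book_id": ..., "title": ..., "author": ..., "chapters": ...}
```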

@ -1,149 +0,0 @@
# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
# Final status check after audio completion.
#
# Responsibilities:
# - Verify Redis counters (sanity check)
# - Verify filesystem (Audio files present)
# - Queue m4btool task
#
# Design rules:
# - Book-scope ONLY
# - No direct Redis usage
# - Repository is the single source of truth
# - Idempotent, defensive, non-blocking
# ============================================================
import os
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_audio_done,
get_chapters_total,
set_status,
fetch_book,
)
from scraper.tasks.m4b_tasks import run_m4btool
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
@logcall
def _detect_volumes(book_base: str):
"""
Return sorted list of Volume_XXX directories.
"""
vols = []
for name in os.listdir(book_base):
if name.lower().startswith("volume_"):
full = os.path.join(book_base, name)
if os.path.isdir(full):
vols.append(name)
vols.sort()
return vols
@logcall
def _count_audio_files(audio_dir: str) -> int:
"""
Count .m4b files in an Audio directory.
"""
if not os.path.isdir(audio_dir):
return 0
return len([f for f in os.listdir(audio_dir) if f.lower().endswith(".m4b")])
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
"""
Final statuscheck before m4btool execution.
Triggered exactly once by audio_completion quickcheck.
"""
log(f"[STATUSCHECK] START book={book_idx}")
# --------------------------------------------------------
# 1. Redis sanity check (via repository)
# --------------------------------------------------------
audio_done = get_audio_done(book_idx)
chapters_total = get_chapters_total(book_idx)
log(
f"[STATUSCHECK] Counters book={book_idx} "
f"audio_done={audio_done} chapters_total={chapters_total}"
)
if chapters_total <= 0:
log(f"[STATUSCHECK] No chapters_total → abort")
return
if audio_done < chapters_total:
# Defensive: should not happen, but never assume
log(
f"[STATUSCHECK] Audio not complete yet "
f"({audio_done}/{chapters_total}) → abort"
)
return
# --------------------------------------------------------
# 2. Fetch book metadata (for paths & m4b meta)
# --------------------------------------------------------
book = fetch_book(book_idx)
if not book:
log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
return
title = book.get("title") or book_idx
author = book.get("author") or "Unknown"
# Base output directory
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
book_base = os.path.join(root, title)
if not os.path.isdir(book_base):
log(f"[STATUSCHECK] Book directory missing: {book_base}")
return
# --------------------------------------------------------
# 3. Filesystem validation (light, non-blocking)
# --------------------------------------------------------
volumes = _detect_volumes(book_base)
if not volumes:
log(f"[STATUSCHECK] No volumes found for {book_idx}")
# Still allow m4btool to decide (it will no-op)
else:
for vol in volumes:
audio_dir = os.path.join(book_base, vol, "Audio")
count = _count_audio_files(audio_dir)
log(f"[STATUSCHECK] {vol}: " f"{count} audio files detected")
# --------------------------------------------------------
# 4. Queue m4btool (final pipeline step)
# --------------------------------------------------------
log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")
set_status(book_idx, "m4b_running")
run_m4btool.delay(
book_idx=book_idx,
book_base=book_base,
meta={
"title": title,
"author": author,
},
)
log(f"[STATUSCHECK] DONE book={book_idx}")

@ -1,44 +0,0 @@
#!/bin/sh
main_dir="$( cd "$( dirname "$0" )" && pwd )"
shopt -s nocasematch # For case-insensitive regex matching
for subfolder in "$main_dir"/*; do
if [ -d "$subfolder" ]; then
audiofolder="$subfolder/Audio"
mkdir -p "$audiofolder"
for entry in "$subfolder"/*.txt; do
fn=$(basename "$entry")
[[ "${entry##*.}" =~ txt ]]
echo "$fn"
inputfile="$subfolder/$fn"
outputfile="$audiofolder/${fn%.*}.m4b"
now=$(date +"%T")
echo "Current time : $now"
echo "$inputfile ->"
echo "$outputfile" && \
if [ -f $outputfile ]; then
echo $outputfile + "exists: skipping"
else
say --voice=Sinji \
--output-file="$outputfile" \
--input-file="$inputfile" \
--file-format=m4bf \
--quality=127 \
-r 200 \
--data-format=aac
fi
done
fi
done
# CLEANUP WILL BE APPENDED BY scriptgen.py

@ -1,4 +0,0 @@
find . -name "*.m4b" -size -580c | while read fname; do
echo "deleting $(ls -lah \"$fname\")"
rm "$fname"
done

@ -1,38 +0,0 @@
#!/bin/sh
main_dir="$( cd "$( dirname "$0" )" && pwd )"
shopt -s nocasematch
for subfolder in "$main_dir"/*; do
if [ -d "$subfolder" ]; then
audiofolder="$subfolder/Audio"
mkdir -p "$audiofolder"
for entry in "$subfolder"/*.txt; do
fn=$(basename "$entry")
[[ "${entry##*.}" =~ txt ]]
echo "$fn"
inputfile="$subfolder/$fn"
outputfile="$audiofolder/${fn%.*}.m4b"
now=$(date +"%T")
echo "Current time : $now"
echo "$inputfile ->"
echo "$outputfile"
if [ -f "$outputfile" ]; then
echo "$outputfile exists: skipping"
else
say --voice=Sinji \
--output-file="$outputfile" \
--input-file="$inputfile" \
--file-format=m4bf \
--quality=127 \
-r 200 \
--data-format=aac
fi
done
fi
done

@ -34,41 +34,3 @@ def get_ui_logs(limit: int = None):
limit = LOG_BUFFER_SIZE limit = LOG_BUFFER_SIZE
return r.lrange(UI_LOG_KEY, -limit, -1) return r.lrange(UI_LOG_KEY, -limit, -1)
def reset_ui_logs():
"""
Clear the entire UI log buffer.
Used by:
- Clear button in GUI
- Auto-clear when new book scraping starts
"""
r.delete(UI_LOG_KEY)
# ============================================================
# Delta-based log retrieval using Redis indexes
# ============================================================
def get_ui_logs_delta(last_index: int):
"""
Returns (new_lines, total_count).
Only returns log lines AFTER last_index.
Example:
last_index = 10 → returns logs with Redis indexes 11..end
"""
total = r.llen(UI_LOG_KEY)
if total == 0:
return [], 0
# First load OR index invalid → send entire buffer
if last_index < 0 or last_index >= total:
logs = r.lrange(UI_LOG_KEY, 0, -1)
return logs, total
# Only new logs
new_lines = r.lrange(UI_LOG_KEY, last_index + 1, -1)
return new_lines, total
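
The removed `get_ui_logs_delta` implements cursor-based polling: the caller remembers the highest Redis index it has seen and only new lines come back. A small consumer sketch against that (main-branch) helper; the polling interval is arbitrary:

```
import time
from scraper.ui_log import get_ui_logs_delta

last_index = -1  # -1 means "send the whole buffer on the first poll"
while True:
    new_lines, total = get_ui_logs_delta(last_index)
    for line in new_lines:
        print(line)
    if total:
        last_index = total - 1  # highest index we have now consumed
    time.sleep(2)
```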

@ -0,0 +1,67 @@
import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
"""
Load key=value style replacements.
Empty or missing file → return {}.
Lines starting with '#' are ignored.
"""
path = Path(filepath)
if not path.exists():
return {}
repl = {}
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, val = line.split("=", 1)
repl[key.strip()] = val.strip()
return repl
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
"""
Normalize whitespace, remove junk, apply replacements.
repl_dict is optional → {} if none provided.
"""
if repl_dict is None:
repl_dict = {}
txt = raw.replace("\r", "") # normalize CRLF
# Collapse 3+ blank lines → max 1 empty line
txt = re.sub(r"\n{3,}", "\n\n", txt)
# Apply replacements
for key, val in repl_dict.items():
txt = txt.replace(key, val)
return txt.strip()
# ------------------------------------------------------------
# Determine save path for a chapter (shared by download & save)
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
"""
Returns the filesystem path where this chapter should be saved.
Formats the filename as 0001.txt, 0002.txt, ...
"""
filename = f"{chapter_num:04d}.txt"
return os.path.join(base_path, filename)
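
`load_replacements` expects plain `key=value` lines, so a `text_replacements.txt` containing for example `www.example-site.com=` would produce `{"www.example-site.com": ""}`. A quick worked example of `clean_text` with such a dict supplied inline (strings are made up for illustration):

```
from scraper.utils import clean_text

raw = "Chapter 1\r\n\r\n\r\n\r\nText from www.example-site.com here."
repl = {"www.example-site.com": ""}  # what load_replacements() would return

print(clean_text(raw, repl))
# CR stripped, newline runs collapsed, key replaced:
# "Chapter 1\n\nText from  here."
```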

@ -1,272 +0,0 @@
# ============================================================
# File: scraper/utils/state_sync.py
# Purpose:
# State inspection + optional sync logic for unified book_idx model.
# Generates full book-card compatible dicts for debug UI.
# ============================================================
import os
import redis
from db.db import get_db
def _build_card(sqlite_row, redis_state, merged):
"""
Creates a dict that matches the fields required by components/bookcard.html:
b.book_idx
b.title
b.author
b.cover_path
b.status
b.created_at
b.download_done
b.download_total
b.audio_done
b.audio_total
"""
return {
"book_idx": sqlite_row.get("book_idx"),
"title": sqlite_row.get("title") or "Unknown",
"author": sqlite_row.get("author"),
"cover_path": sqlite_row.get("cover_path"),
# Use merged status (Redis > SQLite)
"status": merged.get("status") or sqlite_row.get("status") or "unknown",
# Meta
"created_at": sqlite_row.get("created_at"),
# Download counters
"download_done": merged.get("downloaded", 0),
"download_total": merged.get("chapters_total", 0),
# Audio counters
"audio_done": merged.get("audio_done", 0),
"audio_total": merged.get("chapters_total", 0),
}
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state_depecrated():
"""
Reads all books from SQLite and fetches Redis progress.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
sqlite_row = dict(row)
book_idx = sqlite_row["book_idx"]
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key) or {}
# ================================
# DRY-RUN MERGE LOGIC
# ================================
merged = sqlite_row.copy()
if redis_state:
merged["downloaded"] = int(
redis_state.get("chapters_download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(
redis_state.get("chapters_parsed_done", merged.get("parsed", 0))
)
merged["audio_done"] = int(
redis_state.get("audio_done", merged.get("audio_done", 0))
)
merged["chapters_total"] = int(
redis_state.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = redis_state.get(
"status", merged.get("status", "unknown")
)
# ================================
# Build book-card data
# ================================
card = _build_card(sqlite_row, redis_state, merged)
# ================================
# Append final result entry
# ================================
results.append(
{
"book_idx": book_idx,
"title": sqlite_row.get("title"),
"sqlite": sqlite_row,
"redis": redis_state,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# INSPECT ONLY — NO WRITES
# ============================================================
def inspect_books_state():
"""
Reads canonical book state from repository.
Builds:
entry.sqlite
entry.redis
entry.would_merge_to
entry.card (book-card compatible)
"""
from db.repository import get_book_state
from db.db import get_db
db = get_db()
cur = db.cursor()
# Only needed to know *which* books exist
cur.execute("SELECT book_idx FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_idx = row["book_idx"]
# --------------------------------
# Canonical state (the ONLY source of truth)
# --------------------------------
state = get_book_state(book_idx)
# SQLite view = only the SQLite columns
sqlite_view = {
k: v
for k, v in state.items()
if k
in (
"book_idx",
"title",
"author",
"description",
"cover_path",
"book_url",
"chapters_total",
"status",
"downloaded",
"parsed",
"audio_done",
"created_at",
"processdate",
"last_update",
)
}
# Redis view = only the Redis counters/status
redis_view = {
k: v
for k, v in state.items()
if k.startswith("chapters_")
or k in ("status", "audio_done", "audio_skipped")
}
merged = state  # literally the canonical state
card = _build_card(sqlite_view, redis_view, merged)
results.append(
{
"book_idx": book_idx,
"title": state.get("title"),
"sqlite": sqlite_view,
"redis": redis_view,
"would_merge_to": merged,
"card": card,
}
)
return results
# ============================================================
# SYNC REDIS → SQLITE (writes)
# ============================================================
def sync_books_from_redis():
"""
Writes Redis progress values back into SQLite.
Uses unified book_idx as identifier.
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
before = dict(row)
book_idx = before["book_idx"]
redis_key = f"book:{book_idx}:state"
redis_state = r.hgetall(redis_key)
if not redis_state:
results.append(
{
"book_idx": book_idx,
"before": before,
"redis": {},
"after": before,
}
)
continue
# Extract progress from Redis
downloaded = int(redis_state.get("chapters_download_done", 0))
parsed = int(redis_state.get("chapters_parsed_done", 0))
audio_done = int(redis_state.get("audio_done", 0))
total = int(redis_state.get("chapters_total", 0))
status = redis_state.get("status", before.get("status"))
# Update SQLite
cur.execute(
"""
UPDATE books
SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
WHERE book_idx = ?
""",
(downloaded, parsed, audio_done, total, status, book_idx),
)
db.commit()
cur.execute("SELECT * FROM books WHERE book_idx = ?", (book_idx,))
after = dict(cur.fetchone())
results.append(
{
"book_idx": book_idx,
"before": before,
"redis": redis_state,
"after": after,
}
)
return results

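For orientation, a minimal sketch of the per-book Redis hash that sync_books_from_redis() reads. Only the field names come from the code above; the book_idx and values are made up, and REDIS_BROKER is assumed to be set as elsewhere in the project:

import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER"), decode_responses=True)

book_idx = 42  # hypothetical book
r.hset(
    f"book:{book_idx}:state",
    mapping={
        "status": "downloading",
        "chapters_total": 120,
        "chapters_download_done": 37,
        "chapters_parsed_done": 35,
        "audio_done": 0,
    },
)
print(r.hgetall(f"book:{book_idx}:state"))  # what the sync/inspect code sees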
@ -1,114 +0,0 @@
# ============================================================
# File: scraper/utils.py
# Purpose:
# Centralised replacement loader + text cleaner
# using 3 replacement categories:
# 1) HTML replacements
# 2) Encoding replacements
# 3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================
import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacement_file(path: Path) -> dict:
"""
Loads key=value pairs from a file.
If the file is missing, returns {}.
Ignores empty lines and lines starting with '#'.
"""
if not path.exists():
return {}
repl = {}
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, val = line.split("=", 1)
repl[key.strip()] = val.strip()
return repl
# ------------------------------------------------------------
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
root = Path(__file__).parent / "replacements"
html_file = root / "html.txt"
enc_file = root / "encoding.txt"
junk_file = root / "junk.txt"
repl = {}
repl.update(load_replacement_file(html_file))
repl.update(load_replacement_file(enc_file))
repl.update(load_replacement_file(junk_file))
return repl
# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def load_replacements(filepath=None) -> dict:
"""
Backward-compatible alias.
- If called without a filepath, return the merged replacements.
- If called with a filepath, load only that single file.
"""
if filepath is None:
return load_all_replacements()
else:
# Allow explicit loading of a single file
path = Path(filepath)
return load_replacement_file(path)
# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
"""
Apply replacements and basic whitespace normalisation.
No hardcoded rules live here.
"""
if not raw:
return ""
txt = raw.replace("\r", "")
# Apply loaded replacements
for key, val in repl.items():
# print(f"Replacing: {key} → {val}")
txt = txt.replace(key, val)
# Collapse 3+ blank lines → max 1
txt = re.sub(r"\n{3,}", "\n\n", txt)
return txt.strip()
# ------------------------------------------------------------
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
filename = f"{chapter_num:04d}.txt"
return os.path.join(base_path, filename)

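As a small end-to-end illustration of the loader and cleaner above (with load_replacements and clean_text in scope), using a made-up replacement file; the real html.txt / encoding.txt / junk.txt contents are not part of this diff:

from pathlib import Path

# Hypothetical replacement file in the key=value format parsed above.
Path("demo_repl.txt").write_text(
    "# demo rules only\n"
    "&amp;=&\n"       # HTML entity -> literal ampersand
    "â€™='\n",        # common UTF-8 mojibake -> apostrophe
    encoding="utf-8",
)

repl = load_replacements("demo_repl.txt")  # single-file mode of the wrapper
raw = "Itâ€™s chapter one &amp; two.\r\n\n\n\nNext paragraph."
print(clean_text(raw, repl))
# It's chapter one & two.
#
# Next paragraph.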
@ -1,66 +0,0 @@
#!/bin/bash
set -e
echo ""
echo "====================================================="
echo " STARTING LOCAL macOS AUDIO WORKER"
echo "====================================================="
echo ""
# ------------------------------------------------------
# Load .env so REDIS_BROKER_LOCAL becomes available
# ------------------------------------------------------
if [ -f ".env" ]; then
set -o allexport
source .env
set +o allexport
fi
# ------------------------------------------------------
# Override Redis to local instance for macOS
# ------------------------------------------------------
export REDIS_BROKER="$REDIS_BROKER_LOCAL"
export REDIS_BACKEND="$REDIS_BACKEND_LOCAL"
echo "[AUDIO] Redis override:"
echo " REDIS_BROKER=$REDIS_BROKER"
echo " REDIS_BACKEND=$REDIS_BACKEND"
echo ""
# ------------------------------------------------------
# Create venv if needed
# ------------------------------------------------------
if [ ! -d ".venv" ]; then
echo "[AUDIO] No .venv found — creating virtualenv..."
python3 -m venv .venv
else
echo "[AUDIO] Existing .venv found"
fi
# Activate virtualenv
echo "[AUDIO] Activating .venv"
source .venv/bin/activate
# ------------------------------------------------------
# Install requirements
# ------------------------------------------------------
REQ="requirements.audio.txt"
if [ ! -f "$REQ" ]; then
echo "[AUDIO] ERROR — $REQ not found!"
exit 1
fi
echo "[AUDIO] Installing audio requirements..."
pip install -r "$REQ"
# Celery must be installed locally too
echo "[AUDIO] Ensuring Celery installed..."
pip install celery
# ------------------------------------------------------
# Start the worker
# ------------------------------------------------------
echo ""
echo "[AUDIO] Starting audio worker..."
python3 audio_worker_local.py

@ -1,310 +0,0 @@
/* =======================================================================
File: static/css/bookcard.css
Purpose:
Styling for registered book cards:
- status colors
- badges
- start/abort/statuscheck
- progress bars
======================================================================= */
/* -----------------------------------------------------------------------
GRID WRAPPER
----------------------------------------------------------------------- */
.registered-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(340px, 1fr));
gap: 20px;
margin-top: 15px;
}
/* -----------------------------------------------------------------------
BOOK CARD BASE
----------------------------------------------------------------------- */
.book-card {
position: relative;
display: grid;
grid-template-columns: 90px auto;
gap: 15px;
padding: 15px;
background: #fff;
border-radius: 10px;
border: 1px solid #e5e5e5;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
transition: border-color 0.25s ease, box-shadow 0.25s ease;
}
/* -----------------------------------------------------------------------
STATUS COLORS (BOOK CARD BORDER)
----------------------------------------------------------------------- */
/* Downloading / actively running */
.book-card.downloading {
border-color: #ff9500;
box-shadow: 0 0 6px rgba(255, 149, 0, 0.35);
}
/* Audio phase */
.book-card.audio {
border-color: #ffca28;
box-shadow: 0 0 6px rgba(255, 202, 40, 0.35);
}
/* Fully complete */
.book-card.done {
border: 2px solid #4caf50;
box-shadow: 0 0 6px rgba(76, 175, 80, 0.35);
}
/* Aborted */
.book-card.aborted {
border-color: #ff3b30;
box-shadow: 0 0 6px rgba(255, 59, 48, 0.35);
}
/* -----------------------------------------------------------------------
COVER
----------------------------------------------------------------------- */
.book-cover {
width: 90px;
}
.book-img {
width: 90px;
height: 130px;
object-fit: cover;
border-radius: 4px;
background: #f4f4f4;
}
.placeholder {
display: flex;
justify-content: center;
align-items: center;
color: #777;
font-size: 12px;
}
/* -----------------------------------------------------------------------
META
----------------------------------------------------------------------- */
.book-meta {
display: flex;
flex-direction: column;
justify-content: space-between;
}
.book-title {
font-size: 16px;
font-weight: bold;
}
.book-author {
font-size: 14px;
color: #444;
margin-bottom: 6px;
}
.book-created {
font-size: 12px;
color: #666;
}
/* -----------------------------------------------------------------------
ACTION BUTTONS
----------------------------------------------------------------------- */
.book-actions {
display: flex;
justify-content: flex-end;
gap: 10px;
margin-top: 10px;
}
.icon-btn {
width: 34px;
height: 34px;
border: none;
border-radius: 8px;
display: flex;
justify-content: center;
align-items: center;
font-size: 16px;
color: #fff;
cursor: pointer;
transition: background 0.15s ease, transform 0.1s ease;
}
/* Start */
.icon-start {
background: #2d8a3d;
}
.icon-start:hover {
background: #226c30;
transform: scale(1.05);
}
.icon-start:disabled {
background: #9bbb9f;
cursor: not-allowed;
opacity: 0.5;
}
/* Abort */
.icon-abort {
background: #c62828;
}
.icon-abort:hover {
background: #a31f1f;
transform: scale(1.05);
}
.icon-abort:disabled {
background: #d8a0a0;
cursor: not-allowed;
opacity: 0.5;
}
/* Hide */
.hide-form {
position: absolute;
top: 6px;
right: 6px;
}
.icon-hide {
background: #777;
}
.icon-hide:hover {
background: #555;
}
/* Statuscheck */
.statuscheck-btn {
background-color: #444;
color: #fff;
border: 1px solid #666;
margin-left: 4px;
padding: 4px 8px;
border-radius: 6px;
font-size: 12px;
cursor: pointer;
}
.statuscheck-btn:hover {
background-color: #333;
}
/* -----------------------------------------------------------------------
PROGRESS (FULL WIDTH)
----------------------------------------------------------------------- */
.book-progress {
grid-column: 1 / -1;
margin-top: 12px;
padding: 10px 12px;
background: #f6f6f6;
border-radius: 8px;
}
.progress-row {
margin-bottom: 4px;
}
.progress-label {
font-size: 12px;
margin-bottom: 4px;
color: #444;
}
/* BAR */
.progressbar {
position: relative;
width: 100%;
height: 14px;
background: #ddd;
border-radius: 7px;
overflow: hidden;
}
.progressbar-fill {
height: 100%;
transition: width 0.4s ease;
}
/* Download */
.progressbar-fill.download {
background: #2196f3;
}
/* Audio */
.progressbar-fill.audio {
background: #4caf50;
}
/* TEXT IN BAR */
.progressbar-text {
position: absolute;
inset: 0;
display: flex;
align-items: center;
justify-content: center;
font-size: 11px;
font-weight: 600;
color: #fff;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.6);
pointer-events: none;
}
/* -----------------------------------------------------------------------
STATUS BADGE
----------------------------------------------------------------------- */
.status-badge {
display: inline-block;
margin-bottom: 6px;
padding: 2px 8px;
font-size: 11px;
font-weight: 600;
border-radius: 10px;
text-transform: uppercase;
letter-spacing: 0.5px;
cursor: default;
}
/* DONE */
.status-badge.status-done {
background-color: #e6f4ea;
color: #2e7d32;
border: 1px solid #4caf50;
}
/* AUDIO */
.status-badge.status-audio {
background-color: #fff8e1;
color: #8d6e00;
border: 1px solid #ffca28;
}
/* DOWNLOADING */
.status-badge.status-downloading {
background-color: #e3f2fd;
color: #1565c0;
border: 1px solid #42a5f5;
}
/* Statuscheck */
.icon-statuscheck {
background: #444;
}
.icon-statuscheck:hover {
background: #333;
transform: scale(1.05);
}

@ -1,312 +0,0 @@
/* =======================================================================
File: static/css/dashboard.css
Purpose:
Clean full-width vertical dashboard layout with large log viewer.
Book-card CSS is now moved to bookcard.css
======================================================================= */
/* -----------------------------------------------------------------------
1) GENERAL PAGE LAYOUT
----------------------------------------------------------------------- */
.dashboard-container {
display: flex;
flex-direction: column;
width: 100%;
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
gap: 18px;
}
.dashboard-section {
background: #ffffff;
padding: 16px;
border-radius: 6px;
border: 1px solid #ddd;
}
.page-title {
font-size: 22px;
margin-bottom: 15px;
}
/* -----------------------------------------------------------------------
2) ACTIVE BOOK LIST (dashboard left panel)
----------------------------------------------------------------------- */
.book-list {
display: flex;
flex-direction: column;
gap: 12px;
}
.book-list-empty {
padding: 18px;
text-align: center;
color: #777;
}
.book-list-item {
padding: 12px 16px;
background: #f7f7f7;
border-radius: 6px;
border: 1px solid #ccc;
cursor: pointer;
display: flex;
flex-direction: column;
gap: 6px;
transition: background 0.2s, border-color 0.2s;
}
.book-list-item:hover,
.book-list-item.active {
background: #eaf3ff;
border-color: #1e88e5;
}
.book-title {
font-size: 16px;
font-weight: 600;
}
.book-meta {
font-size: 12px;
color: #555;
}
/* -----------------------------------------------------------------------
3) PROGRESS BOX
----------------------------------------------------------------------- */
.progress-box {
background: #fafafa;
border: 1px solid #ddd;
padding: 8px;
border-radius: 6px;
}
.progress-header h2 {
margin-bottom: 5px;
}
.progress-subtitle {
font-size: 14px;
color: #333;
font-weight: 600;
}
.progress-bookid {
font-size: 12px;
color: #777;
margin-bottom: 15px;
}
.progress-bar {
height: 14px;
background: #ddd;
border-radius: 6px;
overflow: hidden;
margin-bottom: 6px;
}
.progress-bar-fill {
height: 100%;
background: #1e88e5;
}
.progress-bar-fill.audio-fill {
background: #e65100;
}
.progress-stats {
display: flex;
justify-content: space-between;
font-size: 12px;
color: #444;
margin-top: 4px;
}
.book-abort-area {
margin-top: 10px;
text-align: right;
}
.abort-btn {
padding: 6px 12px;
border-radius: 4px;
border: 1px solid #cc0000;
background: #ff4444;
color: white;
font-size: 12px;
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.abort-btn:hover {
background: #ff2222;
border-color: #aa0000;
}
/* -----------------------------------------------------------------------
4) LOG VIEWER
----------------------------------------------------------------------- */
.log-viewer {
width: 100%;
max-width: 100%;
overflow: hidden;
}
.log-header {
display: flex;
justify-content: space-between;
align-items: center;
}
.log-filters {
display: flex;
align-items: center;
gap: 8px;
}
.log-output {
flex: 1;
width: 100%;
max-width: 100%;
min-height: 60vh;
max-height: 75vh;
overflow-y: auto;
overflow-x: hidden;
background: #000;
color: #00ff66;
border: 1px solid #0f0;
border-radius: 6px;
padding: 12px;
font-family: "SF Mono", "Consolas", "Courier New", monospace;
font-size: 13px;
line-height: 1.35;
white-space: pre-wrap;
word-break: break-word;
}
.log-line {
white-space: pre-wrap;
padding: 2px 0;
}
.log-line.default {
color: #00ff66;
}
.log-line.dl {
color: #00ccff;
}
.log-line.parse {
color: #ffaa00;
}
.log-line.save {
color: #ffdd33;
}
.log-line.audio {
color: #ff66ff;
}
.log-line.ctrl {
color: #66aaff;
}
.log-line.error {
color: #ff3333;
}
/* -----------------------------------------------------------------------
5) PLACEHOLDER / FOOTER
----------------------------------------------------------------------- */
.dashboard-placeholder {
font-size: 15px;
padding: 20px;
text-align: center;
color: #777;
}
.footer {
text-align: center;
padding: 12px;
color: #666;
margin-top: 25px;
font-size: 12px;
border-top: 1px solid #ddd;
}
/* -----------------------------
DROPDOWN NAVIGATION
------------------------------ */
/* Container for dropdown */
.nav-dropdown {
position: relative;
}
/* The clickable label ("Tools ▾") */
.nav-dropdown > .nav-item {
cursor: pointer;
}
/* Hide dropdown by default */
.dropdown-menu {
display: none;
position: absolute;
top: 100%;
right: 0;
background: #fff; /* same background as the navbar */
border: 1px solid #ddd;
padding: 8px 0;
margin: 0;
list-style: none; /* remove list bullets */
border-radius: 4px;
min-width: 160px;
z-index: 1000;
}
/* Show dropdown when hovering over parent */
.nav-dropdown:hover .dropdown-menu {
display: block;
}
/* Menu item styling */
.dropdown-menu li {
padding: 0;
margin: 0;
}
.dropdown-menu li a {
display: block;
padding: 8px 16px;
white-space: nowrap;
color: #333;
text-decoration: none;
}
/* Hover state */
.dropdown-menu li a:hover {
background: #f0f0f0;
}
table.kv {
border-collapse: collapse;
margin-bottom: 16px;
}
table.kv th {
text-align: left;
padding-right: 12px;
color: #777;
font-weight: normal;
}
table.kv td {
font-weight: 500;
}

@ -1,160 +0,0 @@
/* =======================================================================
File: static/css/style.css
Purpose:
Global base styling for all pages.
Includes typography, buttons, forms, layout primitives.
======================================================================= */
/* ------------------------------
RESET / BASE
------------------------------ */
html,
body {
margin: 0;
padding: 0;
font-family: Arial, Helvetica, sans-serif;
background: #f5f6fa;
color: #222;
}
.container {
max-width: 1100px;
margin: 0 auto;
padding: 20px;
}
h1,
h2,
h3 {
margin: 0 0 15px 0;
font-weight: 600;
}
a {
color: #1e88e5;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
/* ------------------------------
BUTTONS
------------------------------ */
.btn-primary {
background: #1e88e5;
color: #fff;
padding: 10px 18px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 15px;
transition: background 0.2s ease;
}
.btn-primary:hover {
background: #1669b9;
}
.btn-small {
padding: 5px 10px;
background: #ccc;
border-radius: 4px;
border: none;
font-size: 13px;
}
.btn-small:hover {
background: #bbb;
}
/* ------------------------------
FORM ELEMENTS
------------------------------ */
.url-form {
display: flex;
gap: 10px;
flex-direction: column;
max-width: 550px;
}
.url-label {
font-weight: 600;
}
.url-input {
padding: 10px;
font-size: 15px;
border: 1px solid #bbb;
border-radius: 4px;
}
.url-submit {
align-self: flex-start;
}
/* ------------------------------
NAVBAR
------------------------------ */
.navbar {
background: #ffffff;
border-bottom: 1px solid #ddd;
padding: 12px 20px;
}
.nav-inner {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
}
.nav-brand a {
font-size: 20px;
font-weight: bold;
color: #1e88e5;
}
.nav-links {
list-style: none;
display: flex;
gap: 25px;
margin: 0;
padding: 0;
}
.nav-item {
font-size: 15px;
color: #333;
}
.nav-item:hover {
color: #1e88e5;
}
/* ------------------------------
LANDING PAGE
------------------------------ */
.landing-container {
max-width: 600px;
margin: 40px auto;
background: #fff;
padding: 25px;
border-radius: 6px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.landing-title {
margin-bottom: 20px;
}
.landing-links {
margin-top: 20px;
}

@ -1,33 +0,0 @@
/* =======================================================================
File: static/js/app.js
Purpose:
Global utility functions shared across all scripts.
No page-specific logic here.
======================================================================= */
// Shortcuts
const $ = (sel, parent = document) => parent.querySelector(sel);
const $$ = (sel, parent = document) => parent.querySelectorAll(sel);
// Safe log
function dbg(...args) {
console.log("[APP]", ...args);
}
// AJAX helper
async function apiGet(url) {
try {
const res = await fetch(url, { cache: "no-store" });
if (!res.ok) throw new Error(`HTTP ${res.status}`);
return await res.json();
} catch (err) {
console.error("API GET Error:", url, err);
return null;
}
}
// Auto-scroll utility
function autoScroll(el) {
if (!el) return;
el.scrollTop = el.scrollHeight;
}

@ -1,145 +0,0 @@
/* ============================================================
File: static/js/bookcard_controller.js
Purpose:
Single owner for updating book-card DOM from merged state
(would_merge_to)
============================================================ */
console.log("[BOOKCARD] controller loaded");
/* ============================================================
ENTRY POINT (called by state_updater.js)
============================================================ */
function updateBookCardsFromState(stateList) {
console.log("[BOOKCARD] updateBookCardsFromState called");
if (!Array.isArray(stateList)) {
console.warn("[BOOKCARD] Invalid stateList", stateList);
return;
}
const stateById = {};
stateList.forEach((entry) => {
const merged = entry.would_merge_to;
if (!merged || merged.book_idx == null) {
console.warn("[BOOKCARD] entry without merged/book_idx", entry);
return;
}
stateById[String(merged.book_idx)] = merged;
});
document.querySelectorAll(".book-card").forEach((card) => {
const bookIdx = card.dataset.bookIdx;
const state = stateById[bookIdx];
if (!state) {
console.debug("[BOOKCARD] No state for book_idx", bookIdx);
return;
}
console.log("[BOOKCARD] Updating card", bookIdx, state.status);
updateSingleBookCard(card, state);
});
}
/* ============================================================
SINGLE CARD UPDATE
============================================================ */
function updateSingleBookCard(card, state) {
console.log("[BOOKCARD] updateSingleBookCard", state.book_idx);
updateStatus(card, state);
updateStatusBadge(card, state);
updateButtons(card, state);
updateProgress(card, state);
}
/* ============================================================
STATUS
============================================================ */
function updateStatus(card, state) {
console.log("[BOOKCARD][STATUS]", state.book_idx, "→", state.status);
card.className = `book-card ${state.status || ""}`;
}
function updateStatusBadge(card, state) {
const badge = card.querySelector(".status-badge");
if (!badge) return;
const status = (state.status || "").toLowerCase();
badge.textContent = status.toUpperCase();
badge.className = `status-badge status-${status}`;
badge.title =
{
downloading: "Bezig met downloaden",
audio: "Downloads compleet, audio wordt gegenereerd",
done: "Alle chapters en audio zijn compleet",
}[status] || "";
}
/* ============================================================
BUTTONS
============================================================ */
function updateButtons(card, state) {
const startBtn = card.querySelector(".icon-start");
const abortBtn = card.querySelector(".icon-abort");
const busy = ["starting", "downloading", "parsing", "audio"];
console.log("[BOOKCARD][BUTTONS]", state.book_idx, "status:", state.status);
if (startBtn) {
// startBtn.disabled = busy.includes(state.status);
}
if (abortBtn) {
abortBtn.disabled = !busy.includes(state.status);
}
}
/* ============================================================
PROGRESS (DOWNLOAD + AUDIO)
============================================================ */
function updateProgress(card, s) {
const total = Number(s.chapters_total || 0);
// const downloadDone =
// Number(s.chapters_download_done || 0) +
// Number(s.chapters_download_skipped || 0);
const downloadDone = Number(s.downloaded || 0);
const audioDone = Number(s.audio_done || 0) + Number(s.audio_skipped || 0);
const downloadPct =
total > 0 ? Math.min((downloadDone / total) * 100, 100) : 0;
const audioPct = total > 0 ? Math.min((audioDone / total) * 100, 100) : 0;
console.log("[BOOKCARD][PROGRESS]", s.book_idx, {
total,
downloadDone,
audioDone,
downloadPct,
audioPct,
});
/* ---- DOWNLOAD ---- */
const dlBar = card.querySelector('[data-field="download_pct"]');
const dlText = card.querySelector('[data-field="download_text"]');
if (dlBar) dlBar.style.width = `${downloadPct}%`;
if (dlText) dlText.textContent = `${downloadDone} / ${total}`;
/* ---- AUDIO ---- */
const auBar = card.querySelector('[data-field="audio_pct"]');
const auText = card.querySelector('[data-field="audio_text"]');
if (auBar) auBar.style.width = `${audioPct}%`;
if (auText) auText.textContent = `${audioDone} / ${total}`;
}

@ -1,178 +0,0 @@
/* =======================================================================
File: static/js/dashboard.js
Purpose:
- Sidebar selection
- Start / Abort actions
- UI status updates
NOTE:
- NO polling here
- state_updater.js is the single source of truth
======================================================================= */
console.log("[DASHBOARD] loaded");
/* ---------------------------------------------------------
Helpers
--------------------------------------------------------- */
async function apiGet(url) {
console.log("[DASHBOARD][API] GET", url);
try {
const r = await fetch(url, { cache: "no-store" });
if (!r.ok) {
console.warn("[DASHBOARD][API] GET failed", url, r.status);
return null;
}
return await r.json();
} catch (e) {
console.error("[DASHBOARD][API] GET error", url, e);
return null;
}
}
function safeUpdateLogs(data) {
if (typeof window.updateLogs === "function") {
console.log("[DASHBOARD] updateLogs()");
window.updateLogs(data);
}
}
/* ---------------------------------------------------------
State
--------------------------------------------------------- */
let ACTIVE_BOOK_IDX = null;
/* ---------------------------------------------------------
DOM READY
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log("[DASHBOARD] DOMContentLoaded");
bindSidebar();
bindBookCardButtons();
const first = document.querySelector(".book-list-item");
if (first) {
console.log("[DASHBOARD] auto-select", first.dataset.bookIdx);
selectBook(first.dataset.bookIdx);
}
});
/* ---------------------------------------------------------
Sidebar
--------------------------------------------------------- */
function bindSidebar() {
console.log("[DASHBOARD] bindSidebar()");
document.querySelectorAll(".book-list-item").forEach((item) => {
item.onclick = () => selectBook(item.dataset.bookIdx);
});
}
function selectBook(bookIdx) {
if (!bookIdx || bookIdx === ACTIVE_BOOK_IDX) return;
ACTIVE_BOOK_IDX = bookIdx;
console.log("[DASHBOARD] selectBook", bookIdx);
document.querySelectorAll(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookIdx === bookIdx);
});
refreshBook(bookIdx);
}
/* ---------------------------------------------------------
Book refresh (NO POLLING)
--------------------------------------------------------- */
async function refreshBook(bookIdx) {
console.log("[DASHBOARD] refreshBook", bookIdx);
const logs = await apiGet(`/api/book/${bookIdx}/logs`);
if (logs) safeUpdateLogs(logs);
refreshBookCards();
}
/* ---------------------------------------------------------
Bookcard buttons
--------------------------------------------------------- */
function bindBookCardButtons() {
console.log("[DASHBOARD] bindBookCardButtons()");
document.querySelectorAll(".icon-start").forEach((btn) => {
if (btn.dataset.bound) return;
btn.dataset.bound = "1";
btn.onclick = (e) => {
e.preventDefault();
const card = btn.closest(".book-card");
if (!card) return;
startBook(card.dataset.bookIdx);
};
});
document.querySelectorAll(".icon-abort").forEach((btn) => {
if (btn.dataset.bound) return;
btn.dataset.bound = "1";
btn.onclick = (e) => {
e.preventDefault();
const card = btn.closest(".book-card");
if (!card) return;
abortBook(card.dataset.bookIdx);
};
});
}
/* ---------------------------------------------------------
START
--------------------------------------------------------- */
function startBook(bookIdx) {
console.log("[DASHBOARD] START", bookIdx);
fetch("/start", {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_idx=${bookIdx}`,
}).then(() => refreshBook(bookIdx));
}
/* ---------------------------------------------------------
ABORT
--------------------------------------------------------- */
function abortBook(bookIdx) {
if (!confirm(`Abort book ${bookIdx}?`)) return;
console.log("[DASHBOARD] ABORT", bookIdx);
fetch(`/abort/${bookIdx}`, { method: "POST" }).then(() =>
refreshBook(bookIdx)
);
}
/* ---------------------------------------------------------
Bookcard UI refresh (non-progress)
--------------------------------------------------------- */
async function refreshBookCards() {
console.log("[DASHBOARD] refreshBookCards()");
const books = await apiGet("/api/books");
if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => {
const idx = card.dataset.bookIdx;
const info = books.find((b) => String(b.book_idx) === String(idx));
if (!info) return;
console.log("[DASHBOARD] card status", idx, info.status);
card.className = `book-card ${info.status}`;
const abortBtn = card.querySelector(".icon-abort");
if (abortBtn) {
abortBtn.disabled = ![
"processing",
"downloading",
"parsing",
"audio",
].includes(info.status);
}
});
}

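The two fetches above define a small HTTP contract: POST /start with a form-encoded book_idx, and POST /abort/<book_idx>. The real handlers live in app.py; the sketch below is only a hypothetical illustration of that contract, not the project's actual routes:

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/start", methods=["POST"])
def start():
    # dashboard.js posts application/x-www-form-urlencoded: book_idx=<idx>
    book_idx = request.form.get("book_idx")
    # ... enqueue the scrape for book_idx here (a Celery task in the real app) ...
    return jsonify({"ok": True, "book_idx": book_idx})

@app.route("/abort/<book_idx>", methods=["POST"])
def abort(book_idx):
    # ... set the per-book abort flag here (scraper.abort.set_abort in the real app) ...
    return jsonify({"ok": True, "book_idx": book_idx})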
@ -1,13 +0,0 @@
/* =======================================================================
File: static/js/helpers.js
Purpose:
Shared DOM helpers for all JS files.
======================================================================= */
window.$ = (sel) => document.querySelector(sel);
window.$$ = (sel) => document.querySelectorAll(sel);
window.autoScroll = function (el) {
if (!el) return;
el.scrollTop = el.scrollHeight;
};

@ -1,101 +0,0 @@
/* ============================================================
File: static/js/inspect_state.js
Purpose:
- Receive merged state via state_updater.js
- Update ONLY the right-side state tables
- NO polling, NO fetch
============================================================ */
console.log("[inspect_state] JS loaded (subscriber mode)");
/* ------------------------------------------------------------
State subscription
------------------------------------------------------------ */
window.addEventListener("state:update", (e) => {
const entries = e.detail;
if (!Array.isArray(entries)) {
console.warn("[inspect_state] state:update payload is not array", entries);
return;
}
console.log("[inspect_state] state:update received entries:", entries.length);
updateInspectTables(entries);
});
/* ------------------------------------------------------------
Update tables
------------------------------------------------------------ */
function updateInspectTables(entries) {
console.log("[inspect_state] updating tables");
entries.forEach((entry) => {
const bookIdx = entry.book_idx;
if (bookIdx == null) {
console.warn("[inspect_state] entry without book_idx", entry);
return;
}
const block = document.querySelector(
`.state-block[data-book-idx="${bookIdx}"]`
);
if (!block) {
console.warn("[inspect_state] no state-block for book_idx", bookIdx);
return;
}
const table = block.querySelector(".state-table");
if (!table) {
console.warn("[inspect_state] no state-table for book_idx", bookIdx);
return;
}
console.log("[inspect_state] updating table for book_idx", bookIdx);
const sql = entry.sqlite || {};
const redis = entry.redis || {};
const merged = entry.would_merge_to || {};
table.innerHTML = `
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
${row("status", sql, redis, merged)}
${row("chapters_total", sql, redis, merged)}
${row("downloaded", sql, redis, merged)}
${row("chapters_download_done", sql, redis, merged)}
${row("chapters_download_skipped", sql, redis, merged)}
${row("parsed", sql, redis, merged)}
${row("chapters_parsed_done", sql, redis, merged)}
${row("audio_done", sql, redis, merged)}
${row("audio_skipped", sql, redis, merged)}
${row("last_update", sql, redis, merged)}
`;
});
}
/* ------------------------------------------------------------
Row helper
------------------------------------------------------------ */
function row(field, sql, redis, merged) {
const s = sql[field] ?? "";
const r = redis[field] ?? "";
const m = merged[field] ?? "";
const cls = String(s) === String(r) ? "same" : "diff";
return `
<tr>
<th>${field}</th>
<td class="${cls}">${s}</td>
<td class="${cls}">${r}</td>
<td>${m}</td>
</tr>
`;
}

@ -1,130 +0,0 @@
/* =======================================================================
File: static/js/log_view.js
Purpose:
High-performance rolling log viewer
- efficient delta polling
- append-only mode (no DOM reset)
- rolling limit (prevents memory freeze)
- supports both global logs and per-book logs
======================================================================= */
console.log(">>> log_view.js LOADING…");
/* ---------------------------------------------------------
Global log viewer state
--------------------------------------------------------- */
let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // delta offset
const MAX_LOG_LINES = 600;
/* ---------------------------------------------------------
Apply filter on existing log lines
--------------------------------------------------------- */
function applyLogFilter() {
const lines = $$(".log-line");
lines.forEach((line) => {
const text = line.innerText;
const show = LOG_FILTER === "ALL" || (text && text.includes(LOG_FILTER));
line.style.display = show ? "block" : "none";
});
}
/* ---------------------------------------------------------
DOM Ready bind clear/filter
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded");
const clearBtn = $("#log-clear");
const output = $("#log-output");
if (!output) {
console.log(">>> log_view.js: No #log-output → viewer disabled");
return;
}
if (clearBtn) {
clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js: Clear log viewer");
output.innerHTML = "";
LAST_LOG_INDEX = -1;
});
}
// Filter dropdown (rendered by components/log_view.html): apply the selected tag filter
const filterSel = $("#log-filter");
if (filterSel) {
filterSel.addEventListener("change", () => {
LOG_FILTER = filterSel.value;
applyLogFilter();
});
}
});
/* ---------------------------------------------------------
Append ONE line
--------------------------------------------------------- */
function rollingAppend(lineText) {
const output = $("#log-output");
if (!output) return;
const div = document.createElement("div");
div.classList.add("log-line");
// Type detection
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]"))
div.classList.add("dl");
else if (lineText.includes("[PARSE]")) div.classList.add("parse");
else if (lineText.includes("[SAVE]")) div.classList.add("save");
else if (lineText.includes("[AUDIO]")) div.classList.add("audio");
else if (lineText.includes("[CTRL]")) div.classList.add("ctrl");
else if (lineText.includes("[ERROR]")) div.classList.add("error");
else div.classList.add("default");
div.textContent = lineText;
output.appendChild(div);
// Rolling limit
while (output.childNodes.length > MAX_LOG_LINES) {
output.removeChild(output.firstChild);
}
}
/* ---------------------------------------------------------
Primary entry: updateLogs()
Accepts:
{ logs:[...], last:N }
OR legacy:
{ lines:[...], last:N }
--------------------------------------------------------- */
function updateLogs(packet) {
const output = $("#log-output");
if (!output || !packet) return;
let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return;
lines.forEach((line) => rollingAppend(line));
// Correct unified delta index handling
if (packet.last !== undefined) {
LAST_LOG_INDEX = packet.last;
}
applyLogFilter();
autoScroll(output);
}
/* ---------------------------------------------------------
Delta polling global logs ONLY
(dashboard.js overrides logs per-book)
--------------------------------------------------------- */
function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
.then((r) => r.json())
.then((data) => {
const lines = data.lines || [];
if (lines.length > 0) {
lines.forEach((line) => rollingAppend(line));
LAST_LOG_INDEX = data.last;
}
})
.catch((err) => {
console.warn(">>> log_view.js pollLogs() error:", err);
});
}
setInterval(pollLogs, 2800);
console.log(">>> log_view.js LOADED");

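pollLogs() above assumes a delta endpoint: GET /logs?last_index=N returns only the lines newer than index N, shaped as { "lines": [...], "last": M }. A minimal, hypothetical Flask-side sketch of that contract (the real route lives in app.py and may differ):

from flask import Flask, request, jsonify

app = Flask(__name__)
UI_LOGS = []  # stand-in for the real global log buffer

@app.route("/logs")
def logs():
    last_index = int(request.args.get("last_index", -1))
    new_lines = UI_LOGS[last_index + 1:]   # delta since the client's last index
    return jsonify({
        "lines": new_lines,
        "last": len(UI_LOGS) - 1,          # index of the newest known line
    })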
@ -1,98 +0,0 @@
/* ========================================================
File: static/js/state_updater.js
Purpose:
- Poll /api/state/all
- Dispatch merged state to subscribers
(bookcard_controller, inspect_state, others)
- Pause polling when tab inactive
======================================================== */
console.log("[STATE-UPDATER] loaded");
const STATE_POLL_INTERVAL_MS = 2500;
const STATE_ENDPOINT = "/api/state/all";
let STATE_TIMER = null;
/* ========================================================
INIT
======================================================== */
document.addEventListener("DOMContentLoaded", () => {
initStateUpdater();
});
function initStateUpdater() {
const cards = document.querySelectorAll(".book-card");
if (cards.length === 0) {
console.log("[STATE-UPDATER] No bookcards found — skipping");
return;
}
console.log(`[STATE-UPDATER] Starting updater for ${cards.length} bookcards`);
startPolling(true);
document.addEventListener("visibilitychange", () => {
document.hidden ? stopPolling() : startPolling(true);
});
}
/* ========================================================
DISPATCH
======================================================== */
function dispatchState(entries) {
console.debug("[STATE] dispatch", entries.length);
// 1. Bookcards
if (typeof window.updateBookCardsFromState === "function") {
window.updateBookCardsFromState(entries);
}
// 2. Inspect state tables / other subscribers
window.dispatchEvent(new CustomEvent("state:update", { detail: entries }));
}
/* ========================================================
POLLING CONTROL
======================================================== */
function startPolling(immediate = false) {
if (STATE_TIMER) return;
console.log("[STATE-UPDATER] Start polling");
if (immediate) pollState();
STATE_TIMER = setInterval(pollState, STATE_POLL_INTERVAL_MS);
}
function stopPolling() {
if (!STATE_TIMER) return;
console.log("[STATE-UPDATER] Stop polling (tab inactive)");
clearInterval(STATE_TIMER);
STATE_TIMER = null;
}
/* ========================================================
POLL API
======================================================== */
async function pollState() {
if (document.hidden) return;
try {
const resp = await fetch(STATE_ENDPOINT, { cache: "no-store" });
if (!resp.ok) return;
const entries = await resp.json();
if (!Array.isArray(entries)) return;
dispatchState(entries);
} catch (e) {
console.error("[STATE-UPDATER] poll error", e);
}
}

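state_updater.js expects /api/state/all to return a JSON array in which each entry carries would_merge_to (used by bookcard_controller.js) plus sqlite/redis views (used by inspect_state.js). A plausible but hypothetical wiring of that endpoint to the inspector defined earlier in this diff:

from flask import Flask, jsonify
from scraper.utils.state_sync import inspect_books_state

app = Flask(__name__)

@app.route("/api/state/all")
def api_state_all():
    # One entry per book: book_idx, title, sqlite, redis, would_merge_to, card
    return jsonify(inspect_books_state())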
@ -1,35 +0,0 @@
<!-- File: templates/base.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>BookScraper</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" />
</head>
<body>
<!-- Global Navigation -->
{% include "components/nav.html" %}
<!-- Main Content Area -->
<main class="container">{% block content %}{% endblock %}</main>
<!-- JS -->
<script src="/static/js/app.js"></script>
<script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script>
<!-- GLOBAL STATE UPDATER -->
<script src="/static/js/state_updater.js"></script>
<script>
document.addEventListener("DOMContentLoaded", () => {
if (typeof initStateUpdater === "function") {
initStateUpdater();
}
});
</script>
</body>
</html>

@ -1,66 +0,0 @@
<!-- =======================================================================
File: templates/components/book_list_item.html
Purpose:
Dashboard weergave van één boek in de lijst.
Variabelen komen binnen via:
book.<veld>
→ Boek gebruikt nu uitsluitend book_idx als primaire sleutel
======================================================================= -->
<div class="book-list-item" data-book-idx="{{ book.book_idx }}">
<!-- Left area: title + metadata -->
<div class="book-info">
<div class="book-title">{{ book.title }}</div>
<div class="book-meta">
<span class="meta-label">IDX:</span> {{ book.book_idx }} {% if
book.last_update %}
<span class="meta-separator"></span>
<span class="meta-label">Updated:</span> {{ book.last_update }} {% endif
%}
</div>
</div>
<!-- Center area: Status -->
<div class="book-status">
<span class="status-badge status-{{ book.status|lower }}">
{{ book.status|capitalize }}
</span>
</div>
<!-- Right area: progress mini-bars -->
<div class="book-progress-mini">
<!-- Download progress -->
<div class="progress-mini-row">
<span class="mini-label">DL:</span>
{% set pct_dl = 0 %} {% if book.download_total > 0 %} {% set pct_dl = (100
* book.download_done / book.download_total) | round(0) %} {% endif %}
<div class="progress-mini-bar">
<div class="fill" style="width: {{ pct_dl }}%;"></div>
</div>
<span class="mini-value">{{ pct_dl }}%</span>
</div>
<!-- Audio progress -->
<div class="progress-mini-row">
<span class="mini-label">AU:</span>
{% set pct_au = 0 %} {% if book.audio_total > 0 %} {% set pct_au = (100 *
book.audio_done / book.audio_total) | round(0) %} {% endif %}
<div class="progress-mini-bar audio">
<div class="fill audio-fill" style="width: {{ pct_au }}%;"></div>
</div>
<span class="mini-value">{{ pct_au }}%</span>
</div>
</div>
<!-- Abort button -->
<div class="book-abort-area">
<button class="abort-btn" onclick="abortBookAjax('{{ book.book_idx }}')">
Abort
</button>
</div>
</div>

@ -1,90 +0,0 @@
{# ============================================================ File:
templates/components/bookcard.html Purpose: A single book card (dumb
component) ============================================================ #}
<div class="book-card {{ b.status }}" data-book-idx="{{ b.book_idx }}">
<!-- HIDE -->
<form
action="/hide/{{ b.book_idx }}"
method="POST"
class="hide-form"
onsubmit="return confirm('Dit boek verbergen?')"
>
<button class="icon-btn icon-hide" title="Verbergen">
<i class="fa-solid fa-xmark"></i>
</button>
</form>
<!-- COVER -->
<div class="book-cover">
{% if b.cover_path %}
<img
src="/{{ b.cover_path }}"
class="book-img"
data-field="cover"
alt="cover"
/>
{% else %}
<div class="book-img placeholder" data-field="cover">?</div>
{% endif %}
</div>
<!-- META -->
<div class="book-meta">
<!-- STATUS BADGE -->
{% if b.status %}
<span
class="status-badge status-{{ b.status }}"
title="
{% if b.status == 'done' %}Alle chapters en audio zijn compleet{% endif %}
{% if b.status == 'audio' %}Downloads compleet, audio wordt nog gegenereerd{% endif %}
{% if b.status == 'downloading' %}Bezig met downloaden{% endif %}
"
>
{{ b.status | upper }}
</span>
{% endif %}
<div class="book-title" data-field="title">{{ b.title }}</div>
<div class="book-author" data-field="author">{{ b.author }}</div>
<div class="book-created">
Geregistreerd: <span data-field="created_at">{{ b.created_at }}</span>
</div>
<!-- ACTIONS -->
<div class="book-actions">
<!-- START -->
<form action="/start" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-start" title="Start" data-action="start">
<i class="fa-solid fa-play"></i>
</button>
</form>
<!-- ABORT -->
<form action="/abort/{{ b.book_idx }}" method="POST">
<input type="hidden" name="book_idx" value="{{ b.book_idx }}" />
<button class="icon-btn icon-abort" title="Abort" data-action="abort">
<i class="fa-solid fa-stop"></i>
</button>
</form>
<form
method="post"
action="/inspect/statuscheck/{{ b.book_idx }}"
style="display: inline-block"
>
<button
type="submit"
class="icon-btn icon-statuscheck"
title="Herbereken status op basis van bestanden"
>
<i class="fa-solid fa-magnifying-glass-chart"></i>
</button>
</form>
</div>
</div>
<!-- PROGRESS -->
<div class="book-progress">{% include "components/progress_box.html" %}</div>
</div>

@ -1,44 +0,0 @@
<!-- =======================================================================
File: templates/components/log_view.html
Purpose: Reusable log viewer component for any page (dashboard/start/book)
Notes:
- Requires JS: /static/js/log_view.js
- Supports filtering by tag (e.g. [DL], [PARSE], [AUDIO], [CTRL], ...)
- Template expects optional variable `logs` (list[str])
======================================================================= -->
<div id="log-viewer" class="log-viewer">
<!-- ========================== HEADER ========================== -->
<div class="log-header">
<h2>Live Log</h2>
<div class="log-filters">
<label for="log-filter">Filter:</label>
<select id="log-filter">
<option value="ALL">All</option>
<option value="[DL]">Download</option>
<option value="[PARSE]">Parse</option>
<option value="[SAVE]">Save</option>
<option value="[AUDIO]">Audio</option>
<option value="[CTRL]">Controller</option>
<option value="[SCRAPING]">Scraping</option>
<option value="[BOOK]">Book</option>
<option value="[ERROR]">Errors</option>
</select>
<button id="log-clear" class="btn-small">Clear</button>
</div>
</div>
<!-- ========================== OUTPUT ========================== -->
<div id="log-output" class="log-output">
{% if logs and logs|length > 0 %} {% for line in logs %}
<div class="log-line">{{ line }}</div>
{% endfor %} {% else %}
<div class="log-empty">No logs yet…</div>
{% endif %}
</div>
</div>
<script src="/static/js/log_view.js"></script>

@ -1,40 +0,0 @@
<!-- =======================================================================
File: templates/components/nav.html
Purpose: Global navigation bar for BookScraper UI (improved version)
======================================================================= -->
<nav class="navbar">
<div class="nav-inner">
<!-- Branding / Home -->
<div class="nav-brand">
<a href="/">BookScraper</a>
</div>
<!-- Main navigation -->
<ul class="nav-links">
<li>
<a href="/dashboard" class="nav-item"> Dashboard </a>
</li>
<li>
<a href="/api/books" class="nav-item"> Active Books </a>
</li>
<li>
<a href="/debug/inspect_state" class="nav-item"> State overview </a>
</li>
<!-- Tools dropdown -->
<li class="nav-dropdown">
<span class="nav-item">Tools ▾</span>
<ul class="dropdown-menu">
<li><a href="/api/db/books">DB Viewer</a></li>
<li><a href="/debug/inspect_state">Inspect State</a></li>
<li><a href="/debug/sync_state">Sync State</a></li>
<li><a href="/debug/redis-keys">Redis Keys</a></li>
<li><a href="/debug/queues">queues</a></li>
</ul>
</li>
</ul>
</div>
</nav>

@ -1,34 +0,0 @@
<!-- =======================================================================
File: templates/components/progress_box.html
Purpose:
Dumb progress UI for a book card.
Initial values via Jinja, live updates via state_updater.js
======================================================================= -->
<div class="progress-box">
<!-- DOWNLOAD -->
<div class="progress-row">
<div class="progress-label">Download</div>
<div class="progressbar">
<div
class="progressbar-fill download"
data-field="download_pct"
style="width: 0%"
></div>
<div class="progressbar-text" data-field="download_text">0 / 0</div>
</div>
</div>
<!-- AUDIO -->
<div class="progress-row">
<div class="progress-label">Audio</div>
<div class="progressbar">
<div
class="progressbar-fill audio"
data-field="audio_pct"
style="width: 0%"
></div>
<div class="progressbar-text" data-field="audio_text">0 / 0</div>
</div>
</div>
</div>

@ -1,21 +0,0 @@
{# ============================================================ File:
templates/components/registered_books.html Purpose: Show a grid of
registered books. Each card is rendered via bookcard.html.
============================================================ #}
<section class="dashboard-section">
<h2>Geregistreerde boeken</h2>
{% if registered and registered|length > 0 %}
<div class="registered-grid">
{% for b in registered %} {% include "components/bookcard.html" %} {% endfor
%}
</div>
{% else %}
<p>Geen geregistreerde boeken.</p>
{% endif %}
</section>

@ -1,22 +0,0 @@
<!-- =======================================================================
File: templates/components/url_input.html
Purpose:
Reusable component for entering a book URL.
Used on landing pages or detail pages.
======================================================================= -->
<form method="POST" action="/init" class="url-form">
<label for="urls" class="url-label"> Book URL(s) one per line: </label>
<textarea
id="urls"
name="urls"
class="url-input"
rows="5"
placeholder="https://www.piaotia.com/bookinfo/6/6072.html
https://www.piaotia.com/bookinfo/3/3785.html"
required
></textarea>
<button type="submit" class="btn-primary url-submit">Register book(s)</button>
</form>

@ -1,38 +0,0 @@
<!-- =======================================================================
File: templates/dashboard/book_detail.html
Purpose:
Detail page for a single book_idx.
Shows progress (download/audio) + filters + live logs.
======================================================================= -->
{% extends "layout.html" %} {% block content %}
<div class="dashboard-detail">
<h1 class="page-title">{{ title }}</h1>
<p class="breadcrumb">
<a href="/dashboard">← Terug naar dashboard</a>
</p>
<!-- Progress box -->
<section id="progressSection">
{% include "components/progress_box.html" with book_idx=book_idx,
title=title, download_total=download_total, download_done=download_done,
audio_total=audio_total, audio_done=audio_done %}
</section>
<!-- Log view -->
<section class="log-section">
<h2>Live Log</h2>
{% include "components/log_view.html" %}
</section>
</div>
<!-- PAGE-SPECIFIC JS -->
<script>
const BOOK_IDX = "{{ book_idx }}";
</script>
<script src="/static/js/log_view.js"></script>
<script src="/static/js/dashboard.js"></script>
{% endblock %}

@ -1,46 +0,0 @@
{% extends "layout.html" %} {% block content %}
<div class="dashboard-container">
<!-- =======================================================================
File: templates/dashboard/dashboard.html
Purpose:
Functional dashboard:
• Start a new scrape via the URL input component
• Shows the list of active books (active state model)
• Shows global live logs
Requires:
- books: list of active books
- logs: list of global logs (optional)
======================================================================= -->
<!-- ===========================================================
URL INPUT — Start a new scrape
=========================================================== -->
<section class="dashboard-section">
<h2>Start nieuwe scrape</h2>
{% include "components/url_input.html" %}
</section>
<hr />
<!-- ===========================================================
BOOK LIST
=========================================================== -->
{% include "components/registered_books.html" %}
<hr />
<!-- ===========================================================
GLOBAL LIVE LOG VIEW
=========================================================== -->
<section class="dashboard-section">
<h2>Live log (globaal)</h2>
{# log_view always expects 'logs' — guarantee a list #} {% set logs = logs or
[] %} {% include "components/log_view.html" %}
</section>
</div>
<!-- JS -->
<script src="/static/js/dashboard.js"></script>
{% endblock %}

@ -1,95 +0,0 @@
{# ============================================================ File:
templates/debug/inspect_state.html Purpose: Inspect SQLite vs Redis state per
book_idx - Initial render via Jinja - Live updates via inspect_state.js -
BookCard is server-rendered and NEVER replaced - Only the right-side state table
is updated dynamically
============================================================ #} {% extends
"layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1>
<style>
.state-block {
display: grid;
grid-template-columns: 380px 1fr;
gap: 20px;
margin-bottom: 35px;
padding: 18px;
border: 1px solid #444;
background: #222;
border-radius: 8px;
}
.state-table {
width: 100%;
border-collapse: collapse;
}
.state-table th,
.state-table td {
border: 1px solid #555;
padding: 6px 10px;
}
.state-table th {
background: #333;
color: #fff;
}
.state-table td {
background: #2a2a2a;
color: #ddd;
}
.same {
color: #9f9 !important;
}
.diff {
color: #ff7b7b !important;
font-weight: bold;
}
</style>
<div id="state-container">
{% for entry in results %}
<div class="state-block" data-book-idx="{{ entry.book_idx }}">
<!-- LEFT: BookCard (server-rendered, NEVER replaced) -->
<div>
{% if entry.card %} {% with b = entry.card %} {% include
"components/bookcard.html" %} {% endwith %} {% else %}
<strong>{{ entry.book_idx }}</strong>
{% endif %}
</div>
<!-- RIGHT: State table (updated by JS) -->
<div>
<table class="state-table">
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged</th>
</tr>
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged
= entry.would_merge_to %} {% for field in [ "status", "chapters_total",
"downloaded", "chapters_download_done", "chapters_download_skipped",
"parsed", "chapters_parsed_done", "audio_done", "audio_skipped",
"last_update" ] %}
<tr>
<th>{{ field }}</th>
<td>{{ sql.get(field, "") }}</td>
<td>{{ redis.get(field, "") }}</td>
<td>{{ merged.get(field, "") }}</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endfor %}
</div>
{% endblock %} {% block scripts %}
<script src="/static/js/inspect_state.js"></script>
{% endblock %}

@ -1,91 +0,0 @@
{% extends "layout.html" %} {% block content %}
<h1>Celery Queue Debug</h1>
<style>
.debug-section {
margin-bottom: 40px;
}
.debug-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}
.debug-table th,
.debug-table td {
border: 1px solid #444;
padding: 6px 10px;
}
.debug-table th {
background: #333;
color: #fff;
}
pre {
background: #1e1e1e;
color: #ddd;
padding: 10px;
overflow-x: auto;
}
code {
color: #9cf;
}
</style>
<div class="debug-section">
<h2>Workers</h2>
<h3>Active Tasks</h3>
<pre>{{ workers_active | tojson(indent=2) }}</pre>
<h3>Reserved</h3>
<pre>{{ workers_reserved | tojson(indent=2) }}</pre>
<h3>Scheduled</h3>
<pre>{{ workers_scheduled | tojson(indent=2) }}</pre>
</div>
<hr />
<div class="debug-section">
<h2>Queues</h2>
{% for q in queues %}
<div class="debug-queue">
<h3>{{ q.name }} ({{ q.length }} items)</h3>
<table class="debug-table">
<tr>
<th>Redis Key</th>
<td>{{ q.redis_key }}</td>
</tr>
<tr>
<th>Length</th>
<td>{{ q.length }}</td>
</tr>
<tr>
<th>Items (first 30)</th>
<td>
{% if q["items"] %}
<ul style="margin: 0; padding-left: 20px">
{% for item in q["items"] %}
<li><code>{{ item | e }}</code></li>
{% endfor %}
</ul>
{% else %}
<i>No items</i>
{% endif %}
</td>
</tr>
</table>
</div>
{% endfor %}
</div>
<script>
setInterval(() => {
window.location.reload();
}, 5000);
</script>
{% endblock %}

@ -1,23 +0,0 @@
<!-- =======================================================================
File: templates/home.html
Purpose:
New landing page for starting a scrape.
Does NOT replace existing index.html.
Uses reusable components (url_input).
Redirects to /start?url=...
======================================================================= -->
{% extends "layout.html" %} {% block content %}
<div class="landing-container">
<h1 class="landing-title">Start a New Book Scrape</h1>
<!-- Reusable URL input component -->
{% include "components/url_input.html" %}
<div class="landing-links">
<a href="/dashboard">→ Go to Dashboard</a>
</div>
</div>
{% endblock %}

@ -1,53 +1,34 @@
main:

<!DOCTYPE html>
<html lang="nl">
  <head>
    <meta charset="UTF-8" />
    <title>BookScraper</title>
    <style>
      body {
        font-family: Arial, sans-serif;
        padding: 40px;
        max-width: 600px;
        margin: auto;
      }
      h1 {
        margin-bottom: 20px;
      }
      input[type="text"] {
        width: 100%;
        padding: 12px;
        font-size: 16px;
        border: 1px solid #ccc;
        border-radius: 6px;
      }
      button {
        margin-top: 20px;
        padding: 12px 20px;
        background: #007bff;
        color: white;
        border: none;
        border-radius: 6px;
        font-size: 16px;
        cursor: pointer;
      }
      button:hover {
        background: #0056b3;
      }
    </style>
  </head>
  <body>
    <h1>BookScraper WebGUI</h1>
    <form action="/init" method="POST">
      <label for="url">Geef een boek-URL op:</label><br /><br />
      <input
        type="text"
        id="url"
        name="url"
        placeholder="https://example.com/book/12345"
        required
      />
      <button type="submit">Start Scraping</button>
    </form>
  </body>
</html>

feat/download-progress-abort:

<!DOCTYPE html>
<html lang="nl">
<head>
    <meta charset="UTF-8">
    <title>BookScraper</title>
    <style>
        body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
        h1 { margin-bottom: 20px; }
        input[type="text"] {
            width: 100%; padding: 12px; font-size: 16px;
            border: 1px solid #ccc; border-radius: 6px;
        }
        button {
            margin-top: 20px;
            padding: 12px 20px;
            background: #007bff; color: white;
            border: none; border-radius: 6px;
            font-size: 16px; cursor: pointer;
        }
        button:hover { background: #0056b3; }
    </style>
</head>
<body>

    <h1>BookScraper WebGUI</h1>

    <form action="/start" method="POST">
        <label for="url">Geef een boek-URL op:</label><br><br>
        <input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
        <button type="submit">Start Scraping</button>
    </form>

</body>
</html>

@ -1,115 +0,0 @@
{% extends "layout.html" %} {% block content %}
<h2>Statuscheck Inspect</h2>
{% if error %}
<div class="error"><strong>Fout:</strong> {{ error }}</div>
{% else %}
<!-- ===================================================== -->
<!-- BOEK -->
<!-- ===================================================== -->
<h3>Boek</h3>
<table class="kv">
<tr>
<th>Book idx</th>
<td>{{ result.book_idx }}</td>
</tr>
<tr>
<th>Pad</th>
<td>{{ result.filesystem.book_dir }}</td>
</tr>
<tr>
<th>Bestaat</th>
<td>{{ result.filesystem.exists }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- FILESYSTEM -->
<!-- ===================================================== -->
<h3>Filesystem (source of truth)</h3>
<table class="kv">
<tr>
<th>Volumes</th>
<td>{{ result.filesystem.volumes }}</td>
</tr>
<tr>
<th>Chapters (.txt)</th>
<td>{{ result.filesystem.chapters_txt }}</td>
</tr>
<tr>
<th>Audio (.m4b)</th>
<td>{{ result.filesystem.audio_files }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- SQL -->
<!-- ===================================================== -->
<h3>SQL snapshot</h3>
<h4>Voor</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_before.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_before.audio_done }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ result.sql_before.status }}</td>
</tr>
</table>
<h4>Na</h4>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ result.sql_after.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ result.sql_after.audio_done }}</td>
</tr>
<tr>
<th>Last update</th>
<td>{{ result.sql_after.last_update }}</td>
</tr>
</table>
<!-- ===================================================== -->
<!-- REPOSITORY -->
<!-- ===================================================== -->
<h3>Repository merged state (UI input)</h3>
<table class="kv">
<tr>
<th>Downloaded</th>
<td>{{ repo_state.downloaded }}</td>
</tr>
<tr>
<th>Audio done</th>
<td>{{ repo_state.audio_done }}</td>
</tr>
<tr>
<th>Chapters total</th>
<td>{{ repo_state.chapters_total }}</td>
</tr>
</table>
<details>
<summary>Raw repository state</summary>
<pre>{{ repo_state | tojson(indent=2) }}</pre>
</details>
{% endif %}
<hr />
<a href="/dashboard">← Terug naar dashboard</a>
{% endblock %}
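
The removed inspect page treats the filesystem as the source of truth for volumes, chapter texts, and audio files. A small illustrative sketch of how those counts could be derived (the paths, naming conventions, and helper name are assumptions):

```python
from pathlib import Path

def filesystem_counts(book_dir: str) -> dict:
    """Hypothetical helper mirroring the page's Volumes / Chapters (.txt) / Audio (.m4b) rows."""
    root = Path(book_dir)
    return {
        "exists": root.exists(),
        # Volume folders are assumed to be named v1, v2, ...
        "volumes": sum(1 for p in root.glob("v*") if p.is_dir()),
        "chapters_txt": sum(1 for _ in root.rglob("*.txt")),
        "audio_files": sum(1 for _ in root.rglob("*.m4b")),
    }
```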

@ -1,44 +0,0 @@
<!DOCTYPE html>
<html lang="nl">
<head>
<!-- =======================================================================
File: templates/layout.html
Purpose:
Globale layout voor alle paginas
======================================================================= -->
<meta charset="UTF-8" />
<title>BookScraper</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" />
<link rel="stylesheet" href="/static/css/bookcard.css" />
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css"
/>
<!-- GLOBAL HELPERS (moet ALTIJD boven alles geladen worden) -->
<script src="/static/js/helpers.js"></script>
</head>
<body>
{% include "components/nav.html" %}
<main class="container">{% block content %}{% endblock %}</main>
<footer class="footer">
BookScraper © 2025 — Powered by Celery + Redis
</footer>
{% block scripts %}{% endblock %}
<script src="/static/js/bookcard_controller.js"></script>
<script src="/static/js/state_updater.js"></script>
<script src="/static/js/dashboard.js"></script>
<!-- GLOBAL APP LOGIC (altijd als laatste) -->
<script src="/static/js/app.js"></script>
</body>
</html>

@ -31,21 +31,6 @@
   border-radius: 6px;
   font-size: 13px;
 }
-/* NEW: Clear button */
-#clearLogBtn {
-  margin-bottom: 10px;
-  padding: 8px 16px;
-  background: #777;
-  color: white;
-  border: none;
-  border-radius: 6px;
-  cursor: pointer;
-}
-#clearLogBtn:hover {
-  background: #555;
-}
 #abortBtn {
   padding: 12px 20px;
   background: #d9534f;
@ -66,7 +51,6 @@
   display: none;
 }
 </style>
-s
 </head>
 <body>
@ -84,24 +68,6 @@
 <div class="box">{{ message }}</div>
 {% endif %}
-<!-- COVER -->
-{% if book_title %}
-<div class="box">
-  <strong>Cover:</strong><br />
-  <img
-    src="/output/{{ book_title }}/cover.jpg"
-    alt="Cover"
-    style="
-      margin-top: 10px;
-      max-width: 250px;
-      border: 1px solid #ccc;
-      border-radius: 4px;
-    "
-    onerror="this.style.display='none'"
-  />
-</div>
-{% endif %}
 <div id="statusBox" class="box hidden">
   <div id="statusLine">Status: bezig…</div>
   <div id="progressText"></div>
@ -119,11 +85,7 @@
 </div>
 <div class="box">
-  <strong>Live log:</strong><br />
-  <!-- NEW BUTTON -->
-  <button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
+  <strong>Live log:</strong>
   <div id="logbox" class="logbox"></div>
 </div>
@ -160,6 +122,7 @@
 }
 function pollProgress() {
+  // FIX → UI blijft renderen, polling stopt alleen herhaling
   if (!bookId) return;
   fetch(`/progress/${bookId}`)
@ -174,6 +137,7 @@
   p.skipped || 0
 } | Failed: ${p.failed || 0}`;
+  // FAILED LIST
   const failedBox = document.getElementById("failedBox");
   const failedList = document.getElementById("failedList");
@ -187,6 +151,7 @@
   });
 }
+  // STATUS
 if (p.abort) {
   document.getElementById("statusLine").innerText = "ABORTED";
   polling = false;
@ -197,6 +162,7 @@
   document.getElementById("statusLine").innerText = "Bezig…";
 }
+  // STOP repetitieve polling, maar blijf renderen
 if (polling) setTimeout(pollProgress, 1000);
 })
 .catch(() => {
@ -224,17 +190,6 @@
 })
 .catch(() => setTimeout(pollLogs, 1500));
 }
-// =========================================================
-// NEW: Clear logs button handler
-// =========================================================
-function clearLogs() {
-  fetch("/clear-logs", { method: "POST" })
-    .then(() => {
-      document.getElementById("logbox").innerHTML = "";
-    })
-    .catch((e) => console.error("Clear logs failed:", e));
-}
 </script>
 </body>
 </html>
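
The removed clearLogs() handler above posts to /clear-logs and then empties the log box client-side. The matching server route is not part of this hunk; a minimal hypothetical Flask counterpart could look like the sketch below (the buffer name and reset mechanism are assumptions):

```python
from flask import Flask, jsonify

app = Flask(__name__)

# Hypothetical in-memory UI log buffer; the real app keeps its logs elsewhere.
UI_LOGS: list[str] = []

@app.route("/clear-logs", methods=["POST"])
def clear_logs():
    # Empty the server-side buffer so the next log poll starts from a clean slate.
    UI_LOGS.clear()
    return jsonify({"cleared": True})
```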

@ -57,9 +57,7 @@ Copyright ©=
 本站立场无关=
 均由网友发表或上传=
 感谢各位书友的支持,您的支持就是我们最大的动力
-飘天文学www.piaotia.com
-感谢各位书友的支持
-您的支持就是我们最大的动力
 # ---------- COMMON NOISE ----------
 广告=
 广告位=

@ -1,13 +0,0 @@
#!/bin/sh
# mp4info shim for m4b-tool (ffprobe-based)
if [ -z "$1" ]; then
echo "Usage: mp4info <file>" >&2
exit 1
fi
# ffprobe outputs float seconds; m4b-tool expects an integer
ffprobe -v error \
-show_entries format=duration \
-of default=noprint_wrappers=1:nokey=1 \
"$1" | awk '{ printf "%d\n", ($1 + 0.5) }'

@ -5,7 +5,7 @@ import requests
 from io import BytesIO
 from bs4 import BeautifulSoup
 from scraper.logger import log_debug
-from scraper.utils.utils import clean_text
+from scraper.utils import clean_text
 from urllib.parse import urljoin
@ -103,11 +103,8 @@ class ChapterDownloader:
 collecting = True
 continue
-text = (
-    sib.get_text("\n", strip=True)
-    if hasattr(sib, "get_text")
-    else str(sib).strip()
-)
+text = sib.get_text("\n", strip=True) if hasattr(
+    sib, "get_text") else str(sib).strip()
 if text:
     parts.append(text)
@ -124,7 +121,6 @@ class ChapterDownloader:
 vdir = f"{output_base}/v{volume}"
 import os
 os.makedirs(vdir, exist_ok=True)
 fname = f"{number:05d}_{title}.txt"

File diff suppressed because it is too large