extra book metadata

feature/bookstate-progress-fix
peter.fong 1 week ago
parent 292c9246a1
commit feb8ca60d7

@ -135,6 +135,16 @@ docker compose down
docker compose build
docker compose up
docker compose up -d
docker compose build web && docker compose up web
docker compose build worker_download && docker compose up worker_download
docker compose up web
docker compose build web
docker compose restart web
tar \
--exclude="**pycache**" \
--exclude="_/**pycache**/_" \

@ -6,29 +6,43 @@ from dotenv import load_dotenv
load_dotenv()
import os
import redis
from flask import Flask, render_template, request, jsonify, send_from_directory
from flask import (
Flask,
render_template,
request,
jsonify,
send_from_directory,
redirect,
url_for,
)
print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from db.db import init_db
from celery.result import AsyncResult
from db.db import init_db
from db.repository import (
get_registered_books,
fetch_book,
fetch_all_books,
get_progress,
set_status, # used by the /hide route below
)
from scraper.logger import log_debug
from scraper.abort import set_abort
from scraper.progress import get_progress
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.state import state as r
from scraper.logger_decorators import logcall
from scraper.utils.state_sync import sync_books_from_redis
from scraper.services.init_service import InitService
from db.repository import get_registered_books
# INIT DB
init_db()
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING
# =====================================================
@ -36,6 +50,7 @@ OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
@ -46,22 +61,29 @@ def serve_output(filename):
@app.route("/", methods=["GET"])
@logcall
def index():
return render_template("index.html")
return redirect(url_for("dashboard"))
@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
logs_list = get_ui_logs() or []
# Filter hidden books ONLY for GUI
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
registered=reg, # SQLite (filtered)
logs=logs_list,
)
@app.route("/book/<book_id>")
@logcall
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
return render_template(
@ -73,20 +95,19 @@ def book_detail(book_id):
# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT)
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
# =====================================================
# CORRECT PATH — services/ is root-level
@app.route("/init", methods=["POST"])
@logcall
def init_book():
"""
INIT-flow:
- user enters URL
- lightweight metadata fetch
- metadata fetch
- insert into SQLite as 'registered'
- return dashboard HTML (NOT JSON)
- return dashboard
"""
url = request.form.get("url", "").strip()
@ -103,39 +124,63 @@ def init_book():
result = InitService.execute(url)
msg = f"Boek geregistreerd: {result.get('title')}"
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
message=msg,
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
books=list_active_books(),
registered=reg,
logs=get_ui_logs(),
)
except Exception as e:
log_debug(f"[INIT] ERROR: {e}")
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
error=f"INIT mislukt: {e}",
books=list_active_books(),
registered=get_registered_books(),
registered=reg,
logs=get_ui_logs(),
)
@app.route("/hide/<book_id>", methods=["POST"])
@logcall
def hide_registered_book(book_id):
"""
Soft-delete/hide voor GUI.
De DB blijft intact.
"""
# try:
# hide_book(book_id)
# return redirect("/dashboard")
# # return jsonify({"status": "ok", "hidden": book_id})
# except Exception as e:
# return jsonify({"status": "error", "message": str(e)}), 500
@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
url = request.form.get("url", "").strip()
"""
Start a FULL scrape from a registered INIT record.
"""
book_id = request.form.get("book_id")
if not book_id:
return jsonify({"status": "error", "message": "book_id ontbreekt"}), 400
book = fetch_book(book_id)
if not book:
return jsonify({"status": "error", "message": "Boek niet gevonden"}), 404
url = book.get("book_url")
if not url:
return render_template(
"dashboard/dashboard.html",
error="Geen URL opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
return jsonify({"status": "error", "message": "book_url ontbreekt"}), 500
reset_ui_logs()
log_debug(f"[WEB] Scraping via Celery: {url}")
log_debug(f"[WEB] Starting FULL scrape for book_id={book_id}, url={url}")
async_result = celery_app.send_task(
"scraper.tasks.scraping.start_scrape_book",
@ -143,16 +188,19 @@ def start_scraping():
queue="scraping",
)
reg = [b for b in get_registered_books() if b.get("status") != "hidden"]
return render_template(
"dashboard/dashboard.html",
scraping_task_id=async_result.id,
books=list_active_books(),
registered=get_registered_books(),
registered=reg,
logs=get_ui_logs(),
)
@app.route("/abort/<book_id>", methods=["POST"])
@logcall
def abort_download(book_id):
log_debug(f"[WEB] Abort requested for book: {book_id}")
set_abort(book_id)
@ -165,27 +213,32 @@ def abort_download(book_id):
@app.route("/api/books")
@logcall
def api_books():
return jsonify(list_active_books())
@app.route("/api/book/<book_id>/status")
@logcall
def api_book_status(book_id):
return jsonify(getStatus(book_id))
@app.route("/api/book/<book_id>/logs")
@logcall
def api_book_logs(book_id):
logs = r.lrange(f"logs:{book_id}", 0, -1) or []
return jsonify(logs)
@app.route("/progress/<book_id>")
@logcall
def progress(book_id):
return jsonify(get_progress(book_id))
@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
@ -196,32 +249,66 @@ def celery_result(task_id):
@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
return jsonify({"status": "ok"})
@app.route("/logs", methods=["GET"])
@logcall
def logs():
# last_index comes from the client (default = -1 on the first call)
try:
last_index = int(request.args.get("last_index", -1))
except (TypeError, ValueError):
last_index = -1
new_lines, total = get_ui_logs_delta(last_index)
return jsonify({"lines": new_lines, "total": total})
# Fetch the full current log list
all_logs = get_ui_logs() or []
# Delta: all lines with index > last_index
new_lines = []
new_last = last_index
for idx, line in enumerate(all_logs):
if idx > last_index:
new_lines.append(line)
new_last = idx
return jsonify({"lines": new_lines, "last": new_last})
# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
results = sync_books_from_redis()
return {"status": "ok", "synced": results}
from scraper.utils.state_sync import inspect_books_state
@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
"""
Shows:
- raw SQLite values,
- raw Redis values,
- what the merged result WOULD be.
No writes happen.
"""
results = inspect_books_state()
return render_template("debug/inspect_state.html", results=results)
@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
cursor = 0
results = {}
while True:
cursor, keys = r.scan(cursor, match="*", count=200)
for k in keys:
@ -231,22 +318,17 @@ def debug_redis_keys():
results[k] = "<non-string value>"
if cursor == 0:
break
return jsonify(results)
# =====================================================
# DB DEBUG: LIST ALL BOOKS FROM SQLITE
# DB DEBUG
# =====================================================
from db.repository import fetch_all_books
@app.route("/api/db/books")
@logcall
def api_db_books():
"""
Return ALL books stored in SQLite including INIT-only entries.
Useful to verify that /init wrote correct metadata.
"""
try:
books = fetch_all_books()
return jsonify({"status": "ok", "books": books})
@ -254,11 +336,74 @@ def api_db_books():
return jsonify({"status": "error", "message": str(e)}), 500
# =============================================
# DEBUG QUEUE VIEW (HTML)
# =============================================
from flask import render_template
from urllib.parse import urlparse
import redis
import os
from celery_app import celery_app
@app.route("/debug/queues")
def debug_queues():
insp = celery_app.control.inspect()
workers_active = insp.active() or {}
workers_scheduled = insp.scheduled() or {}
workers_reserved = insp.reserved() or {}
# ---- Redis connection ----
redis_url = os.getenv("REDIS_BROKER")
parsed = urlparse(redis_url)
r = redis.Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/") or 0),
decode_responses=True,
)
queue_names = ["scraping", "controller", "download", "parse", "save", "audio"]
queues = []
for q in queue_names:
key = f"celery:{q}"
try:
queues.append(
{
"name": q,
"redis_key": key,
"length": r.llen(key),
"items": r.lrange(key, 0, 30), # first 30 entries
}
)
except Exception as e:
queues.append(
{
"name": q,
"redis_key": key,
"length": "ERR",
"items": [str(e)],
}
)
return render_template(
"debug/queues.html",
queues=queues,
workers_active=workers_active,
workers_reserved=workers_reserved,
workers_scheduled=workers_scheduled,
)
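# Note (assumption): with the default Kombu Redis transport, a queue's
# pending messages live in a list keyed by the plain queue name (e.g.
# "scraping"), not "celery:scraping", unless a global_keyprefix transport
# option is configured. If the lengths above always read 0, this key shape
# is the first thing to check.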
# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
@logcall
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
@ -280,6 +425,7 @@ def getStatus(book_id):
}
@logcall
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):

@ -54,6 +54,7 @@ def main():
"-l",
"INFO",
"--pool=prefork",
"--concurrency=2",
]
print("[AUDIO-LOCAL] Launching Celery via subprocess…")

@ -32,6 +32,17 @@ celery_app = Celery(
],
)
# >>>>> PLACE THIS HERE <<<<<
celery_app.conf.update(
worker_redirect_stdouts_level="WARNING",
task_send_sent_event=False,
resultrepr_maxsize=0,
worker_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] %(message)s",
)
# >>>>> UP TO HERE <<<<<
celery_app.conf.task_routes = {
"scraper.tasks.scraping.*": {"queue": "scraping"},
"scraper.tasks.controller_tasks.*": {"queue": "controller"},

@ -60,9 +60,10 @@ def init_db():
book_id TEXT PRIMARY KEY,
title TEXT,
author TEXT,
description TEXT,
cover_url TEXT,
cover_path TEXT,
book_url TEXT,
chapters_total INTEGER,
@ -72,6 +73,7 @@ def init_db():
audio_done INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
processdate DATETIME,
last_update DATETIME
);
"""
@ -79,14 +81,38 @@ def init_db():
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE: add description column if missing
# SCHEMA UPGRADE UTIL
# --------------------------------------------------------
def add_column(name, type_):
try:
conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};")
except Exception:
pass # column already exists (SQLite lacks ADD COLUMN IF NOT EXISTS)
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
if "description" not in colnames:
conn.execute("ALTER TABLE books ADD COLUMN description TEXT;")
conn.commit()
# Existing: ensure new metadata fields exist
add_column("description", "TEXT")
add_column("cover_path", "TEXT")
add_column("book_url", "TEXT")
# --------------------------------------------------------
# NEW FIELDS — MATCH REDIS STATE MODEL (future-proof)
# These do NOT change logic, but enable repository snapshot sync.
# --------------------------------------------------------
# Download counters
add_column("chapters_download_done", "INTEGER DEFAULT 0")
add_column("chapters_download_skipped", "INTEGER DEFAULT 0")
# Audio counters
add_column("audio_skipped", "INTEGER DEFAULT 0")
# Optional future fields
add_column("audio_total", "INTEGER DEFAULT 0")
conn.commit()
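# The try/except in add_column works because SQLite raises an
# OperationalError ("duplicate column name") when the column is already
# present. An explicit variant (sketch, same effect) mirrors the old
# PRAGMA-based check that this commit removes:
#
#     def add_column_checked(name, type_):
#         cols = [c[1] for c in conn.execute("PRAGMA table_info(books);")]
#         if name not in cols:
#             conn.execute(f"ALTER TABLE books ADD COLUMN {name} {type_};")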
# ------------------------------------------------------------
@ -124,6 +150,5 @@ def _raw_get_book(book_id):
def _raw_get_all_books():
conn = get_db()
# unchanged
cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;")
return [dict(row) for row in cur.fetchall()]

@ -1,60 +1,164 @@
# ============================================================
# File: db/repository.py
# Purpose:
# High-level BookScraper database interface.
# This is the ONLY module Celery tasks and Flask should use.
# Unified façade for BookScraper database state.
#
# New additions for INIT-flow:
# - register_book()
# - update_book_after_full_scrape()
# - get_registered_books()
# - get_active_books()
#
# Existing functions remain unchanged for backward compatibility.
# Responsibilities:
# - Route metadata → SQLite
# - Route counters → Redis (live) + SQLite (snapshot)
# - Provide a clean API for tasks and Flask UI
# ============================================================
from db.db import (
upsert_book,
_raw_get_book,
_raw_get_all_books,
get_db,
)
from scraper.logger_decorators import logcall
from logbus.publisher import log
import redis
import os
import time
# ============================================================
# SQL low-level engines (snapshot storage)
# ============================================================
from db.state_sql import (
sql_fetch_book,
sql_fetch_all_books,
sql_set_status,
sql_set_chapters_total,
sql_register_book,
sql_update_book,
sql_inc_downloaded,
sql_inc_parsed,
sql_inc_audio_done,
sql_inc_audio_skipped,
)
# ============================================================
# REDIS low-level engines (live counters)
# ============================================================
from db.state_redis import (
redis_set_status,
redis_set_chapters_total,
redis_inc_download_done,
redis_inc_download_skipped,
redis_inc_parsed_done,
redis_inc_audio_done,
redis_inc_audio_skipped,
)
# ============================================================
# Redis setup for legacy progress paths
# ============================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
_r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ============================================================
# INTERNAL — legacy progress helpers
# ============================================================
def _legacy_set_total(book_id, total):
_r.set(f"progress:{book_id}:total", total)
def _legacy_inc_completed(book_id):
_r.incr(f"progress:{book_id}:completed")
def _legacy_inc_skipped(book_id):
_r.incr(f"progress:{book_id}:skipped")
def _legacy_inc_failed(book_id):
_r.incr(f"progress:{book_id}:failed")
# ------------------------------------------------------------
# FETCH OPERATIONS
# ------------------------------------------------------------
def _legacy_add_failed_chapter(book_id, chapter, reason):
entry = f"Chapter {chapter}: {reason}"
_r.rpush(f"progress:{book_id}:failed_list", entry)
def _legacy_get_failed_list(book_id):
return _r.lrange(f"progress:{book_id}:failed_list", 0, -1)
def _legacy_get_progress(book_id):
total = int(_r.get(f"progress:{book_id}:total") or 0)
completed = int(_r.get(f"progress:{book_id}:completed") or 0)
skipped = int(_r.get(f"progress:{book_id}:skipped") or 0)
failed = int(_r.get(f"progress:{book_id}:failed") or 0)
abort = _r.exists(f"abort:{book_id}") == 1
failed_list = _legacy_get_failed_list(book_id)
return {
"book_id": book_id,
"total": total,
"completed": completed,
"skipped": skipped,
"failed": failed,
"failed_list": failed_list,
"abort": abort,
}
# ============================================================
# PUBLIC — UI-ready legacy progress access
# ============================================================
@logcall
def get_progress(book_id):
return _legacy_get_progress(book_id)
@logcall
def add_failed_chapter(book_id, chapter, reason):
_legacy_add_failed_chapter(book_id, chapter, reason)
@logcall
def get_failed_list(book_id):
return _legacy_get_failed_list(book_id)
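# Legacy progress key layout in Redis (exactly the keys used above):
#
#     progress:<book_id>:total        int as string (SET)
#     progress:<book_id>:completed    int as string (INCR)
#     progress:<book_id>:skipped      int as string (INCR)
#     progress:<book_id>:failed       int as string (INCR)
#     progress:<book_id>:failed_list  list of "Chapter N: reason" entries
#     abort:<book_id>                 existence = abort requested
#
# Example (values illustrative): get_progress("12345")
#     # → {"book_id": "12345", "total": 100, "completed": 40, "skipped": 2,
#     #    "failed": 1, "failed_list": [...], "abort": False}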
# ============================================================
# FETCH OPERATIONS (SQLite snapshot)
# ============================================================
@logcall
def fetch_book(book_id):
"""Return a single book dict or None."""
return _raw_get_book(book_id)
return sql_fetch_book(book_id)
@logcall
def fetch_all_books():
"""Return all books ordered newest → oldest."""
return _raw_get_all_books()
return sql_fetch_all_books()
# ============================================================
# NEW — INIT-FLOW SUPPORT
# INIT-FLOW (SQLite metadata only)
# ============================================================
@logcall
def register_book(
book_id,
title,
author=None,
description=None,
cover_url=None,
cover_path=None,
book_url=None,
):
def register_book(book_id, title, author=None, description=None, cover_url=None):
"""
Create a new book entry with initial metadata.
Called when user enters a URL and presses INIT.
"""
fields = {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path,
"book_url": book_url,
"chapters_total": 0,
"status": "registered",
}
upsert_book(book_id, **fields)
log(f"[DB] Registering new book={book_id} title='{title}'")
sql_register_book(book_id, fields)
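# Typical INIT-flow call (sketch; all values illustrative):
#
#     register_book(
#         book_id="12345",
#         title="Example Title",
#         author="Example Author",
#         description="…",
#         cover_url="https://example.com/12345.jpg",
#         cover_path="static/covers/12345.jpg",
#         book_url="https://example.com/book/12345.html",
#     )
#     # → SQLite row with status='registered' and chapters_total=0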
@logcall
def update_book_after_full_scrape(
book_id,
title=None,
@ -63,10 +167,6 @@ def update_book_after_full_scrape(
cover_url=None,
chapters_total=None,
):
"""
Called after a FULL scrape when chapters are known.
Moves the book into 'active' state.
"""
fields = {}
if title is not None:
@ -82,94 +182,122 @@ def update_book_after_full_scrape(
fields["status"] = "active"
upsert_book(book_id, **fields)
log(f"[DB] update full scrape metadata book={book_id}")
sql_update_book(book_id, fields)
# ============================================================
# ACTIVE BOOK LISTS
# ============================================================
@logcall
def get_registered_books():
"""
Return books registered but not yet scraped.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books WHERE status='registered'
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
all_books = sql_fetch_all_books()
return [b for b in all_books if b.get("status") == "registered"]
@logcall
def get_active_books():
"""
Return books currently in progress.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books
WHERE status IN ('active', 'downloading')
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
# ------------------------------------------------------------
# BOOK CREATION / METADATA (existing)
# ------------------------------------------------------------
def create_or_update_book(
book_id,
title=None,
author=None,
chapters_total=None,
cover_url=None,
cover_path=None,
status=None,
):
fields = {}
all_books = sql_fetch_all_books()
log(f"[DB] Fetched all books for active filter, total={len(all_books)}")
return [b for b in all_books if b.get("status") in ("active", "downloading")]
if title is not None:
fields["title"] = title
if author is not None:
fields["author"] = author
if chapters_total is not None:
fields["chapters_total"] = chapters_total
if cover_url is not None:
fields["cover_url"] = cover_url
if cover_path is not None:
fields["cover_path"] = cover_path
if status is not None:
fields["status"] = status
if fields:
upsert_book(book_id, **fields)
# ============================================================
# STATUS MANAGEMENT
# ============================================================
@logcall
def set_status(book_id, status):
log(f"[DB] Setting status for {book_id} to '{status}'")
redis_set_status(book_id, status)
sql_set_status(book_id, status)
# ------------------------------------------------------------
# STATUS MANAGEMENT (existing)
# ------------------------------------------------------------
def set_status(book_id, status):
upsert_book(book_id, status=status)
# ============================================================
# CHAPTER TOTALS
# ============================================================
@logcall
def set_chapters_total(book_id, total):
log(f"[DB] Setting chapter total for {book_id} to {total}")
redis_set_chapters_total(book_id, total)
sql_set_chapters_total(book_id, total)
_legacy_set_total(book_id, total) # integrate legacy progress
# ============================================================
# COUNTERS — DOWNLOAD
# ============================================================
@logcall
def inc_download_done(book_id, amount=1):
log(f"[DB] Incrementing download done for {book_id} by {amount}")
redis_inc_download_done(book_id, amount)
sql_inc_downloaded(book_id, amount)
_legacy_inc_completed(book_id)
@logcall
def inc_download_skipped(book_id, amount=1):
log(f"[DB] Incrementing download skipped for {book_id} by {amount}")
redis_inc_download_skipped(book_id, amount)
_legacy_inc_skipped(book_id)
# ============================================================
# COUNTERS — PARSE
# ============================================================
@logcall
def inc_parsed_done(book_id, amount=1):
log(f"[DB] Incrementing parsed done for {book_id} by {amount}")
redis_inc_parsed_done(book_id, amount)
sql_inc_parsed(book_id, amount)
# ------------------------------------------------------------
# INCREMENTING COUNTERS (existing — backward compat only)
# ------------------------------------------------------------
# ============================================================
# COUNTERS — AUDIO
# ============================================================
# ============================================================
# COUNTERS — AUDIO SKIPPED
# ============================================================
@logcall
def inc_audio_skipped(book_id, amount=1):
log(f"[DB] Incrementing audio skipped for {book_id} by {amount}")
sql_inc_audio_skipped(book_id, amount) # SQLite snapshot (audio_skipped column)
redis_inc_audio_skipped(book_id, amount) # Redis live counter
@logcall
def inc_audio_done(book_id, amount=1):
log(f"[DB] Incrementing audio done for {book_id} by {amount}")
redis_inc_audio_done(book_id, amount)
sql_inc_audio_done(book_id, amount)
# ============================================================
# BACKWARDS COMPATIBILITY SHIMS (old task API)
# ============================================================
@logcall
def inc_downloaded(book_id, amount=1):
book = _raw_get_book(book_id)
if not book:
return
cur = book.get("downloaded", 0) or 0
upsert_book(book_id, downloaded=cur + amount)
"""
Old name used by older tasks.
Redirects to new unified counter.
"""
return inc_download_done(book_id, amount)
@logcall
def inc_parsed(book_id, amount=1):
book = _raw_get_book(book_id)
if not book:
return
cur = book.get("parsed", 0) or 0
upsert_book(book_id, parsed=cur + amount)
"""
Old name used by older tasks.
"""
return inc_parsed_done(book_id, amount)
def inc_audio_done(book_id, amount=1):
book = _raw_get_book(book_id)
if not book:
return
cur = book.get("audio_done", 0) or 0
upsert_book(book_id, audio_done=cur + amount)
@logcall
def inc_audio_done_legacy(book_id, amount=1):
"""
Old audio name used by older tasks.
"""
return inc_audio_done(book_id, amount)

@ -0,0 +1,79 @@
# ============================================================
# File: db/state_redis.py
# Purpose:
# Low-level Redis counters/state for BookScraper.
# Used ONLY by db.repository façade.
# ============================================================
import os
import time
import redis
from logbus.publisher import log
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def redis_set_status(book_id: str, status: str):
key = f"book:{book_id}:state"
r.hset(key, "status", status)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# TOTAL CHAPTERS
# ------------------------------------------------------------
def redis_set_chapters_total(book_id: str, total: int):
key = f"book:{book_id}:state"
r.hset(key, "chapters_total", total)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# DOWNLOAD COUNTERS
# ------------------------------------------------------------
def redis_inc_download_done(book_id: str, amount: int = 1):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_download_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_download_skipped(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing download skipped for {book_id} by {amount}")
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_download_skipped", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# PARSE COUNTERS
# ------------------------------------------------------------
def redis_inc_parsed_done(book_id: str, amount: int = 1):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_parsed_done", amount)
r.hset(key, "last_update", int(time.time()))
# ------------------------------------------------------------
# AUDIO COUNTERS
# ------------------------------------------------------------
def redis_inc_audio_done(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio done for {book_id} by {amount}")
key = f"book:{book_id}:state"
r.hincrby(key, "audio_done", amount)
r.hset(key, "last_update", int(time.time()))
def redis_inc_audio_skipped(book_id: str, amount: int = 1):
log(f"[DB-REDIS] Incrementing audio skipped for {book_id} by {amount}")
"""
New: Count skipped audio chapters (timeouts, pre-existing files, abort, etc.)
SQL does NOT track this; Redis-only metric.
"""
key = f"book:{book_id}:state"
r.hincrby(key, "audio_skipped", amount)
r.hset(key, "last_update", int(time.time()))

@ -0,0 +1,165 @@
# ============================================================
# File: db/state_sql.py
# Purpose:
# Low-level SQLite snapshot layer for BookScraper metadata.
# Used ONLY through db.repository façade.
# ============================================================
import sqlite3
import os
from logbus.publisher import log
DB_PATH = os.getenv("BOOKSCRAPER_DB", "/app/db/books.db")
# ------------------------------------------------------------
# INTERNAL HELPERS
# ------------------------------------------------------------
def _connect():
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
return conn
# ------------------------------------------------------------
# FETCH
# ------------------------------------------------------------
def sql_fetch_book(book_id):
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,))
row = cur.fetchone()
conn.close()
return dict(row) if row else None
def sql_fetch_all_books():
conn = _connect()
cur = conn.cursor()
cur.execute("SELECT * FROM books ORDER BY rowid DESC")
rows = cur.fetchall()
conn.close()
return [dict(r) for r in rows]
# ------------------------------------------------------------
# REGISTER / UPDATE
# ------------------------------------------------------------
def sql_register_book(book_id, fields: dict):
conn = _connect()
cur = conn.cursor()
cols = ", ".join(["book_id"] + list(fields.keys()))
placeholders = ", ".join(["?"] * (1 + len(fields)))
values = [book_id] + list(fields.values())
cur.execute(
f"INSERT OR REPLACE INTO books ({cols}) VALUES ({placeholders})", values
)
conn.commit()
conn.close()
def sql_update_book(book_id, fields: dict):
if not fields:
return
conn = _connect()
cur = conn.cursor()
set_clause = ", ".join([f"{k} = ?" for k in fields])
params = list(fields.values()) + [book_id]
cur.execute(f"UPDATE books SET {set_clause} WHERE book_id = ?", params)
conn.commit()
conn.close()
# ------------------------------------------------------------
# STATUS
# ------------------------------------------------------------
def sql_set_status(book_id, status: str):
conn = _connect()
cur = conn.cursor()
cur.execute("UPDATE books SET status = ? WHERE book_id = ?", (status, book_id))
conn.commit()
conn.close()
# ------------------------------------------------------------
# CHAPTER TOTAL (snapshot)
# ------------------------------------------------------------
def sql_set_chapters_total(book_id, total: int):
conn = _connect()
cur = conn.cursor()
cur.execute(
"UPDATE books SET chapters_total = ? WHERE book_id = ?", (total, book_id)
)
conn.commit()
conn.close()
# ------------------------------------------------------------
# COUNTERS (SNAPSHOT-ONLY)
# ------------------------------------------------------------
def sql_inc_downloaded(book_id, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET downloaded = COALESCE(downloaded,0) + ?
WHERE book_id = ?
""",
(amount, book_id),
)
conn.commit()
conn.close()
def sql_inc_parsed(book_id, amount=1):
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET parsed = COALESCE(parsed,0) + ?
WHERE book_id = ?
""",
(amount, book_id),
)
conn.commit()
conn.close()
def sql_inc_audio_done(book_id, amount=1):
log(f"[DB-SQL] Incrementing audio done for {book_id} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_done = COALESCE(audio_done,0) + ?
WHERE book_id = ?
""",
(amount, book_id),
)
conn.commit()
conn.close()
def sql_inc_audio_skipped(book_id, amount=1):
log(f"[DB-SQL] Incrementing audio skipped for {book_id} by {amount}")
conn = _connect()
cur = conn.cursor()
cur.execute(
"""
UPDATE books
SET audio_skipped = COALESCE(audio_skipped,0) + ?
WHERE book_id = ?
""",
(amount, book_id),
)
conn.commit()
conn.close()
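# Each writer above repeats the connect/execute/commit/close pattern. A small
# helper could fold that into one place (sketch only, not wired in):
#
#     from contextlib import closing
#
#     def _exec(query, params=()):
#         with closing(_connect()) as conn:
#             conn.execute(query, params)
#             conn.commit()
#
#     def sql_inc_parsed(book_id, amount=1):
#         _exec(
#             "UPDATE books SET parsed = COALESCE(parsed,0) + ? WHERE book_id = ?",
#             (amount, book_id),
#         )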

@ -1,9 +1,31 @@
# logbus/publisher.py
import logging
import os
logger = logging.getLogger("logbus")
logger.setLevel(logging.WARNING)
# ============================================================
# FILE LOGGER — log.txt in BOOKSCRAPER_OUTPUT_DIR
# ============================================================
try:
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
os.makedirs(root, exist_ok=True)
file_path = os.path.join(root, "log.txt")
file_handler = logging.FileHandler(file_path, mode="a", encoding="utf-8")
file_formatter = logging.Formatter("%(message)s") # pass messages through unchanged
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)
except Exception:
# Logging to a file must never crash the app
pass
def log(message: str):
"""

@ -1,6 +1,8 @@
import os
import redis
from scraper.logger_decorators import logcall
# GUI log (non-breaking)
from scraper.ui_log import push_ui

@ -1,21 +1,66 @@
# ============================================================
# File: scraper/book_scraper.py
# Purpose:
# Backwards-compatible wrapper giving same API as before.
# Uses the new engine under the hood.
# Backwards-compatible wrapper giving the SAME public API
# as the old BookScraper, but internally uses ScrapeEngine.
#
# execute() → full metadata + chapterlist
#
# (Chapter downloading moves into ScrapeEngine later,
# but this wrapper does NOT need to change for that.)
# ============================================================
from scraper.engine.parser import extract_metadata_full
from scraper.logger_decorators import logcall
from scraper.services.scrape_engine import ScrapeEngine
class BookScraper:
def __init__(self, site_scraper, url):
"""
Backwards-compatible BookScraper façade.
In the old system, BookScraper did EVERYTHING:
- fetch metadata
- fetch the cover
- build the chapter list
- download chapters
- volume folders
- skip logic
In the new system that work is split up:
ScrapeEngine handles metadata / chapterlist / download engine (in development)
BookScraper keeps the same API as before,
so Celery tasks and older modules keep working
without refactor chaos.
"""
@logcall
def __init__(self, site_scraper, url: str):
self.site = site_scraper
self.url = url
@logcall
def execute(self):
"""
Backwards compatible full scrape:
returns {title, author, description, cover_url, chapters, book_url}
Public legacy API.
Returns metadata + chapters EXACTLY like the old BookScraper did
before the download phase.
This matters because:
- the INIT-flow uses metadata only
- scraping tasks use the chapter list
"""
return extract_metadata_full(self.url, self.site)
data = ScrapeEngine.fetch_metadata_and_chapters(self.site, self.url)
# Fully replicate the legacy output structure:
return {
"title": data.get("title"),
"author": data.get("author"),
"description": data.get("description"),
"cover_url": data.get("cover_url"),
"chapters": data.get("chapters", []),
"chapters_total": data.get("chapters_total", 0),
"book_url": data.get("book_url"),
}
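# Legacy call sites keep working unchanged, e.g. (sketch; `site` is a resolved
# SiteScraper such as SiteResolver.resolve(url) returns):
#
#     scraper = BookScraper(site, url)
#     data = scraper.execute()
#     print(data["title"], data["chapters_total"], len(data["chapters"]))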

@ -16,14 +16,6 @@ import os
import requests
import shutil
from scraper.abort import abort_requested # DEBUG allowed
from db.repository import create_or_update_book
# NEW: Redis State Model (C&U)
from scraper.progress import (
init_book_state,
set_status,
set_chapter_total,
)
class DownloadController:

@ -0,0 +1,33 @@
# ============================================================
# File: scraper/logger_decorators.py
# Purpose: Function-call logging decorator
# ============================================================
from functools import wraps
from scraper.logger import log_debug
def logcall(func):
"""
Decorator: log function name + arguments every time it's called.
Usage: @logcall above any function.
"""
@wraps(func)
def wrapper(*args, **kwargs):
# Function name
name = func.__qualname__
# First log line, before execution
# log_debug(f"[CALL] {name} args={args} kwargs={kwargs}")
log_debug(f"[CALL] {name} args={args}")
# log_debug(f"[CALL] {name}")
result = func(*args, **kwargs)
# Log after execution
# log_debug(f"[RETURN] {name} → {result}")
return result
return wrapper
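# Example (illustrative):
#
#     @logcall
#     def add(a, b):
#         return a + b
#
#     add(1, 2)   # logs: [CALL] add args=(1, 2)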

@ -6,6 +6,8 @@ import os
import stat
from logbus.publisher import log
from scraper.logger_decorators import logcall
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")

@ -5,12 +5,13 @@
import os
import requests
from logbus.publisher import log
from typing import Optional
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> str | None:
def download_main_cover(cover_url: str, book_id: str) -> Optional[str]:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.

@ -16,10 +16,13 @@ from scraper.services.cover_service import CoverService
from db.repository import register_book
from scraper.logger_decorators import logcall
class InitService:
@staticmethod
@logcall
def derive_book_id(url: str) -> str:
"""
PTWXZ URL format ends with /{id}.html.
@ -31,16 +34,20 @@ class InitService:
return url.replace("/", "_")
@staticmethod
@logcall
def execute(url: str) -> dict:
"""
Main INIT-flow entry point.
Returns complete metadata + registration info.
"""
# 1) Determine which BookSite applies
# 1) Determine site
site = SiteResolver.resolve(url)
# 2) Metadata only (no chapters)
book_id = InitService.derive_book_id(url)
site.book_id = book_id
# 2) Metadata only
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
@ -48,27 +55,27 @@ class InitService:
description = meta.get("description")
cover_url = meta.get("cover_url")
# 3) Determine book_id
book_id = InitService.derive_book_id(url)
# 4) Download UI cover (NEW: capture returned local path)
cover_path = CoverService.download_main_cover(cover_url, book_id)
# 4) SQLite registration
# 5) SQLite registration INCLUDING cover_path ← ★ FIX
register_book(
book_id=book_id,
title=title,
author=author,
description=description,
cover_url=cover_url,
cover_path=cover_path, # ← ★ IMPORTANT
book_url=url,
)
# 5) Download UI cover
CoverService.download_main_cover(cover_url, book_id)
# 6) Structured output for UI
# 6) Output for UI
return {
"book_id": book_id,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"cover_path": cover_path, # ← handig voor UI
"status": "registered",
}
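# End-to-end INIT example (sketch; URL and ids illustrative):
#
#     info = InitService.execute("https://www.ptwxz.com/book/12345.html")
#     info["book_id"]     # "12345" (derived from the trailing /{id}.html)
#     info["status"]      # "registered"
#     info["cover_path"]  # e.g. "static/covers/12345.jpg", or None on failure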

@ -1,33 +1,287 @@
# ============================================================
# File: scraper/services/scrape_engine.py
# Purpose:
# Provide unified scraping methods for INIT-flow.
# Reuses BookScraper internally with ZERO duplication.
# Unified scraping engine for INIT-flow and Celery tasks.
# All functions are fully logged via @logcall.
# ============================================================
from scraper.book_scraper import BookScraper
import os
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from logbus.publisher import log
from scraper.logger import log_debug
from scraper.logger_decorators import logcall
from scraper.utils.utils import load_replacements
class ScrapeEngine:
"""
Adapter layer around BookScraper.
Allows INIT-flow to fetch ONLY metadata (no chapters).
Central scraping engine.
Metadata + chapterlist scraping.
All methods logged with @logcall.
"""
# ------------------------------------------------------------
# REPLACEMENTS LOADER
# ------------------------------------------------------------
@staticmethod
def fetch_metadata_only(site, url: str) -> dict:
@logcall
def _apply_replacements(site):
fp = os.path.join(os.getcwd(), "replacements.txt")
extra = load_replacements(fp)
if not hasattr(site, "replacements"):
site.replacements = {}
site.replacements.update(extra)
return True
# ------------------------------------------------------------
# RATE LIMITER
# ------------------------------------------------------------
MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
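# Note on _throttle below: the mutable default `last_time=[0]` is deliberate.
# The single shared list survives across calls and stores the timestamp of the
# previous request, giving a process-wide rate limiter without extra state.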
@staticmethod
@logcall
def _throttle(last_time=[0]):
now = time.time()
elapsed = now - last_time[0]
if elapsed < ScrapeEngine.MIN_DELAY:
time.sleep(ScrapeEngine.MIN_DELAY - elapsed)
last_time[0] = time.time()
return True
# ------------------------------------------------------------
# HTTP GET
# ------------------------------------------------------------
@staticmethod
@logcall
def _get_doc(url: str, site):
attempt = 1
while True:
ScrapeEngine._throttle()
log_debug(f"[SCRAPER] GET {url} (attempt {attempt})")
try:
resp = requests.get(
url,
headers={"User-Agent": "Mozilla/5.0"},
timeout=10,
)
except Exception as e:
log_debug(f"Network error {e} → retry {attempt + 1}s")
time.sleep(attempt + 1)
attempt += 1
continue
code = resp.status_code
if code == 200:
resp.encoding = getattr(site, "encoding", "utf-8")
return BeautifulSoup(resp.text, "lxml")
if code == 429:
cooldown = 60
log_debug("429 detected — cooldown 60s")
for i in range(cooldown, 0, -1):
log_debug(f" cooldown {i}s…")
time.sleep(1)
attempt += 1
continue
if code in (403, 500):
wait = min(5 * attempt, 30)
log_debug(f"HTTP {code} → retry in {wait}s")
time.sleep(wait)
attempt += 1
continue
wait = attempt + 1
log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
time.sleep(wait)
attempt += 1
# ------------------------------------------------------------
# PARSER HELPERS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_title(soup):
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownTitle"
@staticmethod
@logcall
def _parse_author(soup):
td = soup.find("td", string=lambda t: t and "" in t)
if td and "" in td.get_text():
return td.get_text(strip=True).split("")[1]
return "UnknownAuthor"
@staticmethod
@logcall
def _parse_description(soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
txt = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if txt:
parts.append(txt)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSER
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_cover(soup, site):
"""
Execute BookScraper but return ONLY metadata.
Chapters are intentionally removed.
Find the cover via book_id substring matching:
- take the book_id from site.url
- collect IMG tags whose filename contains the book_id
- pick the shortest filename as the best match
"""
scraper = BookScraper(site, url)
result = scraper.execute() # returns full metadata + chapters
try:
parsed = urlparse(site.url)
m = re.search(r"/(\d+)\.html$", parsed.path)
if m:
book_id = m.group(1)
else:
book_id = parsed.path.rstrip("/").split("/")[-1]
except Exception:
return None
imgs = soup.find_all("img", src=True)
candidates = []
for img in imgs:
src = img["src"].strip()
filename = os.path.basename(src)
if book_id in filename:
candidates.append((filename, src))
if not candidates:
return None
candidates.sort(key=lambda t: len(t[0])) # shortest filename wins
best_src = candidates[0][1]
return urljoin(site.root, best_src)
# ------------------------------------------------------------
# RESOLVE CHAPTER PAGE
# ------------------------------------------------------------
@staticmethod
@logcall
def _resolve_chapter_page(soup, site):
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
if not node:
raise ValueError("Could not locate chapter list base node")
href = node.select_one("a").get("href")
url = urljoin(site.root, href)
parsed = urlparse(url)
basepath = parsed.path.rsplit("/", 1)[0] + "/"
chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}"
return url, chapter_base
# ------------------------------------------------------------
# PARSE CHAPTER LINKS
# ------------------------------------------------------------
@staticmethod
@logcall
def _parse_chapter_links(soup, chapter_base, selector):
cont = soup.select_one(selector)
if not cont:
return []
items = cont.select("ul li a[href]")
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(chapter_base, href)
chapters.append({"num": idx, "title": title, "url": full})
idx += 1
return chapters
# ============================================================
# PUBLIC APIS
# ============================================================
@staticmethod
@logcall
def fetch_metadata_only(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url # NEEDED for cover parsing
# Strip chapterlist — INIT-flow should not fetch them
return {
"title": result.get("title"),
"author": result.get("author"),
"description": result.get("description"),
"cover_url": result.get("cover_url"),
"title": ScrapeEngine._parse_title(soup),
"author": ScrapeEngine._parse_author(soup),
"description": ScrapeEngine._parse_description(soup),
"cover_url": ScrapeEngine._parse_cover(soup, site),
"book_url": url,
}
@staticmethod
@logcall
def fetch_metadata_and_chapters(site, url: str) -> dict:
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
site.url = url
title = ScrapeEngine._parse_title(soup)
author = ScrapeEngine._parse_author(soup)
desc = ScrapeEngine._parse_description(soup)
cover = ScrapeEngine._parse_cover(soup, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
chapters = ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
return {
"title": title,
"author": author,
"description": desc,
"cover_url": cover,
"chapters": chapters,
"chapters_total": len(chapters),
"book_url": url,
}
@staticmethod
@logcall
def fetch_chapterlist(site, url: str):
ScrapeEngine._apply_replacements(site)
soup = ScrapeEngine._get_doc(url, site)
chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
return ScrapeEngine._parse_chapter_links(
chapter_soup, chapter_base, site.chapter_list_selector
)
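# Public entry points in practice (sketch; `site` as resolved elsewhere,
# e.g. by SiteResolver):
#
#     meta = ScrapeEngine.fetch_metadata_only(site, url)          # INIT-flow
#     full = ScrapeEngine.fetch_metadata_and_chapters(site, url)  # full scrape
#     chaps = ScrapeEngine.fetch_chapterlist(site, url)           # list only
#
#     assert full["chapters_total"] == len(full["chapters"])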

@ -6,6 +6,7 @@
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from typing import Optional
class SiteScraper(ABC):
@ -39,7 +40,7 @@ class SiteScraper(ABC):
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: ...
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]: ...
# --------------------------
# Chapter extraction

@ -9,6 +9,7 @@ from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import Optional
class PiaotianScraper(SiteScraper):
@ -53,7 +54,7 @@ class PiaotianScraper(SiteScraper):
# COVER PARSING
# (exactly your BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None:
def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:

@ -1,5 +1,8 @@
# ============================================================
# File: scraper/tasks/audio_tasks.py
# Purpose: Convert chapter text files into audio using macOS
# “say”, with Redis-based slot control.
# Updated: now uses db.repository for audio counters.
# ============================================================
from celery_app import celery_app
@ -7,61 +10,80 @@ from logbus.publisher import log
import os
import subprocess
import time
import socket
import os
from scraper.progress import inc_audio_done, inc_audio_skipped
# from db.repository import inc_audio_done
from scraper.abort import abort_requested
from scraper.logger_decorators import logcall
from redis import Redis
from urllib.parse import urlparse
# Prefer the local Redis if present, otherwise the standard backend
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
# NEW — unified repository façade
from db.repository import (
inc_audio_done,
inc_audio_skipped,
)
parsed = urlparse(redis_url)
HOST = socket.gethostname()
# ------------------------------------------------------------
# REGULAR REDIS CLIENT (slots, file checks, dstate)
# REDIS CLIENT SETUP
# ------------------------------------------------------------
redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# Slot locking Redis client
redis_client = Redis(
host=parsed.hostname,
port=parsed.port,
db=int(parsed.path.strip("/") or 0),
)
# ------------------------------------------------------------
# BACKEND CLIENT (abort flags, progress counters) - always DB 0
# ------------------------------------------------------------
# Abort + global progress flags always live in DB 0
backend_client = Redis(
host=parsed.hostname,
port=parsed.port,
db=0,
)
# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300"))
AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi")
AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
HOST_PATH = os.getenv("HOST_PATH", "/app/output")
CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")
AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
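# Illustrative .env values for the block above (assumed; they simply echo the
# os.getenv() fallbacks shown, with AUDIO_VOICE/AUDIO_RATE passed to macOS `say`):
#
#     AUDIO_TIMEOUT_SECONDS=300   # hard cap per chapter before the TTS run is killed
#     AUDIO_VOICE=SinJi           # macOS voice name passed to `say`
#     AUDIO_RATE=200              # speech rate in words per minute
#     AUDIO_SLOTS=1               # concurrent TTS jobs guarded by Redis slot locks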
# ============================================================
# CELERY TASK
# ============================================================
@celery_app.task(bind=True, queue="audio", ignore_result=True)
@logcall
def generate_audio(
self, book_id, volume_name, chapter_number, chapter_title, chapter_text
self, book_id, volume_name, chapter_number, chapter_title, chapter_path
):
log(f"[AUDIO] CH{chapter_number}: START task → raw_input={chapter_text}")
"""
chapter_path: absolute container path to chapter text file.
"""
log(f"[AUDIO]({HOST}) CH{chapter_number}: START → {chapter_title}")
# Abort early
# ------------------------------------------------------------
# ABORT CHECK
# ------------------------------------------------------------
if abort_requested(book_id, backend_client):
inc_audio_skipped(book_id)
log(f"[AUDIO] ABORT detected → skip CH{chapter_number}")
log(f"[AUDIO]({HOST}) ABORT detected → skip CH{chapter_number}")
return
# ============================================================
# ACQUIRE AUDIO SLOT
# ============================================================
# ------------------------------------------------------------
# ACQUIRE SLOT
# ------------------------------------------------------------
slot_key = None
ttl = AUDIO_TIMEOUT + 15
@ -72,11 +94,13 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}")
break
# Need to wait
if slot_key is None:
log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting...")
log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting")
start_wait = time.time()
while slot_key is None:
# Try all slots again
for i in range(1, AUDIO_SLOTS + 1):
key = f"audio_slot:{i}"
if redis_client.set(key, "1", nx=True, ex=ttl):
@ -84,32 +108,32 @@ def generate_audio(
log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait")
break
if slot_key:
break
# If still no slot
if not slot_key:
if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
inc_audio_skipped(book_id)
return
if abort_requested(book_id, backend_client):
log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
return
if time.time() - start_wait > ttl:
log(f"[AUDIO] CH{chapter_number}: Wait timeout → abort audio")
inc_audio_skipped(book_id)
return
if time.time() - start_wait > ttl:
log(f"[AUDIO] CH{chapter_number}: Slot wait timeout → aborting audio")
return
time.sleep(0.25)
time.sleep(0.25)
# ============================================================
# ------------------------------------------------------------
# PATH NORMALISATION
# ============================================================
container_path = chapter_text
# ------------------------------------------------------------
container_path = chapter_path
# Fix 1 — container_path can be None → abort without crashing
if not container_path:
log(f"[AUDIO] CH{chapter_number}: FATAL — no input path provided")
log(f"[AUDIO] CH{chapter_number}: ERROR — no input file path provided")
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return
# Fix 2 — safe startswith
# Strip container prefix so that host path is resolvable
if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX):
relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/")
else:
@ -118,35 +142,35 @@ def generate_audio(
parts = relative_path.split("/")
if len(parts) < 3:
log(
f"[AUDIO] CH{chapter_number}: FATAL — cannot parse book/volume from {relative_path}"
f"[AUDIO] CH{chapter_number}: ERROR — cannot parse book/volume from {relative_path}"
)
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return
book_from_path = parts[0]
volume_from_path = parts[1]
# book_from_path = parts[0] # volume_name passed explicitly anyway
# volume_from_path = parts[1]
host_path = os.path.join(HOST_PATH, relative_path)
# ============================================================
# OUTPUT PREP
# ============================================================
base_dir = os.path.join(HOST_PATH, book_from_path, volume_from_path, "Audio")
# ------------------------------------------------------------
# OUTPUT DIRECTORY
# ------------------------------------------------------------
base_dir = os.path.join(HOST_PATH, parts[0], parts[1], "Audio")
os.makedirs(base_dir, exist_ok=True)
safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4b")
# Skip if audio already exists
if os.path.exists(audio_file):
log(f"[AUDIO] Skip CH{chapter_number} → already exists")
log(f"[AUDIO] CH{chapter_number}: Already exists → skip")
redis_client.delete(slot_key)
inc_audio_skipped(book_id)
return
# ============================================================
# BUILD CMD
# ============================================================
# ------------------------------------------------------------
# BUILD TTS COMMAND
# ------------------------------------------------------------
cmd = (
f"say --voice={AUDIO_VOICE} "
f"--input-file='{host_path}' "
@ -157,30 +181,34 @@ def generate_audio(
f"--data-format=aac"
)
log(f"[AUDIO] CH{chapter_number}: CMD = {cmd}")
log(f"[AUDIO]({HOST}) CH{chapter_number} → output: {audio_file}")
# ============================================================
# RUN TTS
# ============================================================
# ------------------------------------------------------------
# EXECUTE
# ------------------------------------------------------------
try:
subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
# NEW — repository façade
inc_audio_done(book_id)
log(f"[AUDIO] CH{chapter_number}: Completed")
log(f"[AUDIO]({HOST}) CH{chapter_number}: Completed")
except subprocess.TimeoutExpired:
log(f"[AUDIO] CH{chapter_number}: TIMEOUT → remove incomplete file")
log(f"[AUDIO]({HOST}) CH{chapter_number}: TIMEOUT → removing file")
if os.path.exists(audio_file):
try:
os.remove(audio_file)
except Exception:
pass
inc_audio_skipped(book_id)
except subprocess.CalledProcessError as e:
log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}")
inc_audio_skipped(book_id)
except Exception as e:
log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}")
inc_audio_skipped(book_id)
finally:
if slot_key:

@ -10,18 +10,20 @@ from celery_app import celery_app
from logbus.publisher import log
from scraper.download_controller import DownloadController
from scraper.progress import (
set_total,
)
from urllib.parse import urlparse
from scraper.logger_decorators import logcall
import redis
import os
from scraper.abort import abort_requested
from db.repository import set_chapters_total
print(">>> [IMPORT] controller_tasks.py loaded")
@celery_app.task(bind=True, queue="controller", ignore_result=False)
@logcall
def launch_downloads(self, book_id: str, scrape_result: dict):
"""
Launch the entire pipeline (download parse save),
@ -62,7 +64,8 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
# ------------------------------------------------------------
# INIT PROGRESS
# ------------------------------------------------------------
set_total(book_id, total)
set_chapters_total(book_id, total)
log(f"[CTRL] Progress initialized for {book_id}: total={total}")
# ------------------------------------------------------------

@ -1,26 +1,21 @@
# ============================================================
# File: scraper/tasks/download_tasks.py
# Purpose: Download chapter HTML with global concurrency,
# retry/backoff logic, 429 support, and abort-awareness.
#
# Logging:
# - timestamp + book_id in message
# - logbus.publisher → console
# - ui_log.push_ui → Redis GUI
# ============================================================
from celery_app import celery_app
from scraper.utils import get_save_path
from scraper.utils.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
# Repository façade — correct imports only
from db.repository import (
set_status,
inc_download_done,
inc_download_skipped,
)
from db.repository import inc_downloaded, set_status
from logbus.publisher import log
from scraper.ui_log import push_ui
from scraper.logger_decorators import logcall
import requests
import redis
@ -90,81 +85,74 @@ def release_global_slot():
# ============================================================
# CELERY TASK — NEW SIGNATURE WITH chapter_dict + book_meta
# CELERY TASK — Unified payload v3
# ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False)
def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
@logcall
def download_chapter(self, payload: dict):
"""
New unified chapter model:
chapter_dict = {
"num": int,
"url": str,
"title": str,
"volume_path": str
}
book_meta is propagated through the pipeline for parse/save.
Payload:
{
"book_id": str,
"chapter": { "num", "url", "title", "volume_path" },
"book_meta": dict,
"html": None | str,
"parsed": None | dict,
"skipped": bool,
"path": None | str
}
"""
chapter_num = chapter_dict.get("num")
chapter_url = chapter_dict.get("url")
chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
volume_path = chapter_dict.get("volume_path")
if not payload:
raise ValueError("download_chapter received empty payload")
book_id = payload["book_id"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
chapter_num = chapter["num"]
chapter_url = chapter["url"]
chapter_title = chapter.get("title") or f"Chapter {chapter_num}"
volume_path = chapter["volume_path"]
# STATUS UPDATE
set_status(book_id, "downloading")
# -----------------------------------------------------------
# ABORT BEFORE START
# -----------------------------------------------------------
# ABORT CHECK
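# NOTE: abort only skips chapters that have not yet been marked as
# started; a chapter that is already in flight is left to finish.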
if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
log_msg(book_id, msg)
inc_chapter_download_skipped(book_id)
return {
"book_id": book_id,
"chapter": chapter_dict,
"html": None,
"skipped": True,
"path": None,
"abort": True,
"book_meta": book_meta,
}
# Mark chapter as started
log_msg(book_id, f"[ABORT] Skip chapter {chapter_num}")
inc_download_skipped(book_id)
payload["html"] = None
payload["skipped"] = True
payload["path"] = None
return payload
mark_chapter_started(book_id, chapter_num)
# -----------------------------------------------------------
# SKIP IF FILE ALREADY EXISTS
# -----------------------------------------------------------
# SKIP IF FILE ALREADY SAVED
save_path = get_save_path(chapter_num, volume_path)
if os.path.exists(save_path):
log_msg(book_id, f"[DL] SKIP {chapter_num} ({chapter_title}) → {save_path}")
return {
"book_id": book_id,
"chapter": chapter_dict,
"html": None,
"skipped": True,
"path": save_path,
"book_meta": book_meta,
}
# -----------------------------------------------------------
# GLOBAL + SYNC DELAY
# -----------------------------------------------------------
log_msg(book_id, f"[DL] SKIP {chapter_num} → {save_path}")
inc_download_skipped(book_id)
payload["html"] = None
payload["skipped"] = True
payload["path"] = save_path
return payload
# GLOBAL DELAY + SLOT
if GLOBAL_DELAY > 0:
time.sleep(GLOBAL_DELAY)
wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY)
# log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# -----------------------------------------------------------
# HTTP DOWNLOAD
# -----------------------------------------------------------
try:
log_msg(
book_id,
f"[DL] Downloading {chapter_num} ({chapter_title}): {chapter_url}",
)
log_msg(book_id, f"[DL] Downloading {chapter_num} ({chapter_title})")
resp = requests.get(
chapter_url,
@ -178,39 +166,27 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
return {
"book_id": book_id,
"chapter": chapter_dict,
"html": html,
"skipped": False,
"path": save_path,
"book_meta": book_meta,
}
inc_download_done(book_id)
# --- attach results ---
payload["html"] = html
payload["skipped"] = False
payload["path"] = save_path
return payload
except Exception as exc:
attempt = self.request.retries
delay = BASE_DELAY * (BACKOFF**attempt)
# Specific 429 handler
if getattr(getattr(exc, "response", None), "status_code", None) == 429:
log_msg(
book_id,
f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
log_msg(book_id, f"[DL] 429 → WAIT {DELAY_429}s")
time.sleep(DELAY_429)
set_global_delay()
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
# Normal retry
log_msg(
book_id,
f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s "
f"(attempt {attempt}/{MAX_RETRIES})",
)
log_msg(book_id, f"[DL] ERROR {chapter_num}: {exc} → retry {delay}s")
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
finally:
set_global_delay()
release_global_slot()
# log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")

@ -1,33 +1,31 @@
# =========================================================
# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced version: Piaotia H1→content extractor + clean pipeline
# NO HARDCODED REPLACEMENTS — everything comes from replacement files
# =========================================================
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3.
# ============================================================
from celery_app import celery_app
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Comment
from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done
from bs4 import NavigableString, Comment
print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)")
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original)
# ============================================================
def extract_piaotia_content(soup):
"""
Extract clean chapter content from Piaotia pages.
Start after the table following <H1>.
End before nav/ads/footer/copyright.
"""
h1 = soup.find("h1")
if not h1:
return None
# -------- Find first table after <h1> --------
# Find first table after <h1>
table = None
for sib in h1.next_siblings:
if getattr(sib, "name", None) == "table":
@ -39,44 +37,41 @@ def extract_piaotia_content(soup):
parts = []
# -------- Iterate after table --------
for sib in table.next_siblings:
name = getattr(sib, "name", None)
text = None
if hasattr(sib, "get_text"):
text = sib.get_text(strip=True)
# === STOP CONDITIONS ===
# STOP CONDITIONS
# Comments like <!-- 翻页上AD开始 -->
# <!-- 翻页 -->
if isinstance(sib, Comment) and ("翻页" in sib):
break
# Explicit footer blocks
# explicit footer blocks
if name == "div":
sid = sib.get("id", "")
cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break
# Copyright block — strongest indicator
# copyright block
if text and ("重要声明" in text or "Copyright" in text):
break
# Navigation or 推荐阅读
# navigation blocks
if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break
# Skip scripts, ads, centers
if name in ("script", "style"):
continue
# Skip JS containers like <center><script>...</script></center>
if name == "center":
continue
# === ACCUMULATE TEXT ===
# ACCUMULATE
if isinstance(sib, NavigableString):
s = sib.strip()
if s:
@ -90,36 +85,42 @@ def extract_piaotia_content(soup):
return "\n".join(parts).strip()
# ============================================================
# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict):
"""
New signature under chapter_dict pipeline:
- receives ONLY the output dict from download_chapter
- book_meta is inside download_result["book_meta"]
- chapter_dict is inside download_result["chapter"]
"""
book_id = download_result.get("book_id", "NOBOOK")
chapter_dict = download_result.get("chapter") or {}
book_meta = download_result.get("book_meta") or {}
chapter_title = chapter_dict.get("title")
chapter_num = chapter_dict.get("num")
chapter_url = chapter_dict.get("url")
html = download_result.get("html")
# ------------------------------------------------------------
@logcall
def parse_chapter(self, payload: dict):
if not payload:
return {"skipped": True, "reason": "empty_payload"}
book_id = payload["book_id"]
chapter = payload["chapter"]
book_meta = payload.get("book_meta") or {}
num = chapter.get("num")
title = chapter.get("title") or f"Chapter {num}"
html = payload.get("html")
# SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------
if download_result.get("skipped"):
log_msg(book_id, f"[PARSE] SKIP chapter {chapter_num} (download skipped)")
return download_result # already has chapter + book_meta + skipped
if payload.get("skipped"):
log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)")
return payload
log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")
if not html:
log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP")
payload["parsed"] = None
payload["skipped"] = True
return payload
log_msg(book_id, f"[PARSE] Parsing chapter {num}")
soup = BeautifulSoup(html, "lxml")
# ------------------------------------------------------------
# STRICT SELECTORS (direct content blocks)
# ------------------------------------------------------------
# ============================================================
# STRICT SELECTORS
# ============================================================
selectors = [
"#content",
"div#content",
@ -141,63 +142,32 @@ def parse_chapter(self, download_result: dict):
raw = None
# --- STRICT SELECTOR FAILED → Try Piaotia extractor ---
# --- STRICT SELECTOR FAILED → Piaotia extractor ---
if node is None:
raw = extract_piaotia_content(soup)
else:
raw = node.get_text(separator="\n")
# # ------------------------------------------------------------
# # PIAOTIA FALLBACK:
# # Extract content between <H1> and the "bottomlink" block.
# # ------------------------------------------------------------
# raw = None
# if node is None:
# h1 = soup.find("h1")
# if h1:
# content_parts = []
# for sib in h1.next_siblings:
# sib_class = getattr(sib, "get", lambda *_: None)("class")
# if sib_class and (
# "bottomlink" in sib_class or sib_class == "bottomlink"
# ):
# break
# if getattr(sib, "name", None) in ["script", "style", "center"]:
# continue
# if hasattr(sib, "get_text"):
# content_parts.append(sib.get_text(separator="\n"))
# else:
# content_parts.append(str(sib))
# raw = "\n".join(content_parts)
# ------------------------------------------------------------
# FINAL FALLBACK
# ------------------------------------------------------------
if raw is None:
if node:
raw = node.get_text(separator="\n")
else:
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
raw = soup.get_text(separator="\n")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
raw = soup.get_text(separator="\n")
# ------------------------------------------------------------
# MULTIPASS CLEANING via replacement files ONLY
# ------------------------------------------------------------
# ============================================================
# MULTIPASS CLEANING via replacement files
# ============================================================
REPL = load_all_replacements()
text = raw
for _ in range(5):
text = clean_text(text, REPL)
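# (Assumption: clean_text applies a single pass of every rule, so five
# passes let replacements whose output matches a later rule converge.)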
# ------------------------------------------------------------
# Collapse excessive empty lines
# ------------------------------------------------------------
# ============================================================
# Collapse double blank lines
# ============================================================
cleaned = []
prev_blank = False
for line in text.split("\n"):
stripped = line.rstrip()
if stripped == "":
@ -208,28 +178,31 @@ def parse_chapter(self, download_result: dict):
else:
prev_blank = False
cleaned.append(stripped)
text = "\n".join(cleaned)
text = chapter_title + "\n" + text
# ------------------------------------------------------------
text = f"{title}\n{text}"
# ============================================================
# Add header to chapter 1
# ------------------------------------------------------------
if chapter_num == 1:
book_url = book_meta.get("book_url") or book_meta.get("url") or "UNKNOWN"
# ============================================================
if num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN"
header = (
f"{book_meta.get('title','')}\n"
f"{book_meta.get('title', '')}\n"
f"Author: {book_meta.get('author','')}\n"
f"Description:\n{book_meta.get('description','')}\n"
f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
)
text = header + text
log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars")
# ============================================================
# PAYLOAD OUTPUT (v3)
# ============================================================
payload["parsed"] = text
payload["skipped"] = False
inc_parsed_done(book_id)
# NEW RETURN FORMAT: chapter_dict stays intact
return {
"book_id": book_id,
"chapter": chapter_dict,
"text": text,
"length": len(text),
"book_meta": book_meta,
}
return payload

@ -1,16 +1,12 @@
# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose:
# Build Celery chains for chapter processing using chapter_dict.
# Build Celery chains for chapter processing using payload dict.
#
# New Chain:
# download_chapter(book_id, chapter_dict, book_meta)
# → parse_chapter(download_result)
# → save_chapter(parsed_result)
# → update_progress(final_result, book_id)
#
# All subtasks pass through result dicts unchanged so the
# next stage receives the correct fields.
# Pipeline v3:
# download_chapter(payload)
# → parse_chapter(payload)
# → save_chapter(payload)
# =========================================================
from celery import chain
@ -18,26 +14,28 @@ from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
from scraper.tasks.progress_tasks import update_progress
from scraper.logger_decorators import logcall
def build_chapter_pipeline(
book_id: str,
chapter_dict: dict,
book_meta: dict,
):
"""
Build a Celery chain for one chapter using chapter_dict.
download_chapter(book_id, chapter_dict, book_meta)
parse_chapter(download_result)
save_chapter(parsed_result)
update_progress(result, book_id)
@logcall
def build_chapter_pipeline(book_id: str, chapter_dict: dict, book_meta: dict):
"""
Payload model passed through entire pipeline.
"""
payload = {
"book_id": book_id,
"chapter": chapter_dict,
"book_meta": book_meta,
"html": None,
"parsed": None,
"skipped": False,
"path": None,
}
return chain(
download_chapter.s(book_id, chapter_dict, book_meta),
download_chapter.s(payload),
parse_chapter.s(),
save_chapter.s(),
update_progress.s(book_id),
)
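# Usage sketch (hypothetical caller; the real dispatch happens in the
# controller task, outside this file):
#
#     for ch in scrape_result["chapters"]:  # "chapters" key is assumed
#         build_chapter_pipeline(book_id, ch, book_meta).apply_async()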

@ -1,57 +0,0 @@
# ============================================================
# File: scraper/tasks/progress_tasks.py
# Purpose: Central progress updater for chapter pipelines.
# Updated for chapter_dict pipeline model.
# ============================================================
from celery_app import celery_app
from scraper.progress import inc_completed, inc_skipped, inc_failed
from logbus.publisher import log
print(">>> [IMPORT] progress_tasks.py loaded")
@celery_app.task(bind=False, name="progress.update", queue="controller")
def update_progress(result: dict, book_id: str):
"""
Central progress logic:
- result: output of save_chapter
- book_id: explicitly passed by pipeline
IMPORTANT:
- save_chapter already updates counters for skipped & normal chapters
- progress.update MUST NOT double-increment
"""
ch = result.get("chapter") or {}
chapter_num = ch.get("num")
skipped = result.get("skipped", False)
failed = result.get("failed", False)
# ------------------------------------------------------------
# FAILED CASE
# ------------------------------------------------------------
if failed:
inc_failed(book_id)
log(f"[PROG] FAILED chapter {chapter_num}")
return result
# ------------------------------------------------------------
# SKIPPED CASE
# ------------------------------------------------------------
if skipped:
# save_chapter already did:
# inc_skipped(book_id)
log(f"[PROG] SKIPPED chapter {chapter_num}")
return result
# ------------------------------------------------------------
# NORMAL COMPLETION
# ------------------------------------------------------------
# save_chapter did NOT increment completed for skipped cases
# but DID inc_completed(book_id) for normal cases.
# update_progress should NOT double increment, so only log here.
log(f"[PROG] DONE chapter {chapter_num}")
return result

@ -1,139 +1,83 @@
# ============================================================
# File: scraper/tasks/save_tasks.py
# Purpose: Save parsed chapter text to disk + trigger audio.
# Updated for chapter_dict + book_meta pipeline model.
# File: scraper/tasks/save_tasks.py (RESTORED AUDIO LOGIC)
# ============================================================
print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task
import os
from scraper.utils import get_save_path
from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
)
from logbus.publisher import log
from scraper.logger_decorators import logcall
from scraper.utils.utils import get_save_path
from scraper.tasks.download_tasks import log_msg
from scraper.tasks.audio_tasks import generate_audio
from db.repository import inc_download_done, inc_download_skipped
@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, parsed: dict):
"""
New pipeline model:
parsed = {
"book_id": str,
"chapter": chapter_dict,
"text": str,
"length": int,
"book_meta": dict,
"skipped": bool,
"path": optional str (if skipped)
}
"""
book_id = parsed.get("book_id", "NOBOOK")
chapter_dict = parsed.get("chapter") or {}
book_meta = parsed.get("book_meta") or {}
chapter_num = chapter_dict.get("num")
chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
volume_path = chapter_dict.get("volume_path")
# ------------------------------------------------------------
# VALIDATION
# ------------------------------------------------------------
if chapter_num is None or volume_path is None:
raise ValueError("Invalid parsed payload: chapter_dict missing fields.")
# ------------------------------------------------------------
# SKIPPED CASE
# ------------------------------------------------------------
if parsed.get("skipped"):
path = parsed.get("path", None)
log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num} → {path}")
inc_chapter_download_skipped(book_id)
volume_name = os.path.basename(volume_path.rstrip("/"))
# Queue audio only if a valid file exists
@logcall
def save_chapter(self, payload: dict):
if not payload:
log("[SAVE] ERROR: payload is None")
return {"error": True}
book_id = payload["book_id"]
chapter = payload["chapter"]
parsed = payload.get("parsed")
path = payload.get("path")
skipped = payload.get("skipped")
num = chapter["num"]
title = chapter.get("title") or f"Chapter {num}"
volume = chapter.get("volume_path")
volume_name = os.path.basename(volume.rstrip("/"))
# ============================================================
# SKIPPED CASE (restore old behavior)
# ============================================================
if skipped or not parsed:
log_msg(book_id, f"[SAVE] SKIP chapter {num}")
inc_download_skipped(book_id)
# Restore old behavior:
# If file already exists, STILL trigger audio.
if path and os.path.exists(path):
log_msg(book_id, f"[AUDIO] Queueing audio for SKIPPED chapter {num}")
try:
generate_audio.delay(
book_id,
volume_name,
chapter_num,
chapter_title,
path,
)
log_msg(
book_id,
f"[AUDIO] Task queued (SKIPPED) for chapter {chapter_num} in {volume_name}",
)
except Exception as audio_exc:
log_msg(
book_id,
f"[AUDIO] ERROR queueing (SKIPPED) chapter {chapter_num}: {audio_exc}",
)
return {
"book_id": book_id,
"chapter": chapter_dict,
"path": path,
"skipped": True,
"book_meta": book_meta,
}
# ------------------------------------------------------------
# NORMAL SAVE CASE
# ------------------------------------------------------------
try:
text = parsed.get("text", "")
generate_audio.delay(book_id, volume_name, num, title, path)
except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing skipped audio: {exc}")
# Ensure volume folder exists
os.makedirs(volume_path, exist_ok=True)
return payload
# Build final chapter file path
path = get_save_path(chapter_num, volume_path)
# ============================================================
# NORMAL SAVE CASE
# ============================================================
try:
os.makedirs(volume, exist_ok=True)
save_path = get_save_path(num, volume)
# Write chapter text to file
with open(path, "w", encoding="utf-8") as f:
f.write(text)
with open(save_path, "w", encoding="utf-8") as f:
f.write(parsed)
log_msg(book_id, f"[SAVE] Saved chapter {chapter_num} → {path}")
inc_chapter_done(book_id)
inc_completed(book_id)
log_msg(book_id, f"[SAVE] Saved chapter {num} → {save_path}")
# Determine volume name
volume_name = os.path.basename(volume_path.rstrip("/"))
inc_download_done(book_id)
# Queue audio task
# Restore old behavior → ALWAYS queue audio
try:
generate_audio.delay(
book_id,
volume_name,
chapter_num,
chapter_title,
path,
)
log_msg(
book_id,
f"[AUDIO] Task queued for chapter {chapter_num} in {volume_name}",
)
except Exception as audio_exc:
log_msg(
book_id, f"[AUDIO] ERROR queueing chapter {chapter_num}: {audio_exc}"
)
return {
"book_id": book_id,
"chapter": chapter_dict,
"path": path,
"book_meta": book_meta,
}
generate_audio.delay(book_id, volume_name, num, title, save_path)
log_msg(book_id, f"[AUDIO] Task queued for chapter {num}")
except Exception as exc:
log_msg(book_id, f"[AUDIO] ERROR queueing chapter {num}: {exc}")
payload["path"] = save_path
payload["skipped"] = False
return payload
except Exception as exc:
log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter_num}: {exc}")
log_msg(book_id, f"[SAVE] ERROR saving chapter {num}: {exc}")
raise

@ -9,6 +9,7 @@ from logbus.publisher import log
import os
import redis
from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps

@ -0,0 +1,141 @@
# ============================================================
# File: scraper/utils/state_sync.py
# Purpose:
# State inspection + optional sync logic for book progress.
# This version provides:
# • inspect_books_state() → NO writes, just a dry-run
# • sync_books_from_redis() → writes Redis progress back into SQLite
# ============================================================
import os
import redis
from db.db import get_db
def inspect_books_state():
"""
Reads all books from SQLite and fetches Redis progress,
but performs NO writes. Only shows:
- sqlite row
- redis state
- merged result (dry-run)
Returns a list of inspection dicts.
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
db = get_db()
cur = db.cursor()
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_id = row["book_id"]
sqlite_row = dict(row)
# Read redis state
redis_key = f"book:{book_id}:state"
progress = r.hgetall(redis_key)
if progress:
decoded = {k.decode(): v.decode() for k, v in progress.items()}
else:
decoded = {}
# Determine dry-run merged result
merged = sqlite_row.copy()
if decoded:
merged["downloaded"] = int(
decoded.get("download_done", merged.get("downloaded", 0))
)
merged["parsed"] = int(decoded.get("parsed_done", merged.get("parsed", 0)))
merged["audio_done"] = int(
decoded.get("audio_done", merged.get("audio_done", 0))
)
merged["chapters_total"] = int(
decoded.get("chapters_total", merged.get("chapters_total", 0))
)
merged["status"] = decoded.get("status", merged.get("status", "unknown"))
results.append(
{
"book_id": book_id,
"sqlite": sqlite_row,
"redis": decoded,
"would_merge_to": merged,
}
)
return results
def sync_books_from_redis():
"""
Reads all books from SQLite, fetches Redis progress,
and updates SQLite rows accordingly.
Returns a list of {
"book_id": ...,
"before": ...,
"redis": ...,
"after": ...
}
"""
r = redis.Redis.from_url(os.getenv("REDIS_BROKER"))
db = get_db()
cur = db.cursor()
# Fetch all books
cur.execute("SELECT * FROM books")
rows = cur.fetchall()
results = []
for row in rows:
book_id = row["book_id"]
before = dict(row)
redis_key = f"book:{book_id}:state"
progress = r.hgetall(redis_key)
if not progress:
results.append(
{"book_id": book_id, "before": before, "redis": {}, "after": before}
)
continue
# Decode Redis bytes → string dictionary
decoded = {k.decode(): v.decode() for k, v in progress.items()}
# Extract counters
downloaded = int(decoded.get("download_done", 0))
parsed = int(decoded.get("parsed_done", 0))
audio_done = int(decoded.get("audio_done", 0))
chapters_total = int(decoded.get("chapters_total", 0))
# Redis status wins
status = decoded.get("status", before["status"])
# Write back to SQLite
cur.execute(
"""
UPDATE books
SET downloaded = ?, parsed = ?, audio_done = ?, chapters_total = ?, status = ?, last_update = datetime('now')
WHERE book_id = ?
""",
(downloaded, parsed, audio_done, chapters_total, status, book_id),
)
db.commit()
# Fetch updated row
cur.execute("SELECT * FROM books WHERE book_id = ?", (book_id,))
after = dict(cur.fetchone())
results.append(
{"book_id": book_id, "before": before, "redis": decoded, "after": after}
)
return results
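# Usage sketch (assumed wiring; the real routes live in the Flask app and
# the template name is illustrative, cf. the "Inspect State" / "Sync State"
# entries in nav.html):
#
#     @app.route("/debug/inspect_state")
#     def debug_inspect_state():
#         return render_template(
#             "debug/state_inspect.html", results=inspect_books_state()
#         )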

Binary files not shown (4 images removed: 4.1 KiB, 6.4 KiB, 3.5 KiB, 12 KiB).

@ -0,0 +1,201 @@
/* =======================================================================
File: static/css/bookcard.css
Purpose:
All styling for registered book cards (book-card) +
status colors + start/abort buttons.
======================================================================= */
/* -----------------------------------------------------------------------
GRID WRAPPER FOR REGISTERED BOOKS
----------------------------------------------------------------------- */
.registered-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(340px, 1fr));
gap: 20px;
margin-top: 15px;
}
/* -----------------------------------------------------------------------
MAIN BOOK CARD
----------------------------------------------------------------------- */
.book-card {
position: relative;
display: grid;
grid-template-columns: 90px auto;
gap: 15px;
padding: 15px;
background: #fff;
border-radius: 10px;
border: 1px solid #e5e5e5;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
transition: border-color 0.25s ease, box-shadow 0.25s ease;
}
/* -----------------------------------------------------------------------
BOOK STATUS COLORS
----------------------------------------------------------------------- */
.book-card.processing {
border-color: #007aff;
box-shadow: 0 0 6px rgba(0, 122, 255, 0.35);
}
.book-card.downloading {
border-color: #ff9500;
box-shadow: 0 0 6px rgba(255, 149, 0, 0.35);
}
.book-card.parsing {
border-color: #ffcc00;
box-shadow: 0 0 6px rgba(255, 204, 0, 0.35);
}
.book-card.audio {
border-color: #e65100;
box-shadow: 0 0 6px rgba(230, 81, 0, 0.35);
}
.book-card.completed {
border-color: #34c759;
box-shadow: 0 0 6px rgba(52, 199, 89, 0.35);
}
.book-card.aborted {
border-color: #ff3b30;
box-shadow: 0 0 6px rgba(255, 59, 48, 0.35);
}
/* -----------------------------------------------------------------------
COVER IMAGE
----------------------------------------------------------------------- */
.book-cover {
width: 90px;
}
.book-img {
width: 90px;
height: 130px;
object-fit: cover;
border-radius: 4px;
background: #f4f4f4;
}
.placeholder {
display: flex;
justify-content: center;
align-items: center;
color: #777;
font-size: 12px;
}
/* -----------------------------------------------------------------------
META INFORMATION
----------------------------------------------------------------------- */
.book-meta {
display: flex;
flex-direction: column;
justify-content: space-between;
}
.book-title {
font-size: 16px;
font-weight: bold;
margin-bottom: 4px;
}
.book-author {
font-size: 14px;
color: #444;
margin-bottom: 8px;
}
.book-created {
font-size: 12px;
color: #666;
margin-bottom: 10px;
}
/* -----------------------------------------------------------------------
ICON BUTTONS
----------------------------------------------------------------------- */
.icon-btn {
width: 34px;
height: 34px;
border: none;
border-radius: 8px;
display: flex;
justify-content: center;
align-items: center;
font-size: 16px;
color: #fff;
cursor: pointer;
transition: background 0.15s ease, transform 0.1s ease;
}
/* Start (green) */
.icon-start {
background: #2d8a3d;
}
.icon-start:hover {
background: #226c30;
transform: scale(1.05);
}
.icon-start:disabled {
background: #9bbb9f !important;
cursor: not-allowed;
transform: none;
opacity: 0.5;
}
/* Abort (red) */
.icon-abort {
background: #c62828;
}
.icon-abort:hover {
background: #a31f1f;
transform: scale(1.05);
}
.icon-abort:disabled {
background: #d8a0a0 !important;
cursor: not-allowed;
transform: none;
opacity: 0.5;
}
/* Hide button (gray) */
.hide-form {
position: absolute;
top: 6px;
right: 6px;
margin: 0;
}
.icon-hide {
background: #777;
}
.icon-hide:hover {
background: #555;
transform: scale(1.05);
}
/* -----------------------------------------------------------------------
BOOK ACTIONS (right aligned button row)
----------------------------------------------------------------------- */
.book-actions {
display: flex;
justify-content: flex-end; /* align right */
gap: 10px; /* space between buttons */
margin-top: 12px;
}

@ -2,33 +2,28 @@
File: static/css/dashboard.css
Purpose:
Clean full-width vertical dashboard layout with large log viewer.
Book-card CSS has been moved to bookcard.css.
======================================================================= */
/* ------------------------------
GENERAL PAGE LAYOUT
------------------------------ */
/* -----------------------------------------------------------------------
1) GENERAL PAGE LAYOUT
----------------------------------------------------------------------- */
/* Dashboard content should use full width */
.dashboard-container {
display: flex;
flex-direction: column;
width: 100%;
max-width: 1200px; /* prevents overflow on the right */
max-width: 1200px;
margin: 20px auto;
padding: 0 20px;
gap: 18px; /* smaller than 30px */
gap: 18px;
}
/* ------------------------------
SECTIONS (input, progress, logs)
------------------------------ */
.dashboard-section {
background: #ffffff;
padding: 16px; /* smaller */
padding: 16px;
border-radius: 6px;
border: 1px solid #ddd;
margin: 0; /* remove extra whitespace */
}
.page-title {
@ -36,9 +31,9 @@
margin-bottom: 15px;
}
/* ------------------------------
BOOK LIST (optional)
------------------------------ */
/* -----------------------------------------------------------------------
2) ACTIVE BOOK LIST (dashboard left panel)
----------------------------------------------------------------------- */
.book-list {
display: flex;
@ -52,7 +47,6 @@
color: #777;
}
/* List item */
.book-list-item {
padding: 12px 16px;
background: #f7f7f7;
@ -73,7 +67,6 @@
border-color: #1e88e5;
}
/* Title + metadata */
.book-title {
font-size: 16px;
font-weight: 600;
@ -84,13 +77,9 @@
color: #555;
}
.meta-label {
font-weight: 600;
}
/* ------------------------------
PROGRESS BOX
------------------------------ */
/* -----------------------------------------------------------------------
3) PROGRESS BOX
----------------------------------------------------------------------- */
.progress-box {
background: #fafafa;
@ -141,14 +130,35 @@
margin-top: 4px;
}
/* ------------------------------
LOG VIEWER LARGE FULL-WIDTH
------------------------------ */
.book-abort-area {
margin-top: 10px;
text-align: right;
}
.abort-btn {
padding: 6px 12px;
border-radius: 4px;
border: 1px solid #cc0000;
background: #ff4444;
color: white;
font-size: 12px;
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.abort-btn:hover {
background: #ff2222;
border-color: #aa0000;
}
/* -----------------------------------------------------------------------
4) LOG VIEWER
----------------------------------------------------------------------- */
.log-viewer {
width: 100%;
max-width: 100%;
overflow: hidden; /* prevent horizontal overflow */
overflow: hidden;
}
.log-header {
@ -171,11 +181,11 @@
max-height: 75vh;
overflow-y: auto;
overflow-x: hidden; /* keep the log from overflowing to the right */
overflow-x: hidden;
background: #000000; /* Pure terminal black */
color: #00ff66; /* Matrix / retro green */
border: 1px solid #0f0; /* neon green frame */
background: #000;
color: #00ff66;
border: 1px solid #0f0;
border-radius: 6px;
padding: 12px;
@ -183,48 +193,39 @@
font-size: 13px;
line-height: 1.35;
white-space: pre-wrap; /* wraps text */
word-break: break-word; /* break long links */
white-space: pre-wrap;
word-break: break-word;
}
/* Base style for all log lines */
.log-line {
white-space: pre-wrap;
padding: 2px 0;
font-family: "SF Mono", "Consolas", "Courier New", monospace;
}
/* Subclasses per log type */
.log-line.default {
color: #00ff66; /* green */
color: #00ff66;
}
.log-line.dl {
color: #00ccff; /* cyan */
color: #00ccff;
}
.log-line.parse {
color: #ffaa00; /* orange */
color: #ffaa00;
}
.log-line.save {
color: #ffdd33; /* yellow */
color: #ffdd33;
}
.log-line.audio {
color: #ff66ff; /* purple */
color: #ff66ff;
}
.log-line.ctrl {
color: #66aaff; /* light blue */
color: #66aaff;
}
.log-line.error {
color: #ff3333; /* red */
color: #ff3333;
}
/* ------------------------------
PLACEHOLDER
------------------------------ */
/* -----------------------------------------------------------------------
5) PLACEHOLDER / FOOTER
----------------------------------------------------------------------- */
.dashboard-placeholder {
font-size: 15px;
@ -232,6 +233,7 @@
text-align: center;
color: #777;
}
.footer {
text-align: center;
padding: 12px;
@ -240,23 +242,56 @@
font-size: 12px;
border-top: 1px solid #ddd;
}
.book-abort-area {
margin-top: 10px;
text-align: right;
/* -----------------------------------------------------------------------
6) DROPDOWN NAVIGATION
----------------------------------------------------------------------- */
/* Container for dropdown */
.nav-dropdown {
position: relative;
}
.abort-btn {
padding: 6px 12px;
border-radius: 4px;
border: 1px solid #cc0000;
background: #ff4444;
color: white;
font-size: 12px;
/* The clickable label ("Tools ▾") */
.nav-dropdown > .nav-item {
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.abort-btn:hover {
background: #ff2222;
border-color: #aa0000;
/* Hide dropdown by default */
.dropdown-menu {
display: none;
position: absolute;
top: 100%;
right: 0;
background: #fff; /* same background as the navbar */
border: 1px solid #ddd;
padding: 8px 0;
margin: 0;
list-style: none; /* remove bullets */
border-radius: 4px;
min-width: 160px;
z-index: 1000;
}
/* Show dropdown when hovering over parent */
.nav-dropdown:hover .dropdown-menu {
display: block;
}
/* Menu item styling */
.dropdown-menu li {
padding: 0;
margin: 0;
}
.dropdown-menu li a {
display: block;
padding: 8px 16px;
white-space: nowrap;
color: #333;
text-decoration: none;
}
/* Hover state */
.dropdown-menu li a:hover {
background: #f0f0f0;
}

@ -2,15 +2,15 @@
File: static/js/dashboard.js
Purpose:
Dashboard interactions:
- select book
- refresh logs
- refresh progress
- Select active book
- Live logs & progress
- Bookcard AJAX start/abort
NOTE:
$ / $$ / autoScroll come from helpers.js
updateLogs() is provided by log_view.js
======================================================================= */
/* ---------------------------------------------------------
Simple fetch wrapper
Utility: Safe fetch wrapper
--------------------------------------------------------- */
async function apiGet(url) {
try {
@ -32,79 +32,61 @@ let REFRESH_INTERVAL = null;
console.log(">>> dashboard.js LOADED");
/* ---------------------------------------------------------
DOM Ready setup
DOM READY
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> dashboard.js DOMContentLoaded");
// =====================================================
// GLOBAL FALLBACK POLLING — ALWAYS FETCH LOGS
// Runs when no books exist or no selection has been made
// =====================================================
console.log(">>> dashboard.js: enabling global fallback polling");
// Fallback: fetch global logs if no active book
setInterval(() => {
// if no active book → fetch global logs
if (!ACTIVE_BOOK) {
refreshBook(null); // triggers /logs
}
if (!ACTIVE_BOOK) refreshBook(null);
}, 2000);
// Sidebar items
const items = $$(".book-list-item");
console.log(">>> dashboard.js found book-list items:", items.length);
// No books → don't start polling
// if (!items || items.length === 0) {
// console.log(">>> dashboard.js: no books present, polling disabled.");
// return;
// }
// Book selection listener
items.forEach((item) => {
item.addEventListener("click", () => {
console.log(">>> dashboard.js: user clicked book:", item.dataset.bookId);
selectBook(item.dataset.bookId);
});
});
// Auto-select first book
// Auto-select
if (!ACTIVE_BOOK && items[0]) {
console.log(
">>> dashboard.js: auto-select first book:",
items[0].dataset.bookId
);
selectBook(items[0].dataset.bookId);
}
// Initial binding of book-card buttons
bindBookCardButtons();
// Refresh sidebar every 2 seconds
setInterval(refreshActiveBooks, 2800);
});
/* ---------------------------------------------------------
Select a book (updates UI + starts polling)
Select a book
--------------------------------------------------------- */
function selectBook(bookId) {
console.log(">>> selectBook(", bookId, ")");
ACTIVE_BOOK = bookId;
console.log(">>> Selecting book", bookId);
// Highlight
// Highlight sidebar
$$(".book-list-item").forEach((el) => {
el.classList.toggle("active", el.dataset.bookId === bookId);
});
// Reset previous polling
if (REFRESH_INTERVAL) {
console.log(">>> dashboard.js: clearing previous polling interval");
clearInterval(REFRESH_INTERVAL);
}
// Reset polling
if (REFRESH_INTERVAL) clearInterval(REFRESH_INTERVAL);
// Start new polling
console.log(">>> dashboard.js: starting polling for bookId =", bookId);
REFRESH_INTERVAL = setInterval(() => {
refreshBook(ACTIVE_BOOK);
}, 2000);
// Immediate refresh
refreshBook(ACTIVE_BOOK);
}
setInterval(refreshActiveBooks, 2000);
/* ---------------------------------------------------------
Refresh sidebar list
--------------------------------------------------------- */
async function refreshActiveBooks() {
const books = await apiGet("/api/books");
if (!books) return;
@ -112,8 +94,8 @@ async function refreshActiveBooks() {
const container = $("#book-list");
if (!container) return;
// Rebuild the list
container.innerHTML = "";
books.forEach((b) => {
const div = document.createElement("div");
div.className = "book-list-item";
@ -126,75 +108,152 @@ async function refreshActiveBooks() {
${b.download_done}/${b.download_total} downloaded,
${b.audio_done}/${b.audio_total} audio
</div>
<button class="abort-btn" onclick="abortBook('${b.book_id}')">Abort</button>
`;
// Re-attach the click listener
div.addEventListener("click", () => selectBook(b.book_id));
container.appendChild(div);
});
// If ACTIVE_BOOK is not set yet → auto-select the first book
if (!ACTIVE_BOOK && books.length > 0) {
selectBook(books[0].book_id);
}
}
/* ---------------------------------------------------------
Fetch logs + progress from API
Fetch logs + progress
--------------------------------------------------------- */
async function refreshBook(bookId) {
console.log(">>> refreshBook(", bookId, ")");
// 1) If there is NO bookId → fetch only the global logs
if (!bookId) {
console.log(">>> refreshBook: no active book → fetch /logs");
const data = await apiGet("/logs");
if (data && data.logs) updateLogs(data.logs);
return; // done
if (data) updateLogs(data);
return;
}
// 2) If a book IS selected → fetch its status + logs
const state = await apiGet(`/api/book/${bookId}/status`);
const logs = await apiGet(`/api/book/${bookId}/logs`);
console.log(">>> refreshBook state =", state);
console.log(">>> refreshBook logs =", logs);
if (state) updateProgressBars(state);
if (state) {
updateProgressBars(state);
refreshBookCards();
}
if (logs) updateLogs(logs);
}
/* ---------------------------------------------------------
Update LOG VIEW panel
BOOKCARD BUTTON BINDING (idempotent)
--------------------------------------------------------- */
function updateLogs(logList) {
const output = $("#log-output");
if (!output) {
console.warn(">>> updateLogs: no #log-output element found");
return;
}
function bindBookCardButtons() {
console.log(">>> bindBookCardButtons() scanning…");
// START BUTTONS
document.querySelectorAll(".book-card .icon-start").forEach((btn) => {
if (btn.dataset.bound === "1") return; // prevent double-binding
btn.dataset.bound = "1";
btn.addEventListener("click", (ev) => {
ev.preventDefault();
if (btn.disabled) return;
const bookId = btn.closest(".book-card").dataset.bookId;
console.log(">>> START clicked:", bookId);
startBook(bookId);
});
});
// ABORT BUTTONS
document.querySelectorAll(".book-card .icon-abort").forEach((btn) => {
if (btn.dataset.bound === "1") return;
btn.dataset.bound = "1";
output.innerHTML = "";
btn.addEventListener("click", (ev) => {
ev.preventDefault();
if (btn.disabled) return;
logList.forEach((line) => logAppend(line));
const bookId = btn.closest(".book-card").dataset.bookId;
console.log(">>> ABORT clicked:", bookId);
abortBookAjax(bookId);
});
});
}
autoScroll(output);
/* ---------------------------------------------------------
AJAX START
--------------------------------------------------------- */
function startBook(bookId) {
console.log(">>> startBook():", bookId);
fetch("/start", {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded" },
body: `book_id=${bookId}`,
})
.then(async (r) => {
console.log(">>> /start status:", r.status);
let data = null;
try {
data = await r.json();
} catch (e) {}
console.log(">>> /start response:", data);
refreshBookCards();
refreshBook(bookId);
})
.catch((err) => console.error("Start failed:", err));
}
function abortBook(book_id) {
if (!confirm(`Abort tasks for book ${book_id}?`)) return;
/* ---------------------------------------------------------
AJAX ABORT
--------------------------------------------------------- */
function abortBookAjax(bookId) {
if (!confirm(`Abort tasks for book ${bookId}?`)) return;
console.log(">>> abortBookAjax():", bookId);
fetch(`/abort/${bookId}`, { method: "POST" })
.then(async (r) => {
let data = null;
try {
data = await r.json();
} catch (e) {}
console.log(">>> /abort response:", data);
fetch(`/abort/${book_id}`, { method: "POST" })
.then((r) => r.json())
.then((data) => {
console.log("Abort:", data);
refreshBookCards();
refreshBook(bookId);
})
.catch((err) => {
console.error("Abort failed:", err);
});
.catch((err) => console.error("Abort failed:", err));
}
/* ---------------------------------------------------------
Refresh all book-cards (status, classes, buttons)
--------------------------------------------------------- */
async function refreshBookCards() {
const books = await apiGet("/api/books");
if (!books) return;
document.querySelectorAll(".book-card").forEach((card) => {
const id = card.dataset.bookId;
const info = books.find((b) => b.book_id === id);
if (!info) return;
// Status CSS
card.className = `book-card ${info.status}`;
// Button states
const startBtn = card.querySelector(".icon-start");
const abortBtn = card.querySelector(".icon-abort");
if (startBtn) startBtn.disabled = info.status !== "registered";
if (abortBtn)
abortBtn.disabled = ![
"processing",
"downloading",
"parsing",
"audio",
].includes(info.status);
});
bindBookCardButtons(); // rebind new DOM
}
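/* ---------------------------------------------------------
Assumed /api/books item shape (illustrative only, inferred
from the fields read above; not a documented contract):
{ book_id, status, title, download_done, download_total,
audio_done, audio_total }
--------------------------------------------------------- */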

@ -1,38 +1,36 @@
/* =======================================================================
File: static/js/log_view.js
Purpose:
Log viewer functionality:
- filtering
- clearing
- auto-scroll
- delta polling (efficient)
- rolling limit (prevent GUI freeze)
High-performance rolling log viewer
- efficient delta polling
- append-only mode (no DOM reset)
- rolling limit (prevents memory freeze)
- supports both global logs and per-book logs
======================================================================= */
console.log(">>> log_view.js LOADING…");
/* ---------------------------------------------------------
Log filtering
Global log viewer state
--------------------------------------------------------- */
let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // For delta polling
const MAX_LOG_LINES = 1000; // Rolling cap to prevent freezing
let LAST_LOG_INDEX = -1; // delta offset
const MAX_LOG_LINES = 600; // safe rolling window
/* ---------------------------------------------------------
Apply filter on existing log lines
--------------------------------------------------------- */
function applyLogFilter() {
console.log(">>> log_view.js applyLogFilter(), filter =", LOG_FILTER);
const lines = $$(".log-line");
console.log(">>> log_view.js number of log-line elements:", lines.length);
lines.forEach((line) => {
const text = line.innerText;
line.style.display =
LOG_FILTER === "ALL" || text.includes(LOG_FILTER) ? "block" : "none";
const show = LOG_FILTER === "ALL" || (text && text.includes(LOG_FILTER));
line.style.display = show ? "block" : "none";
});
}
/* ---------------------------------------------------------
UI bindings
DOM Ready → bind clear/filter
--------------------------------------------------------- */
document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js DOMContentLoaded");
@ -41,84 +39,107 @@ document.addEventListener("DOMContentLoaded", () => {
const clearBtn = $("#log-clear");
const output = $("#log-output");
if (!filterSel) {
console.log(">>> log_view.js: No log viewer found on this page.");
if (!output) {
console.log(
">>> log_view.js: No #log-output on this page → viewer disabled"
);
return;
}
console.log(">>> log_view.js: log viewer detected.");
// Filter dropdown
// filterSel.addEventListener("change", () => {
// LOG_FILTER = filterSel.value;
// console.log(">>> log_view.js filter changed to:", LOG_FILTER);
// applyLogFilter();
// });
// Filter dropdown (currently disabled in the UI)
// if (filterSel) {
// filterSel.addEventListener("change", () => {
// LOG_FILTER = filterSel.value;
// applyLogFilter();
// });
// }
// Clear log window
if (clearBtn) {
clearBtn.addEventListener("click", () => {
console.log(">>> log_view.js log-clear clicked → clearing output");
if (output) {
output.innerHTML = "";
LAST_LOG_INDEX = -1; // reset delta polling
}
console.log(">>> log_view.js: Clear log viewer");
output.innerHTML = "";
LAST_LOG_INDEX = -1; // reset delta polling
});
}
});
/* ---------------------------------------------------------
Append + Rolling buffer
Append ONE line (smart class assignment)
--------------------------------------------------------- */
function logAppend(lineText) {
function rollingAppend(lineText) {
const output = $("#log-output");
if (!output) return;
const div = document.createElement("div");
div.classList.add("log-line");
// -----------------------------------------------------
// Assign subtype classes
// -----------------------------------------------------
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]")) {
// Type detection
if (lineText.includes("[DL]") || lineText.includes("[DOWNLOAD]"))
div.classList.add("dl");
} else if (lineText.includes("[PARSE]")) {
div.classList.add("parse");
} else if (lineText.includes("[SAVE]")) {
div.classList.add("save");
} else if (lineText.includes("[AUDIO]")) {
div.classList.add("audio");
} else if (lineText.includes("[CTRL]")) {
div.classList.add("ctrl");
} else if (lineText.includes("[ERROR]")) {
div.classList.add("error");
} else {
div.classList.add("default");
}
else if (lineText.includes("[PARSE]")) div.classList.add("parse");
else if (lineText.includes("[SAVE]")) div.classList.add("save");
else if (lineText.includes("[AUDIO]")) div.classList.add("audio");
else if (lineText.includes("[CTRL]")) div.classList.add("ctrl");
else if (lineText.includes("[ERROR]")) div.classList.add("error");
else div.classList.add("default");
div.textContent = lineText;
div.innerText = lineText;
output.appendChild(div);
// Rolling buffer
while (output.children.length > MAX_LOG_LINES) {
// Rolling limit
while (output.childNodes.length > MAX_LOG_LINES) {
output.removeChild(output.firstChild);
}
}
/* ---------------------------------------------------------
Primary API entry: updateLogs()
Used by dashboard.js AND delta polling
Accepts:
{ logs: [...], last_index: N }
OR legacy:
{ lines: [...], total: N }
--------------------------------------------------------- */
function updateLogs(packet) {
const output = $("#log-output");
if (!output) return;
if (!packet) return;
// Normalized log arrays
let lines = packet.logs || packet.lines || [];
if (!Array.isArray(lines)) return;
// Append only new lines
lines.forEach((line) => rollingAppend(line));
// Update delta index
if (packet.last_index !== undefined) {
LAST_LOG_INDEX = packet.last_index;
} else if (packet.total !== undefined) {
LAST_LOG_INDEX = packet.total - 1;
}
applyLogFilter();
autoScroll(output);
}
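// Example packets (illustrative values):
//   updateLogs({ logs: ["[DL] OK 12: 3456 bytes"], last_index: 41 }); // new format
//   updateLogs({ lines: ["[PARSE] Parsing chapter 12"], total: 42 }); // legacy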
/* ---------------------------------------------------------
Delta-based log polling
Delta polling: ONLY global logs use this
Dashboard overrides logs per book.
--------------------------------------------------------- */
function pollLogs() {
fetch(`/logs?last_index=${LAST_LOG_INDEX}`)
.then((r) => r.json())
.then((data) => {
const lines = data.lines || [];
if (lines.length > 0) {
lines.forEach((line) => logAppend(line));
LAST_LOG_INDEX = data.total - 1;
LAST_LOG_INDEX = data.last; // <-- the correct index!
}
})
.catch((err) => {
@ -126,7 +147,6 @@ function pollLogs() {
});
}
// Poll every 800 ms
setInterval(pollLogs, 1800);
setInterval(pollLogs, 2800);
console.log(">>> log_view.js LOADED");

@ -0,0 +1,82 @@
{# ============================================================
File: templates/components/bookcard.html
Purpose:
A single book card with:
- status styles
- cover
- metadata
- hide button
- start (play)
- abort (stop)
Requires:
variable "b" in context
============================================================ #}
<div class="book-card {{ b.status }}" data-book-id="{{ b.book_id }}">
<!-- ======================================================
HIDE BUTTON (icon-only)
====================================================== -->
<form
action="/hide/{{ b.book_id }}"
method="POST"
onsubmit="return confirm('Dit boek verbergen?')"
class="hide-form"
>
<button class="icon-btn icon-hide" title="Verbergen">
<i class="fa-solid fa-xmark"></i>
</button>
</form>
<!-- ======================================================
COVER
====================================================== -->
<div class="book-cover">
{% if b.cover_path %}
<img src="/{{ b.cover_path }}" alt="cover" class="book-img" />
{% else %}
<div class="book-img placeholder">?</div>
{% endif %}
</div>
<!-- ======================================================
META + BUTTONS
====================================================== -->
<div class="book-meta">
<div class="book-title">{{ b.title }}</div>
<div class="book-author">{{ b.author }}</div>
<div class="book-created">Geregistreerd: {{ b.created_at }}</div>
<div class="book-actions">
<!-- START -->
<form action="/start" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" />
<button
class="icon-btn icon-start"
title="Start scraping"
{% if b.status != "registered" %}
disabled
{% endif %}
>
<i class="fa-solid fa-play"></i>
</button>
</form>
<!-- ABORT -->
<form action="/abort/{{ b.book_id }}" method="POST">
<input type="hidden" name="book_id" value="{{ b.book_id }}" />
<button
class="icon-btn icon-abort"
title="Stoppen (abort)"
{% if b.status not in ["processing","downloading","parsing","audio"] %}
disabled
{% endif %}
>
<i class="fa-solid fa-stop"></i>
</button>
</form>
</div>
</div> <!-- end .book-meta -->
</div> <!-- end .book-card -->

@ -1,16 +1,16 @@
<!-- =======================================================================
File: templates/components/nav.html
Purpose: Global navigation bar for BookScraper UI
Purpose: Global navigation bar for BookScraper UI (improved version)
======================================================================= -->
<nav class="navbar">
<div class="nav-inner">
<!-- Left side: Branding -->
<!-- Branding / Home -->
<div class="nav-brand">
<a href="/">BookScraper</a>
</div>
<!-- Right side: Navigation Links -->
<!-- Main navigation -->
<ul class="nav-links">
<li>
<a href="/dashboard" class="nav-item"> Dashboard </a>
@ -23,6 +23,18 @@
<li>
<a href="/logs" class="nav-item"> Logs </a>
</li>
<!-- Tools dropdown -->
<li class="nav-dropdown">
<span class="nav-item">Tools ▾</span>
<ul class="dropdown-menu">
<li><a href="/api/db/books">DB Viewer</a></li>
<li><a href="/debug/inspect_state">Inspect State</a></li>
<li><a href="/debug/sync_state">Sync State</a></li>
<li><a href="/debug/redis-keys">Redis Keys</a></li>
<li><a href="/debug/queues">queues</a></li>
</ul>
</li>
</ul>
</div>
</nav>

@ -0,0 +1,21 @@
{# ============================================================
File: templates/components/registered_books.html
Purpose:
Show a grid of registered books. Each card is
rendered via bookcard.html.
============================================================ #}
<section class="dashboard-section">
<h2>Registered books</h2>
{% if registered and registered|length > 0 %}
<div class="registered-grid">
{% for b in registered %} {% include "components/bookcard.html" %} {% endfor
%}
</div>
{% else %}
<p>No registered books.</p>
{% endif %}
</section>

@ -5,7 +5,7 @@
Used on landing pages or detail pages.
======================================================================= -->
<form method="POST" action="/start" class="url-form">
<form method="POST" action="/init" class="url-form">
<label for="url" class="url-label">Book URL:</label>
<input

@ -22,6 +22,9 @@
<hr />
{% include "components/registered_books.html" %}
<hr />
<!-- ===========================================================
BOOK LIST
=========================================================== -->

@ -0,0 +1,88 @@
{% extends "layout.html" %} {% block content %}
<h1>State Inspection (SQL vs Redis)</h1>
<style>
.state-card {
border: 1px solid #444;
padding: 18px;
margin-bottom: 30px;
background: #222;
border-radius: 8px;
}
.state-title {
font-size: 1.4em;
margin-bottom: 14px;
color: #9cf;
}
table.state-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 12px;
}
.state-table th,
.state-table td {
border: 1px solid #555;
padding: 6px 10px;
}
.state-table th {
background: #333;
color: #fff;
}
.same {
color: #9f9;
}
.diff {
color: #ff7b7b;
font-weight: bold;
}
.empty {
color: #aaa;
font-style: italic;
}
</style>
{% macro cmp(sqlval, redisval) %} {% if (sqlval|string) == (redisval|string) %}
<td class="same">{{ sqlval }}</td>
<td class="same">{{ redisval }}</td>
{% else %}
<td class="diff">{{ sqlval }}</td>
<td class="diff">{{ redisval }}</td>
{% endif %} {% endmacro %} {% for entry in results %}
<div class="state-card">
<div class="state-title">📘 {{ entry.book_id }}</div>
{% set sql = entry.sqlite %} {% set redis = entry.redis %} {% set merged =
entry.would_merge_to %}
<table class="state-table">
<tr>
<th>Field</th>
<th>SQLite</th>
<th>Redis</th>
<th>Merged Result</th>
</tr>
{% for field in [ "status", "chapters_total", "downloaded",
"chapters_download_done", "chapters_download_skipped", "parsed",
"chapters_parsed_done", "audio_done", "audio_skipped", "last_update" ] %}
<tr>
<th>{{ field }}</th>
<td>{{ sql.get(field, '') }}</td>
<td>{{ redis.get(field, '') }}</td>
<td>{{ merged.get(field, '') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endfor %} {% endblock %}

@ -0,0 +1,91 @@
{% extends "layout.html" %} {% block content %}
<h1>Celery Queue Debug</h1>
<style>
.debug-section {
margin-bottom: 40px;
}
.debug-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}
.debug-table th,
.debug-table td {
border: 1px solid #444;
padding: 6px 10px;
}
.debug-table th {
background: #333;
color: #fff;
}
pre {
background: #1e1e1e;
color: #ddd;
padding: 10px;
overflow-x: auto;
}
code {
color: #9cf;
}
</style>
<div class="debug-section">
<h2>Workers</h2>
<h3>Active Tasks</h3>
<pre>{{ workers_active | tojson(indent=2) }}</pre>
<h3>Reserved</h3>
<pre>{{ workers_reserved | tojson(indent=2) }}</pre>
<h3>Scheduled</h3>
<pre>{{ workers_scheduled | tojson(indent=2) }}</pre>
</div>
<hr />
<div class="debug-section">
<h2>Queues</h2>
{% for q in queues %}
<div class="debug-queue">
<h3>{{ q.name }} ({{ q.length }} items)</h3>
<table class="debug-table">
<tr>
<th>Redis Key</th>
<td>{{ q.redis_key }}</td>
</tr>
<tr>
<th>Length</th>
<td>{{ q.length }}</td>
</tr>
<tr>
<th>Items (first 30)</th>
<td>
{% if q["items"] %}
<ul style="margin: 0; padding-left: 20px">
{% for item in q["items"] %}
<li><code>{{ item | e }}</code></li>
{% endfor %}
</ul>
{% else %}
<i>No items</i>
{% endif %}
</td>
</tr>
</table>
</div>
{% endfor %}
</div>
<script>
setInterval(() => {
window.location.reload();
}, 5000);
</script>
{% endblock %}

@ -14,6 +14,12 @@
<!-- CSS -->
<link rel="stylesheet" href="/static/css/style.css" />
<link rel="stylesheet" href="/static/css/dashboard.css" />
<link rel="stylesheet" href="/static/css/bookcard.css" />
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css"
/>
<!-- GLOBAL HELPERS (must ALWAYS be loaded before everything else) -->
<script src="/static/js/helpers.js"></script>

@ -5,7 +5,7 @@ import requests
from io import BytesIO
from bs4 import BeautifulSoup
from scraper.logger import log_debug
from scraper.utils import clean_text
from scraper.utils.utils import clean_text
from urllib.parse import urljoin
@ -103,8 +103,11 @@ class ChapterDownloader:
collecting = True
continue
text = sib.get_text("\n", strip=True) if hasattr(
sib, "get_text") else str(sib).strip()
text = (
sib.get_text("\n", strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
@ -121,6 +124,7 @@ class ChapterDownloader:
vdir = f"{output_base}/v{volume}"
import os
os.makedirs(vdir, exist_ok=True)
fname = f"{number:05d}_{title}.txt"
