diff --git a/bookscraper/.gitignore b/bookscraper/.gitignore
index 08fedd4..cd78ff3 100644
--- a/bookscraper/.gitignore
+++ b/bookscraper/.gitignore
@@ -1,4 +1,164 @@
-output/
+# ============================================
+# PYTHON
+# ============================================
+
+# Bytecode
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Virtual environments
 venv/
+env/
+.venv/
+
+# Python build artifacts
+build/
+dist/
+*.egg-info/
+
+
+# ============================================
+# PROJECT-SPECIFIC IGNORE RULES
+# ============================================
+
+# Output generated by BookScraper
+output/
+audio_output/
+m4b_output/
+covers/
+
+# Logs
 *.log
-__pycache__/
\ No newline at end of file
+logs/
+log/
+celerybeat-schedule
+celerybeat.pid
+
+# Redis dump (if ever created)
+dump.rdb
+
+# Temporary HTML/debug scrapings
+tmp/
+temp/
+*.html.tmp
+*.debug.html
+
+
+# ============================================
+# CELERY / RUNTIME
+# ============================================
+
+celerybeat-schedule
+*.pid
+*.worker
+
+# Celery progress / abort temporary files (if any)
+abort_flags/
+progress_cache/
+
+
+# ============================================
+# DOCKER
+# ============================================
+
+# Docker build cache
+**/.dockerignore
+**/Dockerfile~
+docker-compose.override.yml
+docker-compose.local.yml
+docker-compose*.backup
+
+# Local bind mounts from Docker
+**/.volumes/
+**/mnt/
+**/cache/
+
+
+# ============================================
+# FRONTEND / STATIC FILES
+# ============================================
+
+# Node / JS (if ever used)
+node_modules/
+npm-debug.log
+yarn-debug.log
+yarn-error.log
+dist/
+.bundle/
+
+
+# ============================================
+# VS CODE / EDITORS
+# ============================================
+
+# VSCode
+.vscode/
+.history/
+.code-workspace
+
+# PyCharm / JetBrains
+.idea/
+
+# Editor backups
+*.swp
+*.swo
+*~
+
+# Autosave files
+*.bak
+*.tmp
+
+
+# ============================================
+# SYSTEM / OS FILES
+# ============================================
+
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+Icon?
+.Trashes
+
+# Windows
+Thumbs.db
+Desktop.ini
+
+
+# ============================================
+# ARCHIVES
+# ============================================
+
+*.zip
+*.tar.gz
+*.tgz
+*.7z
+*.rar
+
+
+# ============================================
+# AUDIO / TTS / TEMPORARY
+# ============================================
+
+*.wav
+*.mp3
+*.m4a
+*.m4b
+*.aac
+*.flac
+
+tts_temp/
+audio_temp/
+tts_cache/
+
+
+# ============================================
+# GIT INTERNAL SAFETY
+# ============================================
+
+# Never track your global git config junk
+.gitignore_global
+.gitconfig
+.gitattributes-global
diff --git a/bookscraper/app.py b/bookscraper/app.py
index bf758c8..241bc22 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -1,5 +1,5 @@
 # ============================================
-# File: bookscraper/app.py (ASYNC SCRAPING)
+# File: bookscraper/app.py (ASYNC SCRAPING + DASHBOARD)
 # ============================================

 from dotenv import load_dotenv
@@ -9,34 +9,45 @@ load_dotenv()
 print(">>> [WEB] Importing celery_app …")
 from celery_app import celery_app

-from flask import Flask, render_template, request, jsonify
+from flask import (
+    Flask,
+    render_template,
+    request,
+    jsonify,
+    redirect,
+    send_from_directory,
+)
 from scraper.logger import log_debug

-# Abort + Progress (per book_id)
+# Abort + Progress (legacy)
 from scraper.abort import set_abort
 from scraper.progress import get_progress

-# UI LOGS (GLOBAL — no book_id)
-from scraper.ui_log import get_ui_logs, reset_ui_logs  # <-- ADDED
+# NEW: Full Redis Book State Model
+from scraper.progress import get_state
+from scraper.progress import r as redis_client

-from celery.result import AsyncResult
+# NEW: Indexed log fetchers
+from scraper.log_index import fetch_logs, fetch_recent_logs, fetch_global_logs
+
+# UI LOGS (legacy)
+from scraper.ui_log import get_ui_logs, reset_ui_logs

-# ⬇⬇⬇ TOEGEVOEGD voor cover-serving
-from flask import send_from_directory
+from celery.result import AsyncResult
 import os
+import time
+import re

 app = Flask(__name__)

-
 # =====================================================
-# STATIC FILE SERVING FOR OUTPUT ← TOEGEVOEGD
+# STATIC FILE SERVING FOR OUTPUT
 # =====================================================
 OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")


 @app.route("/output/<path:filename>")
 def serve_output(filename):
-    """Serve output files such as cover.jpg and volumes."""
     return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)


@@ -49,7 +60,7 @@ def index():


 # =====================================================
-# START SCRAPING (async via Celery)
+# START SCRAPING → DIRECT REDIRECT TO DASHBOARD (book_idx native)
 # =====================================================
 @app.route("/start", methods=["POST"])
 def start_scraping():
@@ -58,64 +69,150 @@ def start_scraping():
     if not url:
         return render_template("result.html", error="Geen URL opgegeven.")

-    # ---------------------------------------------------------
-    # NEW: Clear UI log buffer when starting a new scrape
-    # ---------------------------------------------------------
     reset_ui_logs()
-
     log_debug(f"[WEB] Scraping via Celery: {url}")

-    async_result = celery_app.send_task(
+    # --------------------------------------------
+    # Extract book_idx from URL
+    # Supports:
+    #   - /15/15618.html
+    #   - /15618/
+    #   - /15618.html
+    # --------------------------------------------
+    m = re.search(r"/(\d+)(?:\.html|/)?$", url)
+    if not m:
+        return render_template(
+            "result.html", error="Kan book_idx niet bepalen uit URL."
+        )
+
+    book_idx = m.group(1)
+
+    # --------------------------------------------
+    # Start async scraping task
+    # --------------------------------------------
+    celery_app.send_task(
         "scraper.tasks.scraping.start_scrape_book",
         args=[url],
         queue="scraping",
     )

-    return render_template(
-        "result.html",
-        message="Scraping gestart.",
-        scraping_task_id=async_result.id,
-        book_title=None,
-    )
+    # --------------------------------------------
+    # DIRECT redirect — no waiting on Celery
+    # --------------------------------------------
+    return redirect(f"/book/{book_idx}")


 # =====================================================
-# CLEAR UI LOGS MANUALLY (NEW)
+# CLEAR UI LOGS (legacy)
 # =====================================================
 @app.route("/clear-logs", methods=["POST"])
 def clear_logs():
     reset_ui_logs()
-    return jsonify({"status": "ok", "message": "UI logs cleared"})
+    return jsonify({"status": "ok"})
+
+
+# =====================================================
+# ABORT (per book_idx)
+# =====================================================
+@app.route("/abort/<book_idx>", methods=["POST"])
+def abort_download(book_idx):
+    log_debug(f"[WEB] Abort requested for book: {book_idx}")
+    set_abort(book_idx)
+    return jsonify({"status": "ok", "aborted": book_idx})
+
+
+# =====================================================
+# LEGACY PROGRESS ENDPOINT
+# =====================================================
+@app.route("/progress/<book_idx>", methods=["GET"])
+def progress(book_idx):
+    return jsonify(get_progress(book_idx))
+
+
+# =====================================================
+# REDIS STATE ENDPOINT
+# =====================================================
+@app.route("/state/<book_idx>", methods=["GET"])
+def full_state(book_idx):
+    return jsonify(get_state(book_idx))


 # =====================================================
-# ABORT (per book_id)
+# LIST ALL BOOKS — METADATA
 # =====================================================
-@app.route("/abort/<book_id>", methods=["POST"])
-def abort_download(book_id):
-    log_debug(f"[WEB] Abort requested for book: {book_id}")
-    set_abort(book_id)
-    return jsonify({"status": "ok", "aborted": book_id})
+@app.route("/books", methods=["GET"])
+def list_books():
+    books = sorted(redis_client.smembers("books") or [])
+    result = []
+
+    for book_idx in books:
+        meta = redis_client.hgetall(f"book:{book_idx}:meta") or {}
+        state = get_state(book_idx) or {}
+
+        result.append(
+            {
+                "id": book_idx,
+                "title": meta.get("title", book_idx),
+                "author": meta.get("author"),
+                "url": meta.get("url"),
+                "cover_url": meta.get("cover_url"),
+                "scraped_at": meta.get("scraped_at"),
+                "status": state.get("status"),
+                "last_update": state.get("last_update"),
+                "chapters_total": state.get("chapters_total"),
+                "chapters_done": state.get("chapters_done"),
+            }
+        )
+
+    return jsonify(result)


 # =====================================================
-# PROGRESS (per book_id)
+# LIBRARY DASHBOARD PAGE
 # =====================================================
-@app.route("/progress/<book_id>", methods=["GET"])
-def progress(book_id):
-    return jsonify(get_progress(book_id))
+@app.route("/library", methods=["GET"])
+def library_page():
+    return render_template("library.html")


 # =====================================================
-# LOGS — GLOBAL UI LOGS
+# BOOK DASHBOARD PAGE — book_idx native
 # =====================================================
-@app.route("/logs", methods=["GET"])
-def logs():
-    return jsonify({"logs": get_ui_logs()})
+@app.route("/book/<book_idx>", methods=["GET"])
+def book_dashboard(book_idx):
+    return render_template(
+        "book_dashboard.html",
+        book_id=book_idx,  # for template backward compatibility
+        book_idx=book_idx,
+    )


 # =====================================================
-# CELERY RESULT → return book_id when scraping finishes
+# INDEXED LOG API — book_idx direct
+# =====================================================
+@app.route("/api/book/<book_idx>/logs", methods=["GET"])
+def api_book_logs(book_idx):
+    cursor = int(request.args.get("cursor", "0"))
+    logs, new_cursor = fetch_logs(book_idx, cursor)
+    return jsonify({"logs": logs, "cursor": new_cursor})
+
+
+@app.route("/api/book/<book_idx>/logs/recent", methods=["GET"])
+def api_book_logs_recent(book_idx):
+    limit = int(request.args.get("limit", "200"))
+    logs = fetch_recent_logs(book_idx, limit)
+    return jsonify({"logs": logs})
+
+
+@app.route("/api/logs/global", methods=["GET"])
+def api_global_logs():
+    cursor = int(request.args.get("cursor", "0"))
+    logs, new_cursor = fetch_global_logs(cursor)
+    return jsonify({"logs": logs, "cursor": new_cursor})
+
+
+# =====================================================
+# CELERY RESULT
 # =====================================================
 @app.route("/celery-result/<task_id>", methods=["GET"])
 def celery_result(task_id):
@@ -123,10 +220,8 @@ def celery_result(task_id):

     if result.successful():
         return jsonify({"ready": True, "result": result.get()})
-
     if result.failed():
         return jsonify({"ready": True, "error": "failed"})
-
     return jsonify({"ready": False})
diff --git a/bookscraper/logbus/publisher.py b/bookscraper/logbus/publisher.py
index 9a597db..5476475 100644
--- a/bookscraper/logbus/publisher.py
+++ b/bookscraper/logbus/publisher.py
@@ -1,4 +1,11 @@
-# logbus/publisher.py
+# ============================================================
+# File: logbus/publisher.py
+# Purpose:
+#   Centralized logger:
+#     - console logging
+#     - UI legacy log echo
+#     - NEW: indexed Redis log ingest (non-blocking)
+# ============================================================

 import logging

@@ -10,20 +17,45 @@ def log(message: str):
     Dumb logger:
     - skip lege messages
     - stuur message 1:1 door
-    - geen prefixes
+    - geen prefixes wijzigen
     - geen mutaties
     """

     if not message or not message.strip():
         return

-    # console
+    # ============================================================
+    # SAFETY FIX (C&U):
+    # prevent an infinite loop: messages coming from log_index
+    # start with "[IDX]" and must NOT re-enter the pipeline.
+    # ============================================================
+    if message.startswith("[IDX]"):
+        logger.warning(message)
+        return
+
+    # ---------------------------------------
+    # Console log
+    # ---------------------------------------
     logger.warning(message)

-    # UI-echo
+    # ---------------------------------------
+    # Legacy UI log (existing behaviour)
+    # ---------------------------------------
     try:
-        from scraper.ui_log import push_ui
+        from scraper.ui_log import push_ui  # delayed import

         push_ui(message)
     except Exception:
+        # The UI log must never cause a crash
+        pass
+
+    # ---------------------------------------
+    # NEW: Indexed Redis log entry
+    # ---------------------------------------
+    try:
+        from scraper.log_index import ingest_indexed_log  # delayed import
+
+        ingest_indexed_log(message)
+    except Exception:
+        # Fail silently — logging must never break the pipeline
+        pass
diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py
index 922d0c7..bf54375 100644
--- a/bookscraper/scraper/book_scraper.py
+++ b/bookscraper/scraper/book_scraper.py
@@ -13,10 +13,7 @@ from scraper.models.book_state import Chapter
 class BookScraper:
     """
     Minimal scraper: only metadata + chapter list.
-    The DownloadController handles Celery pipelines for:
-    - download
-    - parse
-    - save
+    The DownloadController handles Celery pipelines.
     """

     def __init__(self, site, url):
@@ -29,6 +26,8 @@ class BookScraper:
         self.cover_url = ""
         self.chapter_base = None

+        self.book_idx = None  # numeric ID from the URL (the only source)
+
         self.chapters = []

         # Load custom replacements
@@ -40,6 +39,9 @@ class BookScraper:
         """Main entry point. Returns metadata + chapter URLs."""
         soup = self._fetch(self.url)

+        # book_idx is taken from the main URL only (by convention)
+        self._extract_book_idx()
+
         self._parse_title(soup)
         self._parse_author(soup)
         self._parse_description(soup)
@@ -54,14 +56,30 @@ class BookScraper:
             "title": self.book_title,
             "author": self.book_author,
             "description": self.book_description,
-            "cover_url": self.cover_url,  # ← used by DownloadController
+            "cover_url": self.cover_url,
             "book_url": self.url,
+            "book_idx": self.book_idx,  # <<< important for the logging pipeline
             "chapters": [
                 {"num": ch.number, "title": ch.title, "url": ch.url}
                 for ch in self.chapters
             ],
         }

+    # ------------------------------------------------------------
+    def _extract_book_idx(self):
+        """
+        Extract numeric ID from main URL such as:
+            https://www.piaotia.com/bookinfo/15/15618.html
+        This is the ONLY allowed source.
+        """
+        m = re.search(r"/(\d+)\.html$", self.url)
+        if m:
+            self.book_idx = m.group(1)
+            log_debug(f"[BookScraper] Extracted book_idx = {self.book_idx}")
+        else:
+            self.book_idx = None
+            log_debug("[BookScraper] book_idx NOT FOUND in URL")
+
     # ------------------------------------------------------------
     def _fetch(self, url):
         log_debug(f"[BookScraper] Fetch: {url}")
@@ -109,10 +127,7 @@ class BookScraper:
     def _parse_cover(self, soup):
         """
         Extract correct cover based on book_id path logic.
-        1. primary: match "/files/article/image/{vol}/{book_id}/"
-        2. fallback: endswith "/{book_id}s.jpg"
         """
-        # Extract book_id from URL
         m = re.search(r"/(\d+)\.html$", self.url)
         if not m:
             log_debug("[BookScraper] No book_id found in URL → cannot match cover")
@@ -120,20 +135,15 @@ class BookScraper:

         book_id = m.group(1)

-        # Extract vol folder from URL (bookinfo/<vol>/<id>.html)
         m2 = re.search(r"/bookinfo/(\d+)/", self.url)
         volume = m2.group(1) if m2 else None

         log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")

         imgs = soup.find_all("img", src=True)
-
         chosen = None

-        # --------------------------------------------------------
-        # PRIORITY 1: Path-match
-        #   /files/article/image/{vol}/{book_id}/
-        # --------------------------------------------------------
+        # PATH-MATCH
         if volume:
             target_path = f"/files/article/image/{volume}/{book_id}/"
             for img in imgs:
@@ -143,9 +153,7 @@ class BookScraper:
                     log_debug(f"[BookScraper] Cover matched by PATH: {src}")
                     break

-        # --------------------------------------------------------
-        # PRIORITY 2: endswith "/{book_id}s.jpg"
-        # --------------------------------------------------------
+        # SUFFIX-MATCH
         if not chosen:
             target_suffix = f"/{book_id}s.jpg"
             for img in imgs:
@@ -155,9 +163,6 @@ class BookScraper:
                     log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
                     break

-        # --------------------------------------------------------
-        # No match
-        # --------------------------------------------------------
         if not chosen:
             log_debug("[BookScraper] No matching cover found")
             return
@@ -167,14 +172,12 @@ class BookScraper:

     # ------------------------------------------------------------
     def get_chapter_page(self, soup):
-        """Return BeautifulSoup of the main chapter list page."""
         node = soup.select_one(
             "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
         )
         href = node.select_one("a").get("href")
         chapter_url = urljoin(self.site.root, href)

-        # base for chapter links
         parts = chapter_url.rsplit("/", 1)
         self.chapter_base = parts[0] + "/"
diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py
index 9a9e978..4f2cf32 100644
--- a/bookscraper/scraper/download_controller.py
+++ b/bookscraper/scraper/download_controller.py
@@ -1,231 +1,133 @@
-# =========================================================
+# ============================================================
 # File: scraper/download_controller.py
 # Purpose:
-#   Build Celery pipelines for all chapters
-#   and pass book_id for abort/progress/log functionality.
-#   + Download and replicate cover image to all volume folders
-#   + Generate scripts (allinone.txt, makebook, say)
-#   + Initialize Redis Book State Model (status + counters)
-# =========================================================
-
-from celery import group
-from scraper.tasks.pipeline import build_chapter_pipeline
-from scraper.scriptgen import generate_all_scripts
-from logbus.publisher import log
+#   Prepare folder structure, volumes, cover, and Celery pipelines
+#   using ONLY Celery-safe primitive arguments.
+#
+#   Workers never receive BookContext/ChapterContext.
+# ============================================================
+
 import os
 import requests
-import shutil
-from scraper.abort import abort_requested  # DEBUG allowed
+from logbus.publisher import log
+from celery import chain, group
+
+from scraper.tasks.download_tasks import download_chapter
+from scraper.tasks.parse_tasks import parse_chapter
+from scraper.tasks.save_tasks import save_chapter
+from scraper.tasks.progress_tasks import update_progress
+

-# NEW: Redis State Model (C&U)
-from scraper.progress import (
-    init_book_state,
-    set_status,
-    set_chapter_total,
-)
+print(">>> [IMPORT] download_controller.py loaded (final Celery-safe mode)")


 class DownloadController:
     """
-    Coordinates all chapter pipelines (download → parse → save),
-    including:
-    - volume splitting
-    - consistent meta propagation
-    - book_id-based abort + progress tracking
-    - cover download + volume replication
-    - script generation (allinone.txt, makebook, say)
-    - Redis book state initialisation and status updates
+    Responsibilities:
+      • Determine output root
+      • Download cover
+      • Assign chapters → volumes
+      • Build Celery pipelines (primitive-only arguments)
     """

-    def __init__(self, book_id: str, scrape_result: dict):
-        self.book_id = book_id
+    def __init__(self, scrape_result: dict):
+        self.book_idx = scrape_result.get("book_id")
+        self.title = scrape_result.get("title", "Unknown")
+        self.author = scrape_result.get("author")
+        self.cover_url = scrape_result.get("cover")
         self.scrape_result = scrape_result

-        # Core metadata
-        self.title = scrape_result.get("title", "UnknownBook")
-        self.chapters = scrape_result.get("chapters", []) or []
-        self.cover_url = scrape_result.get("cover_url")
-
-        # Output base dir
-        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
-
-        # Volume size
-        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
-
-        # Base folder for the whole book
-        self.book_base = os.path.join(root, self.title)
-        os.makedirs(self.book_base, exist_ok=True)
-
-        # Meta passed to parse/save stage
-        self.meta = {
-            "title": self.title,
-            "author": scrape_result.get("author"),
-            "description": scrape_result.get("description"),
-            "book_url": scrape_result.get("book_url"),
-        }
-
-        # -------------------------------------------------
-        # DEBUG — bevestig dat controller correct book_id ziet
-        # -------------------------------------------------
-        log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
-
-        try:
-            abort_state = abort_requested(book_id)
-            log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
-        except Exception as e:
-            log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")
-
-        # -------------------------------------------------
-        # NEW: Initialize Redis Book State Model
-        # -------------------------------------------------
-        try:
-            init_book_state(
-                book_id=self.book_id,
-                title=self.title,
-                url=self.scrape_result.get("book_url"),
-                chapters_total=len(self.chapters),
-            )
-            log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
-        except Exception as e:
-            log(f"[CTRL_STATE] init_book_state FAILED: {e}")
-
-    # ---------------------------------------------------------
-    # Cover Download
-    # ---------------------------------------------------------
-    def download_cover(self):
-        """Download one cover image into the root of the book folder."""
-        if not self.cover_url:
-            log(f"[CTRL] No cover URL found for '{self.title}'")
-            return
-
-        cover_path = os.path.join(self.book_base, "cover.jpg")
-
-        headers = {
-            "User-Agent": (
-                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
-                "Gecko/20100101 Firefox/118.0"
-            ),
-            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
-        }
-
-        try:
-            log(f"[CTRL] Downloading cover: {self.cover_url}")
-
-            resp = requests.get(self.cover_url, timeout=10, headers=headers)
-            resp.raise_for_status()
+        # List of dicts from scraper: [{num, title, url}]
+        self.chapters = scrape_result.get("chapter_list", [])
+        self.chapter_count = len(self.chapters)
+
+        # Output root
+        self.output_root = os.path.join("/app/output", self.title)
+
+        # Will be filled by assign_volumes()
+        self.volume_paths = {}  # chapter_num → volume_path
+
+    # -----------------------------------------------------------
+    # Create root folder + download cover
+    # -----------------------------------------------------------
+    def init_book_output(self):
+        os.makedirs(self.output_root, exist_ok=True)
+
+        if self.cover_url:
+            try:
+                resp = requests.get(self.cover_url, timeout=10)
+                if resp.ok:
+                    p = os.path.join(self.output_root, "cover.jpg")
+                    with open(p, "wb") as f:
+                        f.write(resp.content)
+                    log(f"[CTRL] Cover saved: {p}")
+            except Exception as exc:
+                log(f"[CTRL] Cover download error: {exc}")
+
+    # -----------------------------------------------------------
+    # Volume assignment (no BookContext needed)
+    # -----------------------------------------------------------
+    def assign_volumes(self, max_chapters_per_volume=200):
+        """
+        Determine per-chapter volume_path and ensure folders exist.
+        """
+        volume_index = 1
+        chapters_in_volume = 0

-            with open(cover_path, "wb") as f:
-                f.write(resp.content)
-
-            log(f"[CTRL] Cover saved to: {cover_path}")
-
-        except Exception as e:
-            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
-
-    # ---------------------------------------------------------
-    # Cover Replication to Volumes
-    # ---------------------------------------------------------
-    def replicate_cover_to_volumes(self):
-        """Copy cover.jpg into each existing Volume_xxx directory."""
-        src = os.path.join(self.book_base, "cover.jpg")
-        if not os.path.exists(src):
-            log("[CTRL] No cover.jpg found, replication skipped")
-            return
+        for ch in self.chapters:
+            if chapters_in_volume >= max_chapters_per_volume:
+                volume_index += 1
+                chapters_in_volume = 0

-        try:
-            for entry in os.listdir(self.book_base):
-                if entry.lower().startswith("volume_"):
-                    vol_dir = os.path.join(self.book_base, entry)
-                    dst = os.path.join(vol_dir, "cover.jpg")
-
-                    shutil.copyfile(src, dst)
-                    log(f"[CTRL] Cover replicated into: {dst}")
-
-        except Exception as e:
-            log(f"[CTRL] Cover replication failed: {e}")
-
-    # ---------------------------------------------------------
-    # Volume isolation
-    # ---------------------------------------------------------
-    def get_volume_path(self, chapter_num: int) -> str:
-        """Returns the correct volume directory for a chapter."""
-        vol_index = (chapter_num - 1) // self.max_vol + 1
-        vol_name = f"Volume_{vol_index:03d}"
-        vol_path = os.path.join(self.book_base, vol_name)
-        os.makedirs(vol_path, exist_ok=True)
-        return vol_path
-
-    # ---------------------------------------------------------
-    # Pipeline launcher
-    # ---------------------------------------------------------
-    def start(self):
-        total = len(self.chapters)
+            volume_path = os.path.join(self.output_root, f"Volume{volume_index}")
+            os.makedirs(volume_path, exist_ok=True)

-        log(
-            f"[CTRL] Initialising pipeline for '{self.title}' "
-            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
-        )
-        log(f"[CTRL] Output root: {self.book_base}")
+            # C&U FIX — scraper outputs key "num", NOT "number"
+            chapter_num = ch["num"]
+            self.volume_paths[chapter_num] = volume_path

-        # -------------------------------------
-        # NEW: Redis state update
-        # -------------------------------------
-        try:
-            set_status(self.book_id, "downloading")
-            set_chapter_total(self.book_id, total)
-            log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
-        except Exception as e:
-            log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")
+            chapters_in_volume += 1

-        # -------------------------------------
-        # 1) Download cover
-        # -------------------------------------
-        self.download_cover()
+        log(f"[CTRL] Volume assignment complete: {len(self.volume_paths)} chapters")

-        tasks = []
+    # -----------------------------------------------------------
+    # Build Celery pipelines (primitive-only)
+    # -----------------------------------------------------------
+    def build_pipelines(self):
+        pipelines = []

         for ch in self.chapters:
-            chapter_num = ch["num"]
-            chapter_url = ch["url"]
-
-            volume_path = self.get_volume_path(chapter_num)
-
-            tasks.append(
-                build_chapter_pipeline(
-                    self.book_id,
-                    chapter_num,
-                    chapter_url,
-                    volume_path,
-                    self.meta,
-                )
+            # C&U FIX — must use "num"
+            num = ch["num"]
+            url = ch["url"]
+            vp = self.volume_paths[num]
+
+            p = chain(
+                download_chapter.s(self.book_idx, num, url, vp),
+                parse_chapter.s(self.book_idx, num),
+                save_chapter.s(self.book_idx, num),
+                update_progress.s(self.book_idx),
             )
+            pipelines.append(p)

-        async_result = group(tasks).apply_async()
-
-        log(
-            f"[CTRL] Pipelines dispatched for '{self.title}' "
-            f"(book_id={self.book_id}, group_id={async_result.id})"
-        )
-
-        # Debug abort state
-        try:
-            abort_state = abort_requested(self.book_id)
-            log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
-        except Exception as e:
-            log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")
-
-        # -------------------------------------------------------
-        self.replicate_cover_to_volumes()
+        return pipelines

-        # -------------------------------------------------------
+    # -----------------------------------------------------------
+    # Launch Celery group(pipelines)
+    # -----------------------------------------------------------
+    def dispatch_pipelines(self, pipelines):
         try:
-            generate_all_scripts(
-                self.book_base,
-                self.title,
-                self.meta.get("author"),
-            )
-            log(f"[CTRL] Scripts generated for '{self.title}'")
-        except Exception as e:
-            log(f"[CTRL] Script generation failed: {e}")
-
-        return async_result
+            g = group(pipelines)
+            result = g.apply_async()
+            return result
+        except Exception as exc:
+            log(f"[CTRL] ERROR dispatching pipelines: {exc}")
+            raise
+
+    # -----------------------------------------------------------
+    # Legacy convenience entrypoint
+    # -----------------------------------------------------------
+    def start(self):
+        self.init_book_output()
+        self.assign_volumes()
+        return self.dispatch_pipelines(self.build_pipelines())
diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py
index fea3285..f43d3a9 100644
--- a/bookscraper/scraper/tasks/audio_tasks.py
+++ b/bookscraper/scraper/tasks/audio_tasks.py
@@ -1,5 +1,8 @@
 # ============================================================
 # File: scraper/tasks/audio_tasks.py
+# Purpose:
+#   Convert a saved chapter's text file into an audio file (.m4a)
+#   using macOS "say". Uses BookContext + ChapterContext.
 # ============================================================

 from celery_app import celery_app
@@ -12,29 +15,38 @@ from scraper.abort import abort_requested
 from redis import Redis
 from urllib.parse import urlparse

-# Kies lokale redis als aanwezig, anders standaard backend
-redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
-
-parsed = urlparse(redis_url)
+from scraper.progress import (
+    set_status,
+    set_last_update,
+    inc_audio_done,
+    save_skip_reason,
+)

 # ------------------------------------------------------------
-# REGULIER REDIS CLIENT (slots, file checks, state)
+# REDIS CLIENTS
 # ------------------------------------------------------------
+
+redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
+parsed = urlparse(redis_url)
+
+# Slot management DB (same as download slot handling)
 redis_client = Redis(
     host=parsed.hostname,
     port=parsed.port,
     db=parsed.path.strip("/"),
 )

-# ------------------------------------------------------------
-# BACKEND CLIENT (abort flags, progress counters) - altijd DB 0
-# ------------------------------------------------------------
+# Backend DB (abort + progress counters)
 backend_client = Redis(
     host=parsed.hostname,
     port=parsed.port,
     db=0,
 )

+# ------------------------------------------------------------
+# ENVIRONMENT
+# ------------------------------------------------------------
+
 AUDIO_TIMEOUT = int(os.getenv("AUDIO_TIMEOUT_SECONDS", "300"))
 AUDIO_VOICE = os.getenv("AUDIO_VOICE", "SinJi")
 AUDIO_RATE = int(os.getenv("AUDIO_RATE", "200"))
@@ -44,20 +56,54 @@ AUDIO_SLOTS = int(os.getenv("AUDIO_SLOTS", "1"))
 CONTAINER_PREFIX = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "/app/output")


+# ============================================================
+# CELERY TASK — NOW USING CONTEXT OBJECTS
+# ============================================================
+
+
 @celery_app.task(bind=True, queue="audio", ignore_result=True)
-def generate_audio(
-    self, book_id, volume_name, chapter_number, chapter_title, chapter_text
-):
-    log(f"[AUDIO] CH{chapter_number}: START task → raw_input={chapter_text}")
-
-    # Abort early
-    if abort_requested(book_id, backend_client):
-        log(f"[AUDIO] ABORT detected → skip CH{chapter_number}")
+def generate_audio(self, book_context, chapter_context):
+    """
+    Create audio using:
+      - book_context.book_idx
+      - chapter_context.number
+      - chapter_context.path (text filepath)
+    """
+
+    # ------------------------------------------------------------
+    # IDENTIFIERS
+    # ------------------------------------------------------------
+    book_idx = book_context.book_idx
+    chapter_number = chapter_context.number
+    text_file_path = chapter_context.path
+
+    log(f"[AUDIO] CH{chapter_number}: START task → {text_file_path}")
+
+    # ------------------------------------------------------------
+    # Update state: audio stage active
+    # ------------------------------------------------------------
+    try:
+        set_status(book_idx, "audio")
+        set_last_update(book_idx)
+    except Exception:
+        pass
+
+    # ------------------------------------------------------------
+    # Abort BEFORE doing anything
+    # ------------------------------------------------------------
+    if abort_requested(book_idx, backend_client):
+        log(f"[AUDIO] ABORT detected → skip chapter {chapter_number}")
+        try:
+            chapter_context.add_skip("audio", "abort")
+            save_skip_reason(book_idx, chapter_number, "audio_abort")
+            set_last_update(book_idx)
+        except Exception:
+            pass
         return

-    # ============================================================
+    # ------------------------------------------------------------
     # ACQUIRE AUDIO SLOT
-    # ============================================================
+    # ------------------------------------------------------------
     slot_key = None
     ttl = AUDIO_TIMEOUT + 15

@@ -68,11 +114,13 @@ def generate_audio(
             log(f"[AUDIO] CH{chapter_number}: Acquired slot {i}/{AUDIO_SLOTS}")
             break

+    # If no slot free → wait
     if slot_key is None:
         log(f"[AUDIO] CH{chapter_number}: All slots busy → waiting...")
         start_wait = time.time()

         while slot_key is None:
+            # retry each slot
             for i in range(1, AUDIO_SLOTS + 1):
                 key = f"audio_slot:{i}"
                 if redis_client.set(key, "1", nx=True, ex=ttl):
@@ -80,101 +128,135 @@ def generate_audio(
                     log(f"[AUDIO] CH{chapter_number}: Slot acquired after wait")
                     break

-            if slot_key:
-                break
-
-            if abort_requested(book_id, backend_client):
+            # abort while waiting
+            if abort_requested(book_idx, backend_client):
                 log(f"[AUDIO] ABORT while waiting → skip CH{chapter_number}")
+                try:
+                    chapter_context.add_skip("audio", "abort_wait")
+                    save_skip_reason(book_idx, chapter_number, "audio_abort_wait")
+                    set_last_update(book_idx)
+                except Exception:
+                    pass
                 return

+            # timeout
             if time.time() - start_wait > ttl:
-                log(f"[AUDIO] CH{chapter_number}: Slot wait timeout → aborting audio")
+                log(f"[AUDIO] CH{chapter_number}: WAIT TIMEOUT → aborting")
+                try:
+                    chapter_context.add_skip("audio", "timeout_wait")
+                    save_skip_reason(book_idx, chapter_number, "audio_timeout_wait")
+                    set_last_update(book_idx)
+                except Exception:
+                    pass
                 return

             time.sleep(0.25)

-    # ============================================================
-    # PATH NORMALISATION
-    # ============================================================
-
-    container_path = chapter_text
-
-    # Fix 1 — container_path kan None zijn → abort zonder crash
-    if not container_path:
-        log(f"[AUDIO] CH{chapter_number}: FATAL — no input path provided")
-        redis_client.delete(slot_key)
+    # ------------------------------------------------------------
+    # VALIDATE INPUT PATH
+    # ------------------------------------------------------------
+    if not text_file_path:
+        log(f"[AUDIO] CH{chapter_number}: No input path")
+        try:
+            chapter_context.add_skip("audio", "missing_input")
+            save_skip_reason(book_idx, chapter_number, "audio_missing_input")
+            set_last_update(book_idx)
+        except Exception:
+            pass
+        if slot_key:
+            redis_client.delete(slot_key)
         return

-    # Fix 2 — veilige startswith
-    if CONTAINER_PREFIX and container_path.startswith(CONTAINER_PREFIX):
-        relative_path = container_path[len(CONTAINER_PREFIX) :].lstrip("/")
+    # Convert container path → host FS path
+    if text_file_path.startswith(CONTAINER_PREFIX):
+        relative = text_file_path[len(CONTAINER_PREFIX) :].lstrip("/")
     else:
-        relative_path = container_path
-
-    parts = relative_path.split("/")
-    if len(parts) < 3:
-        log(
-            f"[AUDIO] CH{chapter_number}: FATAL — cannot parse book/volume from {relative_path}"
-        )
-        redis_client.delete(slot_key)
-        return
-
-    book_from_path = parts[0]
-    volume_from_path = parts[1]
-
-    host_path = os.path.join(HOST_PATH, relative_path)
-
-    # ============================================================
-    # OUTPUT PREP
-    # ============================================================
-
-    base_dir = os.path.join(HOST_PATH, book_from_path, volume_from_path, "Audio")
-    os.makedirs(base_dir, exist_ok=True)
+        relative = text_file_path
+
+    host_input_path = os.path.join(HOST_PATH, relative)
+
+    # ------------------------------------------------------------
+    # Determine output directory
+    # ------------------------------------------------------------
+    volume_name = os.path.basename(os.path.dirname(text_file_path))
+    audio_output_dir = os.path.join(
+        HOST_PATH,
+        os.path.basename(book_context.book_base),
+        volume_name,
+        "Audio",
+    )
+    os.makedirs(audio_output_dir, exist_ok=True)

-    safe_num = f"{chapter_number:04d}"
-    audio_file = os.path.join(base_dir, f"{safe_num}.m4a")
+    audio_file = os.path.join(audio_output_dir, f"{chapter_number:04d}.m4a")

+    # ------------------------------------------------------------
+    # Skip if output already exists
+    # ------------------------------------------------------------
     if os.path.exists(audio_file):
         log(f"[AUDIO] Skip CH{chapter_number} → already exists")
-        redis_client.delete(slot_key)
+        try:
+            chapter_context.add_skip("audio", "already_exists")
+            save_skip_reason(book_idx, chapter_number, "audio_exists")
+            set_last_update(book_idx)
+        except Exception:
+            pass
+        if slot_key:
+            redis_client.delete(slot_key)
         return

-    # ============================================================
-    # BUILD CMD
-    # ============================================================
-
+    # ------------------------------------------------------------
+    # BUILD COMMAND
+    # ------------------------------------------------------------
     cmd = (
         f"say --voice={AUDIO_VOICE} "
-        f"--input-file='{host_path}' "
+        f"--input-file='{host_input_path}' "
         f"--output-file='{audio_file}' "
-        f"--file-format=m4bf "
-        f"--quality=127 "
-        f"-r {AUDIO_RATE} "
-        f"--data-format=aac"
+        f"--file-format=m4bf --quality=127 "
+        f"-r {AUDIO_RATE} --data-format=aac"
     )

     log(f"[AUDIO] CH{chapter_number}: CMD = {cmd}")

-    # ============================================================
+    # ------------------------------------------------------------
     # RUN TTS
-    # ============================================================
+    # ------------------------------------------------------------
     try:
         subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
         log(f"[AUDIO] CH{chapter_number}: Completed")

+        inc_audio_done(book_idx)
+        set_last_update(book_idx)
+
+        chapter_context.audio_done = True
+
     except subprocess.TimeoutExpired:
-        log(f"[AUDIO] CH{chapter_number}: TIMEOUT → remove incomplete file")
+        log(f"[AUDIO] CH{chapter_number}: TIMEOUT → removing incomplete file")
+        try:
+            chapter_context.add_skip("audio", "timeout")
+            save_skip_reason(book_idx, chapter_number, "audio_timeout")
+            set_last_update(book_idx)
+        except Exception:
+            pass
         if os.path.exists(audio_file):
-            try:
-                os.remove(audio_file)
-            except Exception:
-                pass
+            os.remove(audio_file)

     except subprocess.CalledProcessError as e:
-        log(f"[AUDIO] CH{chapter_number}: ERROR during say → {e}")
+        log(f"[AUDIO] CH{chapter_number}: ERROR in say → {e}")
+        try:
+            chapter_context.add_skip("audio", f"cmd_error:{e}")
+            save_skip_reason(book_idx, chapter_number, "audio_cmd_error")
+            set_last_update(book_idx)
+        except Exception:
+            pass

     except Exception as e:
         log(f"[AUDIO] CH{chapter_number}: UNEXPECTED ERROR → {e}")
+        try:
+            chapter_context.add_skip("audio", f"unexpected:{e}")
+            save_skip_reason(book_idx, chapter_number, "audio_unexpected_error")
+            set_last_update(book_idx)
+        except Exception:
+            pass

     finally:
         if slot_key:
diff --git a/bookscraper/scraper/tasks/controller_tasks.py b/bookscraper/scraper/tasks/controller_tasks.py
index 0f06405..b7336c8 100644
--- a/bookscraper/scraper/tasks/controller_tasks.py
+++ b/bookscraper/scraper/tasks/controller_tasks.py
@@ -1,81 +1,98 @@
 # ============================================================
 # File: scraper/tasks/controller_tasks.py
 # Purpose:
-#   Start the download → parse → save pipeline for a scraped book,
-#   including progress/abort tracking via book_id.
-#   ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
+#   Launch the full download/parse/save pipeline.
+#
+#   JSON-safe Celery architecture:
+#     • controller receives primitive scrape_result
+#     • controller builds DownloadController locally
+#     • controller assigns volumes & prepares folders
+#     • Celery pipelines receive ONLY (book_idx, chapter_num)
 # ============================================================

 from celery_app import celery_app
 from logbus.publisher import log

 from scraper.download_controller import DownloadController
-from scraper.progress import (
-    set_total,
-)
+from scraper.progress import set_total, set_status
 from scraper.abort import abort_requested

-print(">>> [IMPORT] controller_tasks.py loaded")
+print(">>> [IMPORT] controller_tasks.py loaded (ID-only mode)")


 @celery_app.task(bind=True, queue="controller", ignore_result=False)
 def launch_downloads(self, book_id: str, scrape_result: dict):
-    """
-    Launch the entire pipeline (download → parse → save),
-    AND initialize progress counters.
-
-    Chapter-level progress is updated INSIDE the download/parse/save tasks.
-    This task MUST NOT call .get() on async subtasks (Celery restriction).
-    """
-
-    title = scrape_result.get("title", "UnknownBook")
-    chapters = scrape_result.get("chapters", []) or []
-    total = len(chapters)
-
-    log(f"[CTRL] Book '{title}' → {total} chapters (book_id={book_id})")
-
-    # ------------------------------------------------------------
-    # INIT PROGRESS
-    # ------------------------------------------------------------
-    set_total(book_id, total)
-    log(f"[CTRL] Progress initialized for {book_id}: total={total}")
-
-    # ------------------------------------------------------------
-    # BUILD CONTROLLER
-    # ------------------------------------------------------------
-    ctl = DownloadController(book_id, scrape_result)
-
-    # ------------------------------------------------------------
-    # START PIPELINES (ASYNC)
-    # Returns a celery group AsyncResult. We DO NOT iterate or get().
-    # Progress & failures are handled by the worker subtasks.
-    # ------------------------------------------------------------
+
+    title = scrape_result.get("title", "Unknown")
+    chapter_count = scrape_result.get("chapters", 0)
+    book_idx = scrape_result.get("book_id")
+
+    log(f"[CTRL] Book '{title}' → {chapter_count} chapters (book_idx={book_idx})")
+
+    # -----------------------------------------------------------
+    # Initialize progress counters
+    # -----------------------------------------------------------
+    try:
+        set_total(book_idx, chapter_count)
+        set_status(book_idx, "downloading")
+    except Exception as exc:
+        log(f"[CTRL] ERROR setting up progress counters: {exc}")
+        raise
+
+    # -----------------------------------------------------------
+    # Build controller
+    # -----------------------------------------------------------
+    try:
+        ctl = DownloadController(scrape_result)
+    except Exception as exc:
+        log(f"[CTRL] ERROR constructing DownloadController: {exc}")
+        raise
+
+    # -----------------------------------------------------------
+    # Prepare folders + cover (NEW: init_book_output)
+    # -----------------------------------------------------------
     try:
-        group_result = ctl.start()
+        ctl.init_book_output()
+    except Exception as exc:
+        log(f"[CTRL] ERROR initializing context: {exc}")
+        raise
+
+    log(f"[CTRL_STATE] init_book_output() completed for {title}")

-        log(
-            f"[CTRL] Pipelines dispatched for '{title}' "
-            f"(book_id={book_id}, group_id={group_result.id})"
-        )
+    # -----------------------------------------------------------
+    # Abort check BEFORE pipelines
+    # -----------------------------------------------------------
+    if abort_requested(book_idx):
+        log(f"[CTRL] ABORT requested BEFORE pipeline dispatch → stopping.")
+        set_status(book_idx, "aborted")
+        return {"status": "aborted"}

-        # Abort flag set BEFORE tasks start?
-        if abort_requested(book_id):
-            log(f"[CTRL] ABORT requested before tasks start")
-            return {"book_id": book_id, "aborted": True}
+    # -----------------------------------------------------------
+    # Assign volumes
+    # -----------------------------------------------------------
+    try:
+        ctl.assign_volumes()
+    except Exception as exc:
+        log(f"[CTRL] ERROR during volume assignment: {exc}")
+        raise

+    # -----------------------------------------------------------
+    # Build pipelines (primitive arguments)
+    # -----------------------------------------------------------
+    try:
+        tasks = ctl.build_pipelines()
+    except Exception as exc:
+        log(f"[CTRL] ERROR building pipelines: {exc}")
+        raise
+
+    # -----------------------------------------------------------
+    # Dispatch all pipelines
+    # -----------------------------------------------------------
+    try:
+        result_group = ctl.dispatch_pipelines(tasks)
     except Exception as exc:
-        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
+        log(f"[CTRL] ERROR dispatching pipelines: {exc}")
         raise

-    # ------------------------------------------------------------
-    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
-    # (Download/parse/save tasks update progress themselves)
-    # ------------------------------------------------------------
-    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
-
-    return {
-        "book_id": book_id,
-        "total": total,
-        "started": True,
-        "group_id": group_result.id,
-    }
+    log(f"[CTRL] Pipelines dispatched for {title} (book_idx={book_idx})")
+    return {"book_idx": book_idx, "pipelines": len(tasks)}
diff --git a/bookscraper/scraper/tasks/download_tasks.py b/bookscraper/scraper/tasks/download_tasks.py
|
|
|
index 5110483..1e01630 100644
|
|
|
--- a/bookscraper/scraper/tasks/download_tasks.py
|
|
|
+++ b/bookscraper/scraper/tasks/download_tasks.py
|
|
|
@@ -1,22 +1,41 @@
|
|
|
# ============================================================
|
|
|
# File: scraper/tasks/download_tasks.py
|
|
|
-# Purpose: Download chapter HTML with global concurrency,
|
|
|
-# retry/backoff logic, 429 support, and abort-awareness.
|
|
|
#
|
|
|
-# Logging:
|
|
|
-# - timestamp + book_id in de message
|
|
|
-# - message wordt via publisher.py naar console gestuurd
|
|
|
-# - message wordt via ui_log.push_ui naar Redis GUI logbuffer gestuurd
|
|
|
+# FINAL ARCHITECTURE — CELERY-SAFE VERSION
|
|
|
+#
|
|
|
+# This task downloads EXACTLY ONE CHAPTER using ONLY primitive
|
|
|
+# Celery-safe arguments:
|
|
|
+#
|
|
|
+# download_chapter(book_idx, chapter_num, chapter_url, volume_path)
|
|
|
+#
|
|
|
+# BookContext / ChapterContext are NOT passed into Celery and
|
|
|
+# NOT loaded inside workers. Workers ONLY manipulate:
|
|
|
+#
|
|
|
+# • Redis abort flags
|
|
|
+# • Redis chapter-start markers
|
|
|
+# • Redis progress counters
|
|
|
+# • save_path derived from volume_path + chapter_num
|
|
|
+#
|
|
|
+# The chapter HTML is written to disk and a small Redis skip-
|
|
|
+# reason is stored when needed. All other state lives outside
|
|
|
+# Celery.
|
|
|
+#
|
|
|
+# This is the architecture we agreed on to avoid JSON
|
|
|
+# serialization errors and to remove the need for
|
|
|
+# load_book_context/save_book_context.
|
|
|
#
|
|
|
-# publisher.py en ui_log.py blijven DOM.
|
|
|
# ============================================================
|
|
|
|
|
|
from celery_app import celery_app
|
|
|
from scraper.utils import get_save_path
|
|
|
from scraper.abort import abort_requested, chapter_started, mark_chapter_started
|
|
|
|
|
|
-from logbus.publisher import log # console logging (DOM)
|
|
|
-from scraper.ui_log import push_ui # GUI logging (DOM)
|
|
|
+from scraper.progress import (
|
|
|
+ inc_chapter_done,
|
|
|
+ save_skip_reason,
|
|
|
+)
|
|
|
+
|
|
|
+from logbus.publisher import log
|
|
|
|
|
|
import requests
|
|
|
import redis
|
|
|
@@ -25,50 +44,38 @@ import time
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
|
-print(">>> [IMPORT] download_tasks.py loaded")
|
|
|
+print(">>> [IMPORT] download_tasks.py loaded (final Celery-safe mode)")
|
|
|
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
-# TIMESTAMPED LOG WRAPPER
|
|
|
+# TIMESTAMPED LOGGER (book_idx ONLY)
|
|
|
# -----------------------------------------------------------
|
|
|
-def log_msg(book_id: str, message: str):
|
|
|
+def log_msg(book_idx: str, message: str):
|
|
|
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
- full = f"{ts} [{book_id}] {message}"
|
|
|
- log(full)
|
|
|
- push_ui(full)
|
|
|
+ log(f"{ts} [{book_idx}] {message}")
|
|
|
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
-# Retry parameters (ENV)
|
|
|
+# ENV SETTINGS
|
|
|
# -----------------------------------------------------------
|
|
|
MAX_RETRIES = int(os.getenv("DOWNLOAD_MAX_RETRIES", "7"))
|
|
|
BASE_DELAY = int(os.getenv("DOWNLOAD_BASE_DELAY", "2"))
|
|
|
BACKOFF = int(os.getenv("DOWNLOAD_BACKOFF_MULTIPLIER", "2"))
|
|
|
DELAY_429 = int(os.getenv("DOWNLOAD_429_DELAY", "10"))
|
|
|
|
|
|
-# -----------------------------------------------------------
|
|
|
-# Global concurrency
|
|
|
-# -----------------------------------------------------------
|
|
|
MAX_CONCURRENCY = int(os.getenv("DOWNLOAD_MAX_GLOBAL_CONCURRENCY", "1"))
|
|
|
-
|
|
|
-# -----------------------------------------------------------
|
|
|
-# Global delay sync
|
|
|
-# -----------------------------------------------------------
|
|
|
GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1"))
|
|
|
-DELAY_KEY = "download:delay_lock"
|
|
|
|
|
|
-# -----------------------------------------------------------
|
|
|
-# Redis
|
|
|
-# -----------------------------------------------------------
|
|
|
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
|
|
|
redis_client = redis.Redis.from_url(REDIS_URL)
|
|
|
|
|
|
SEM_KEY = "download:active"
|
|
|
+DELAY_KEY = "download:delay_lock"
|
|
|
|
|
|
|
|
|
-# ============================================================
|
|
|
-# GLOBAL DELAY FUNCTIONS
|
|
|
-# ============================================================
|
|
|
+# -----------------------------------------------------------
|
|
|
+# Delay + concurrency helpers
|
|
|
+# -----------------------------------------------------------
|
|
|
def wait_for_global_delay():
|
|
|
if GLOBAL_DELAY <= 0:
|
|
|
return
|
|
|
@@ -82,13 +89,10 @@ def set_global_delay():
|
|
|
redis_client.set(DELAY_KEY, "1", nx=True, ex=GLOBAL_DELAY)
|
|
|
|
|
|
|
|
|
-# ============================================================
|
|
|
-# GLOBAL CONCURRENCY FUNCTIONS
|
|
|
-# ============================================================
|
|
|
-def acquire_global_slot(max_slots: int, retry_delay: float = 0.5):
|
|
|
+def acquire_global_slot(max_slots: int, retry_delay=0.5):
|
|
|
while True:
|
|
|
- current = redis_client.incr(SEM_KEY)
|
|
|
- if current <= max_slots:
|
|
|
+ cur = redis_client.incr(SEM_KEY)
|
|
|
+ if cur <= max_slots:
|
|
|
return
|
|
|
redis_client.decr(SEM_KEY)
|
|
|
time.sleep(retry_delay)
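set_global_delay() above relies on Redis key expiry: SET with nx=True and ex=GLOBAL_DELAY creates a lock that disappears on its own once the minimum delay has passed. The waiting side is not fully visible in this hunk, so the polling loop below is only a sketch of the idea, with a placeholder Redis URL.

# Sketch of the delay-lock wait loop (assumption; the full body of
# wait_for_global_delay is not visible in this diff).
import time
import redis

r = redis.Redis.from_url("redis://localhost:6379/0")  # placeholder URL

def wait_until_expired(key: str, poll: float = 0.2):
    # The lock key was created with ex=GLOBAL_DELAY, so it simply
    # vanishes once the minimum delay is over.
    while r.exists(key):
        time.sleep(poll)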
|
|
|
@@ -98,81 +102,71 @@ def release_global_slot():
|
|
|
redis_client.decr(SEM_KEY)
|
|
|
|
|
|
|
|
|
-print(f">>> [CONFIG] Global concurrency = {MAX_CONCURRENCY}")
|
|
|
-print(f">>> [CONFIG] Global min delay = {GLOBAL_DELAY}s")
|
|
|
-print(
|
|
|
- f">>> [CONFIG] Retries: MAX={MAX_RETRIES}, base={BASE_DELAY}, "
|
|
|
- f"backoff={BACKOFF}, 429={DELAY_429}"
|
|
|
-)
|
|
|
-
|
|
|
-
|
|
|
# ============================================================
|
|
|
-# CELERY TASK: DOWNLOAD CHAPTER
|
|
|
+# CELERY TASK — PRIMITIVE ARG MODE
|
|
|
+#
|
|
|
+# book_idx: str Unique book identifier used for state tracking
|
|
|
+# chapter_num: int Chapter number (1-based)
|
|
|
+# chapter_url: str URL to download this chapter
|
|
|
+# volume_path: str Filesystem directory where this chapter belongs
|
|
|
# ============================================================
|
|
|
@celery_app.task(bind=True, queue="download", ignore_result=False)
|
|
|
def download_chapter(
|
|
|
- self, book_id: str, chapter_num: int, chapter_url: str, base_path: str
|
|
|
+ self, book_idx: str, chapter_num: int, chapter_url: str, volume_path: str
|
|
|
):
|
|
|
"""
|
|
|
- Download chapter HTML.
|
|
|
- Abort logic:
|
|
|
- - If abort active AND chapter not started → SKIP
|
|
|
- - If abort active BUT chapter already started → Proceed normally
|
|
|
+ Download a single chapter using ONLY Celery-safe primitives.
|
|
|
+ No BookContext or ChapterContext is ever loaded.
|
|
|
+
|
|
|
+ Writes:
|
|
|
+ <volume_path>/<chapter_num>.html
|
|
|
"""
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
- # ABORT BEFORE START
|
|
|
+ # Abort BEFORE start
|
|
|
# -----------------------------------------------------------
|
|
|
- if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
|
|
|
- msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
|
|
|
- log_msg(book_id, msg)
|
|
|
- return {
|
|
|
- "book_id": book_id,
|
|
|
- "chapter": chapter_num,
|
|
|
- "url": chapter_url,
|
|
|
- "html": None,
|
|
|
- "skipped": True,
|
|
|
- "path": None,
|
|
|
- "abort": True,
|
|
|
- }
|
|
|
-
|
|
|
- # Mark started
|
|
|
- mark_chapter_started(book_id, chapter_num)
|
|
|
+ if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
|
|
|
+ log_msg(book_idx, f"[ABORT] Skip chapter {chapter_num}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "abort_before_start")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ # Mark chapter as started
|
|
|
+ mark_chapter_started(book_idx, chapter_num)
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
- # NEW POSITION FOR SKIP BLOCK (before any delay logic)
|
|
|
+ # Path resolution
|
|
|
# -----------------------------------------------------------
|
|
|
- save_path = get_save_path(chapter_num, base_path)
|
|
|
+ save_path = get_save_path(chapter_num, volume_path)
|
|
|
+ os.makedirs(volume_path, exist_ok=True)
|
|
|
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Skip if file already exists
|
|
|
+ # -----------------------------------------------------------
|
|
|
if os.path.exists(save_path):
|
|
|
- log_msg(book_id, f"[DL] SKIP {chapter_num} (exists) → {save_path}")
|
|
|
- return {
|
|
|
- "book_id": book_id,
|
|
|
- "chapter": chapter_num,
|
|
|
- "url": chapter_url,
|
|
|
- "html": None,
|
|
|
- "skipped": True,
|
|
|
- "path": save_path,
|
|
|
- }
|
|
|
+ log_msg(book_idx, f"[DL] SKIP {chapter_num} (exists)")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "already_exists")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
- # Hard delay (only for real downloads)
|
|
|
+ # Delay + concurrency enforcement
|
|
|
# -----------------------------------------------------------
|
|
|
if GLOBAL_DELAY > 0:
|
|
|
time.sleep(GLOBAL_DELAY)
|
|
|
|
|
|
- # Sync delay
|
|
|
wait_for_global_delay()
|
|
|
-
|
|
|
- # Acquire concurrency slot
|
|
|
acquire_global_slot(MAX_CONCURRENCY)
|
|
|
- log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
|
|
|
|
|
|
+ log_msg(book_idx, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # HTTP Download
|
|
|
+ # -----------------------------------------------------------
|
|
|
try:
|
|
|
- # -----------------------------------------------------------
|
|
|
- # HTTP DOWNLOAD
|
|
|
- # -----------------------------------------------------------
|
|
|
- log_msg(book_id, f"[DL] Downloading chapter {chapter_num}: {chapter_url}")
|
|
|
+ log_msg(book_idx, f"[DL] GET chapter {chapter_num}: {chapter_url}")
|
|
|
|
|
|
resp = requests.get(
|
|
|
chapter_url,
|
|
|
@@ -184,42 +178,31 @@ def download_chapter(
|
|
|
resp.encoding = resp.apparent_encoding or "gb2312"
|
|
|
html = resp.text
|
|
|
|
|
|
- log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
|
|
|
+ # Write file
|
|
|
+ with open(save_path, "w", encoding="utf-8") as f:
|
|
|
+ f.write(html)
|
|
|
|
|
|
- return {
|
|
|
- "book_id": book_id,
|
|
|
- "chapter": chapter_num,
|
|
|
- "url": chapter_url,
|
|
|
- "html": html,
|
|
|
- "skipped": False,
|
|
|
- "path": save_path,
|
|
|
- }
|
|
|
+ log_msg(book_idx, f"[DL] OK {chapter_num}: {len(html)} bytes")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
except Exception as exc:
|
|
|
- attempt = self.request.retries
|
|
|
- delay = BASE_DELAY * (BACKOFF**attempt)
|
|
|
-
|
|
|
- # 429 hard block
|
|
|
+ # 429: Too Many Requests
|
|
|
if getattr(getattr(exc, "response", None), "status_code", None) == 429:
|
|
|
- log_msg(
|
|
|
- book_id,
|
|
|
- f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s "
|
|
|
- f"(attempt {attempt}/{MAX_RETRIES})",
|
|
|
- )
|
|
|
-
|
|
|
+ log_msg(book_idx, f"[DL] 429 → wait {DELAY_429}s")
|
|
|
time.sleep(DELAY_429)
|
|
|
set_global_delay()
|
|
|
raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
|
|
|
|
|
|
- # Normal error
|
|
|
+ # Standard error
|
|
|
+ delay = BASE_DELAY * (BACKOFF ** min(self.request.retries, 5))
|
|
|
log_msg(
|
|
|
- book_id,
|
|
|
+ book_idx,
|
|
|
f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s "
|
|
|
- f"(attempt {attempt}/{MAX_RETRIES})",
|
|
|
+ f"(attempt {self.request.retries + 1}/{MAX_RETRIES})",
|
|
|
)
|
|
|
+
|
|
|
raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
|
|
|
|
|
|
finally:
|
|
|
- set_global_delay()
|
|
|
release_global_slot()
|
|
|
- log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")
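With the primitive-argument signature above, queueing a chapter needs nothing but JSON-safe values. The book id, URL and path below are placeholders.

# Example enqueue call; all argument values are placeholders.
from scraper.tasks.download_tasks import download_chapter

download_chapter.apply_async(
    args=["book-0001", 12,
          "https://example.com/book/123/chapter-12.html",
          "/data/output/book-0001/volume_01"],
    queue="download",
)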
|
|
|
diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py
|
|
|
index 52066f9..7460f70 100644
|
|
|
--- a/bookscraper/scraper/tasks/parse_tasks.py
|
|
|
+++ b/bookscraper/scraper/tasks/parse_tasks.py
|
|
|
@@ -1,155 +1,132 @@
|
|
|
-# =========================================================
|
|
|
+# ============================================================
|
|
|
# File: scraper/tasks/parse_tasks.py
|
|
|
-# Purpose: Parse downloaded HTML into clean chapter text.
|
|
|
-# Enhanced version: Piaotia H1→content extractor + clean pipeline
|
|
|
-# NO HARDCODED REPLACEMENTS — everything comes from replacement files
|
|
|
-# =========================================================
|
|
|
+#
|
|
|
+# FINAL ARCHITECTURE — CELERY-SAFE VERSION
|
|
|
+#
|
|
|
+# This task parses a downloaded chapter using ONLY primitive,
|
|
|
+# Celery-safe arguments:
|
|
|
+#
|
|
|
+# parse_chapter(book_idx, chapter_num, html_path, text_path)
|
|
|
+#
|
|
|
+# NO BookContext or ChapterContext are ever loaded.
|
|
|
+#
|
|
|
+# Responsibilities:
|
|
|
+# • Read HTML file from disk
|
|
|
+# • Convert to cleaned text
|
|
|
+# • Write .txt file back to disk
|
|
|
+# • Update Redis progress counters
|
|
|
+# • Abort-aware
|
|
|
+# • Skip if HTML missing
|
|
|
+#
|
|
|
+# This matches the structure of download_tasks.py and avoids all
|
|
|
+# serialization issues.
|
|
|
+# ============================================================
|
|
|
|
|
|
from celery_app import celery_app
|
|
|
-from bs4 import BeautifulSoup
|
|
|
|
|
|
-from scraper.utils import clean_text, load_all_replacements
|
|
|
-from scraper.tasks.download_tasks import log_msg # unified logger
|
|
|
+from scraper.utils import clean_text
|
|
|
+from scraper.abort import abort_requested, chapter_started, mark_chapter_started
|
|
|
+from scraper.progress import (
|
|
|
+ inc_chapter_done,
|
|
|
+ save_skip_reason,
|
|
|
+)
|
|
|
|
|
|
-print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
|
|
|
+from logbus.publisher import log
|
|
|
|
|
|
+import os
|
|
|
+from datetime import datetime
|
|
|
|
|
|
+
|
|
|
+print(">>> [IMPORT] parse_tasks.py loaded (final Celery-safe mode)")
|
|
|
+
|
|
|
+
|
|
|
+# -----------------------------------------------------------
|
|
|
+# TIMESTAMPED LOGGER (book_idx ONLY)
|
|
|
+# -----------------------------------------------------------
|
|
|
+def log_msg(book_idx: str, message: str):
|
|
|
+ ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+ log(f"{ts} [{book_idx}] {message}")
|
|
|
+
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# CELERY TASK — PRIMITIVE ARGUMENT MODE
|
|
|
+#
|
|
|
+# book_idx: str Unique Redis state key
|
|
|
+# chapter_num: int Chapter number (1-based)
|
|
|
+# html_path: str Path to downloaded HTML file
|
|
|
+# text_path: str Path where parsed/cleaned text must be written
|
|
|
+# ============================================================
|
|
|
@celery_app.task(bind=True, queue="parse", ignore_result=False)
|
|
|
-def parse_chapter(self, download_result: dict, meta: dict):
|
|
|
-
|
|
|
- book_id = download_result.get("book_id", "NOBOOK")
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # SKIPPED DOWNLOAD → SKIP PARSE
|
|
|
- # ------------------------------------------------------------
|
|
|
- if download_result.get("skipped"):
|
|
|
- chapter = download_result.get("chapter")
|
|
|
- log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
|
|
|
- download_result["book_id"] = book_id
|
|
|
- return download_result
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # NORMAL PARSE
|
|
|
- # ------------------------------------------------------------
|
|
|
- chapter_num = download_result["chapter"]
|
|
|
- chapter_url = download_result["url"]
|
|
|
- html = download_result["html"]
|
|
|
-
|
|
|
- log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")
|
|
|
-
|
|
|
- soup = BeautifulSoup(html, "lxml")
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # STRICT SELECTORS (direct content blocks)
|
|
|
- # ------------------------------------------------------------
|
|
|
- selectors = [
|
|
|
- "#content",
|
|
|
- "div#content",
|
|
|
- ".content",
|
|
|
- "div.content",
|
|
|
- "#chaptercontent",
|
|
|
- "div#chaptercontent",
|
|
|
- "#chapterContent",
|
|
|
- ".read-content",
|
|
|
- "div.read-content",
|
|
|
- ]
|
|
|
-
|
|
|
- node = None
|
|
|
- for sel in selectors:
|
|
|
- tmp = soup.select_one(sel)
|
|
|
- if tmp:
|
|
|
- node = tmp
|
|
|
- break
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # PIAOTIA FALLBACK:
|
|
|
- # Extract content between <H1> and the "bottomlink" block.
|
|
|
- # ------------------------------------------------------------
|
|
|
- raw = None
|
|
|
- if node is None:
|
|
|
- h1 = soup.find("h1")
|
|
|
- if h1:
|
|
|
- content_parts = []
|
|
|
- for sib in h1.next_siblings:
|
|
|
-
|
|
|
- # stop at bottom navigation/footer block
|
|
|
- sib_class = getattr(sib, "get", lambda *_: None)("class")
|
|
|
- if sib_class and (
|
|
|
- "bottomlink" in sib_class or sib_class == "bottomlink"
|
|
|
- ):
|
|
|
- break
|
|
|
-
|
|
|
- # ignore typical noise containers
|
|
|
- if getattr(sib, "name", None) in ["script", "style", "center"]:
|
|
|
- continue
|
|
|
-
|
|
|
- if hasattr(sib, "get_text"):
|
|
|
- content_parts.append(sib.get_text(separator="\n"))
|
|
|
- else:
|
|
|
- content_parts.append(str(sib))
|
|
|
-
|
|
|
- raw = "\n".join(content_parts)
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # FINAL FALLBACK
|
|
|
- # ------------------------------------------------------------
|
|
|
- if raw is None:
|
|
|
- if node:
|
|
|
- raw = node.get_text(separator="\n")
|
|
|
- else:
|
|
|
- # drop scripts & styles
|
|
|
- for tag in soup(["script", "style", "noscript"]):
|
|
|
- tag.decompose()
|
|
|
-
|
|
|
- raw = soup.get_text(separator="\n")
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # MULTIPASS CLEANING via replacement files ONLY
|
|
|
- # ------------------------------------------------------------
|
|
|
- REPL = load_all_replacements()
|
|
|
-
|
|
|
- text = raw
|
|
|
- for _ in range(5): # like the C# CleanText loop
|
|
|
- text = clean_text(text, REPL)
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # Collapse excessive empty lines
|
|
|
- # ------------------------------------------------------------
|
|
|
- cleaned = []
|
|
|
- prev_blank = False
|
|
|
-
|
|
|
- for line in text.split("\n"):
|
|
|
- stripped = line.rstrip()
|
|
|
- if stripped == "":
|
|
|
- if prev_blank:
|
|
|
- continue
|
|
|
- prev_blank = True
|
|
|
- cleaned.append("")
|
|
|
- else:
|
|
|
- prev_blank = False
|
|
|
- cleaned.append(stripped)
|
|
|
-
|
|
|
- text = "\n".join(cleaned)
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # Add header to chapter 1
|
|
|
- # ------------------------------------------------------------
|
|
|
- if chapter_num == 1:
|
|
|
- book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
|
|
|
- header = (
|
|
|
- f"{meta.get('title','')}\n"
|
|
|
- f"Author: {meta.get('author','')}\n"
|
|
|
- f"Description:\n{meta.get('description','')}\n"
|
|
|
- f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
|
|
|
- )
|
|
|
- text = header + text
|
|
|
-
|
|
|
- log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
|
|
|
-
|
|
|
- return {
|
|
|
- "book_id": book_id,
|
|
|
- "chapter": chapter_num,
|
|
|
- "url": chapter_url,
|
|
|
- "text": text,
|
|
|
- "length": len(text),
|
|
|
- }
|
|
|
+def parse_chapter(
|
|
|
+ self, book_idx: str, chapter_num: int, html_path: str, text_path: str
|
|
|
+):
|
|
|
+ """
|
|
|
+ Parse a downloaded chapter using ONLY primitive arguments.
|
|
|
+ Converts HTML → cleaned text and writes it to disk.
|
|
|
+ """
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Abort BEFORE start
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
|
|
|
+ log_msg(book_idx, f"[ABORT] Skip PARSE {chapter_num}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "abort_before_start")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ # Mark as started
|
|
|
+ mark_chapter_started(book_idx, chapter_num)
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Check if HTML file exists
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ if not os.path.exists(html_path):
|
|
|
+ log_msg(book_idx, f"[PARSE] SKIP {chapter_num} (no HTML file)")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "no_html_file")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Load HTML
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ try:
|
|
|
+ with open(html_path, "r", encoding="utf-8") as f:
|
|
|
+ raw_html = f.read()
|
|
|
+ except Exception as exc:
|
|
|
+ log_msg(book_idx, f"[PARSE] ERROR reading HTML {chapter_num}: {exc}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, f"read_error: {exc}")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ if not raw_html.strip():
|
|
|
+ log_msg(book_idx, f"[PARSE] SKIP {chapter_num} (empty HTML)")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "html_empty")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Clean HTML → Text
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ try:
|
|
|
+ log_msg(book_idx, f"[PARSE] Start {chapter_num}")
|
|
|
+
|
|
|
+ cleaned = clean_text(raw_html)
|
|
|
+
|
|
|
+ os.makedirs(os.path.dirname(text_path), exist_ok=True)
|
|
|
+ with open(text_path, "w", encoding="utf-8") as f:
|
|
|
+ f.write(cleaned)
|
|
|
+
|
|
|
+ log_msg(book_idx, f"[PARSE] OK {chapter_num}: {len(cleaned)} chars")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ except Exception as exc:
|
|
|
+ log_msg(book_idx, f"[PARSE] ERROR {chapter_num}: {exc}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, f"parse_error: {exc}")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
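The task above calls a one-argument clean_text(raw_html) from scraper.utils, which is not part of this diff. A minimal sketch of what such a helper could look like follows; the tag-stripping approach is an assumption, not the project's actual implementation.

# Hypothetical single-argument cleaner; NOT the real scraper.utils.clean_text.
from bs4 import BeautifulSoup

def clean_text(raw_html: str) -> str:
    soup = BeautifulSoup(raw_html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()                        # drop non-content tags
    lines = (ln.strip() for ln in soup.get_text(separator="\n").splitlines())
    return "\n".join(ln for ln in lines if ln)  # drop blank lines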
|
|
|
diff --git a/bookscraper/scraper/tasks/pipeline.py b/bookscraper/scraper/tasks/pipeline.py
|
|
|
index 9da657e..2dae558 100644
|
|
|
--- a/bookscraper/scraper/tasks/pipeline.py
|
|
|
+++ b/bookscraper/scraper/tasks/pipeline.py
|
|
|
@@ -1,17 +1,19 @@
|
|
|
-# =========================================================
|
|
|
+# ============================================================
|
|
|
# File: scraper/tasks/pipeline.py
|
|
|
# Purpose:
|
|
|
-# Build Celery chains for chapter processing.
|
|
|
+# Build a per-chapter Celery pipeline using ONLY JSON-safe
|
|
|
+# arguments: (book_idx, chapter_num).
|
|
|
#
|
|
|
-# Chain:
|
|
|
-# download_chapter(book_id, chapter_num, url, base_path)
|
|
|
-# → parse_chapter(download_result, meta)
|
|
|
-# → save_chapter(parsed_result, base_path)
|
|
|
-# → update_progress(final_result, book_id)
|
|
|
+# Pipeline:
|
|
|
+# download_chapter(book_idx, chapter_num)
|
|
|
+# → parse_chapter(book_idx, chapter_num)
|
|
|
+# → save_chapter(book_idx, chapter_num)
|
|
|
+# → update_progress(book_idx)
|
|
|
#
|
|
|
-# All subtasks must pass through result dicts untouched so the
|
|
|
-# next stage receives the correct fields.
|
|
|
-# =========================================================
|
|
|
+# No BookContext/ChapterContext objects are passed through
|
|
|
+# Celery anymore; every task receives only primitive,
|
|
|
+# JSON-safe arguments.
|
|
|
+# ============================================================
|
|
|
|
|
|
from celery import chain
|
|
|
|
|
|
@@ -21,25 +23,41 @@ from scraper.tasks.save_tasks import save_chapter
|
|
|
from scraper.tasks.progress_tasks import update_progress
|
|
|
|
|
|
|
|
|
-def build_chapter_pipeline(
|
|
|
- book_id: str,
|
|
|
- chapter_number: int,
|
|
|
- chapter_url: str,
|
|
|
- base_path: str,
|
|
|
- meta: dict,
|
|
|
-):
|
|
|
+print(">>> [IMPORT] pipeline.py loaded (ID-only mode)")
|
|
|
+
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# Build chapter pipeline
|
|
|
+# ============================================================
|
|
|
+def build_chapter_pipeline(book_idx: str, chapter_num: int):
|
|
|
"""
|
|
|
- Build a Celery chain for one chapter.
|
|
|
+ Constructs the Celery chain for a single chapter.
|
|
|
|
|
|
- download_chapter(book_id, chapter_number, chapter_url, base_path)
|
|
|
- → parse_chapter(download_result, meta)
|
|
|
- → save_chapter(parsed_result, base_path)
|
|
|
- → update_progress(result, book_id)
|
|
|
+ Every task receives only JSON-safe, primitive arguments;
|
|
|
+ no BookContext or ChapterContext objects travel through Celery.
|
|
|
"""
|
|
|
|
|
|
return chain(
|
|
|
- download_chapter.s(book_id, chapter_number, chapter_url, base_path),
|
|
|
- parse_chapter.s(meta),
|
|
|
- save_chapter.s(base_path),
|
|
|
- update_progress.s(book_id),
|
|
|
+ download_chapter.s(book_idx, chapter_num),
|
|
|
+ parse_chapter.s(book_idx, chapter_num),
|
|
|
+ save_chapter.s(book_idx, chapter_num),
|
|
|
+ # update_progress only needs book_idx.
|
|
|
+ # BUT the chain forwards the previous task's result, so it accepts *args.
|
|
|
+ update_progress.s(book_idx),
|
|
|
)
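Note that in a Celery chain every `.s()` signature receives the previous task's return value as an extra first positional argument. When result forwarding is not wanted, the usual Celery idiom is an immutable signature built with `.si()`; the snippet below shows that idiom with placeholder values and is not what build_chapter_pipeline above does.

# General Celery idiom: .si() builds an immutable signature, so the
# previous task's result is NOT prepended to the arguments.
from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter

pipeline = chain(
    download_chapter.si("book-0001", 1,
                        "https://example.com/chapter-1.html",
                        "/data/output/book-0001/volume_01"),
    parse_chapter.si("book-0001", 1,
                     "/data/output/book-0001/volume_01/1.html",
                     "/data/output/book-0001/volume_01/1.txt"),
)
pipeline.apply_async()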
|
|
|
+
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# Build pipelines for all chapters
|
|
|
+# (usually called from download_controller)
|
|
|
+# ============================================================
|
|
|
+def build_all_pipelines(book_idx: str, chapters):
|
|
|
+ """
|
|
|
+ Utility: given a list of chapter numbers, build a list of chains.
|
|
|
+
|
|
|
+ chapters = [1, 2, 3, ...]
|
|
|
+ """
|
|
|
+ pipelines = []
|
|
|
+ for ch in chapters:
|
|
|
+ pipelines.append(build_chapter_pipeline(book_idx, ch))
|
|
|
+ return pipelines
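A typical caller fires all of these chains at once by wrapping them in a Celery group; a short usage sketch with placeholder values:

# Usage sketch: dispatch every chapter chain in parallel.
from celery import group
from scraper.tasks.pipeline import build_all_pipelines

pipelines = build_all_pipelines("book-0001", chapters=[1, 2, 3])
group(pipelines).apply_async()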
|
|
|
diff --git a/bookscraper/scraper/tasks/progress_tasks.py b/bookscraper/scraper/tasks/progress_tasks.py
|
|
|
index 9045fab..717a70c 100644
|
|
|
--- a/bookscraper/scraper/tasks/progress_tasks.py
|
|
|
+++ b/bookscraper/scraper/tasks/progress_tasks.py
|
|
|
@@ -1,43 +1,72 @@
|
|
|
# ============================================================
|
|
|
# File: scraper/tasks/progress_tasks.py
|
|
|
-# Purpose: Central progress updater for chapter pipelines.
|
|
|
+# Purpose:
|
|
|
+# Update pipeline progress after each chapter finishes.
|
|
|
+#
|
|
|
+# MUST accept chain-call semantics:
|
|
|
+# update_progress(previous_result, book_idx)
|
|
|
+#
|
|
|
+# Only book_idx is meaningful; previous_result is ignored.
|
|
|
+#
|
|
|
+# JSON-safe: no BookContext or ChapterContext objects are
|
|
|
+# ever passed into Celery tasks.
|
|
|
# ============================================================
|
|
|
|
|
|
from celery_app import celery_app
|
|
|
-from scraper.progress import inc_completed, inc_skipped, inc_failed
|
|
|
+from scraper.progress import inc_chapter_done, set_status, get_progress
|
|
|
+
|
|
|
from logbus.publisher import log
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
+
|
|
|
+print(">>> [IMPORT] progress_tasks.py loaded (ID-only mode)")
|
|
|
|
|
|
-print(">>> [IMPORT] progress_tasks.py loaded")
|
|
|
|
|
|
+# -----------------------------------------------------------
|
|
|
+# TIMESTAMPED LOGGER (book_idx ONLY)
|
|
|
+# -----------------------------------------------------------
|
|
|
+def log_msg(book_idx: str, msg: str):
|
|
|
+ ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+ log(f"{ts} [{book_idx}] {msg}")
|
|
|
|
|
|
-@celery_app.task(bind=False, name="progress.update", queue="controller")
|
|
|
-def update_progress(result: dict, book_id: str):
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# CELERY TASK — must accept chain semantics
|
|
|
+# ============================================================
|
|
|
+@celery_app.task(bind=True, queue="progress", ignore_result=False)
|
|
|
+def update_progress(self, *args):
|
|
|
"""
|
|
|
- Central progress logic:
|
|
|
- - result: output of save_chapter
|
|
|
- - book_id: explicitly passed by pipeline
|
|
|
+ Chain-safe progress update.
|
|
|
+
|
|
|
+ Celery chain will call:
|
|
|
+ update_progress(previous_result, book_idx)
|
|
|
+
|
|
|
+ Therefore:
|
|
|
+ book_idx = args[-1]
|
|
|
|
|
|
- IMPORTANT:
|
|
|
- - save_chapter already updates counters for skipped & normal chapters
|
|
|
- - progress.update MUST NOT double-increment
|
|
|
+ This increments the done counter and sets status to
|
|
|
+ "completed" once all chapters are done.
|
|
|
"""
|
|
|
|
|
|
- ch = result.get("chapter")
|
|
|
- skipped = result.get("skipped", False)
|
|
|
- failed = result.get("failed", False)
|
|
|
+ if not args:
|
|
|
+ return None
|
|
|
+
|
|
|
+ # Last argument is ALWAYS the book_idx
|
|
|
+ book_idx = args[-1]
|
|
|
+
|
|
|
+ # Increment chapter_done counter
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
|
|
|
- if failed:
|
|
|
- inc_failed(book_id)
|
|
|
- log(f"[PROG] FAILED chapter {ch}")
|
|
|
+ # Fetch progress counters
|
|
|
+ prog = get_progress(book_idx)
|
|
|
+ done = prog.get("done", 0)
|
|
|
+ total = prog.get("total", 0)
|
|
|
|
|
|
- elif skipped:
|
|
|
- # save_chapter already did:
|
|
|
- # inc_skipped + inc_completed
|
|
|
- log(f"[PROG] SKIPPED chapter {ch}")
|
|
|
+ log_msg(book_idx, f"[PROGRESS] Updated: {done}/{total}")
|
|
|
|
|
|
- else:
|
|
|
- # Normal completion: save_chapter only does inc_completed
|
|
|
- inc_completed(book_id)
|
|
|
- log(f"[PROG] DONE chapter {ch}")
|
|
|
+ # If finished → update status
|
|
|
+ if total > 0 and done >= total:
|
|
|
+ set_status(book_idx, "completed")
|
|
|
+ log_msg(book_idx, "[PROGRESS] All chapters completed")
|
|
|
|
|
|
- return result
|
|
|
+ return None
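The helpers imported from scraper.progress (inc_chapter_done, get_progress, set_status) are not part of this diff. Below is a minimal sketch of how they could map onto the progress:<book_idx>:* keys that scraping.py initialises; the status key and all other details here are assumptions, not the actual module.

# Hypothetical sketch of scraper.progress; not the actual module.
import os
import redis

r = redis.Redis.from_url(os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
                         decode_responses=True)

def inc_chapter_done(book_idx: str) -> int:
    return r.incr(f"progress:{book_idx}:done")

def get_progress(book_idx: str) -> dict:
    return {
        "done": int(r.get(f"progress:{book_idx}:done") or 0),
        "total": int(r.get(f"progress:{book_idx}:total") or 0),
    }

def set_status(book_idx: str, status: str) -> None:
    r.set(f"progress:{book_idx}:status", status)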
|
|
|
diff --git a/bookscraper/scraper/tasks/save_tasks.py b/bookscraper/scraper/tasks/save_tasks.py
|
|
|
index 8aa0578..e615e9a 100644
|
|
|
--- a/bookscraper/scraper/tasks/save_tasks.py
|
|
|
+++ b/bookscraper/scraper/tasks/save_tasks.py
|
|
|
@@ -1,123 +1,128 @@
|
|
|
# ============================================================
|
|
|
# File: scraper/tasks/save_tasks.py
|
|
|
-# Purpose: Save parsed chapter text to disk + trigger audio.
|
|
|
+#
|
|
|
+# FINAL ARCHITECTURE — CELERY-SAFE VERSION
|
|
|
+#
|
|
|
+# save_chapter(book_idx, chapter_num, text_path, json_path)
|
|
|
+#
|
|
|
+# Deze task slaat GEEN BookContext meer op, en laadt hem ook niet.
|
|
|
+# Alle data komt uit PRIMITIEVE argumenten zodat Celery nooit
|
|
|
+# hoeft te serialiseren of picklen.
|
|
|
+#
|
|
|
+# Functionaliteit:
|
|
|
+# • Abort-aware
|
|
|
+# • Skip als text ontbreekt
|
|
|
+# • Schrijft JSON chapter-object naar disk
|
|
|
+# • Houdt Redis progress state bij
|
|
|
+#
|
|
|
+# Deze versie is 100% consistent met download → parse → save pipeline.
|
|
|
# ============================================================
|
|
|
|
|
|
-print(">>> [IMPORT] save_tasks.py loaded")
|
|
|
+from celery_app import celery_app
|
|
|
|
|
|
-from celery import shared_task
|
|
|
+from scraper.abort import abort_requested, chapter_started, mark_chapter_started
|
|
|
+from scraper.progress import inc_chapter_done, save_skip_reason
|
|
|
+
|
|
|
+from logbus.publisher import log
|
|
|
+import json
|
|
|
import os
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
|
|
|
-from scraper.utils import get_save_path
|
|
|
-from scraper.tasks.download_tasks import log_msg # unified logger
|
|
|
-from scraper.progress import (
|
|
|
- inc_completed,
|
|
|
- inc_skipped,
|
|
|
- inc_failed,
|
|
|
- add_failed_chapter,
|
|
|
-)
|
|
|
+print(">>> [IMPORT] save_tasks.py loaded (final Celery-safe mode)")
|
|
|
|
|
|
-from scraper.tasks.audio_tasks import generate_audio
|
|
|
|
|
|
+# -----------------------------------------------------------
|
|
|
+# TIMESTAMP LOGGER
|
|
|
+# -----------------------------------------------------------
|
|
|
+def log_msg(book_idx: str, message: str):
|
|
|
+ ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+ log(f"{ts} [{book_idx}] {message}")
|
|
|
|
|
|
-@shared_task(bind=True, queue="save", ignore_result=False)
|
|
|
-def save_chapter(self, parsed: dict, base_path: str):
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# CELERY TASK — primitive arguments only
|
|
|
+#
|
|
|
+# book_idx: str
|
|
|
+# chapter_num: int
|
|
|
+# text_path: str path to parsed .txt file
|
|
|
+# json_path: str output path for chapter JSON model
|
|
|
+# ============================================================
|
|
|
+@celery_app.task(bind=True, queue="save", ignore_result=False)
|
|
|
+def save_chapter(self, book_idx: str, chapter_num: int, text_path: str, json_path: str):
|
|
|
"""
|
|
|
- Save parsed chapter text to disk.
|
|
|
-
|
|
|
- parsed = {
|
|
|
- "book_id": str,
|
|
|
- "chapter": int,
|
|
|
- "text": str,
|
|
|
- "url": str,
|
|
|
- "skipped": bool,
|
|
|
- "path": optional str
|
|
|
- }
|
|
|
+ Save parsed chapter text + metadata to JSON on disk.
|
|
|
+ No BookContext is loaded or saved.
|
|
|
"""
|
|
|
|
|
|
- book_id = parsed.get("book_id", "NOBOOK")
|
|
|
- chapter = parsed.get("chapter")
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # SKIP CASE (download or parse skipped the chapter)
|
|
|
- # ------------------------------------------------------------
|
|
|
- if parsed.get("skipped"):
|
|
|
- path = parsed.get("path", "(no-path)")
|
|
|
- log_msg(book_id, f"[SAVE] SKIP chapter {chapter} → {path}")
|
|
|
-
|
|
|
- inc_skipped(book_id)
|
|
|
-
|
|
|
- # Determine volume name from the base path
|
|
|
- volume_name = os.path.basename(base_path.rstrip("/"))
|
|
|
-
|
|
|
- # Queue audio using the existing saved file
|
|
|
- try:
|
|
|
- generate_audio.delay(
|
|
|
- book_id,
|
|
|
- volume_name,
|
|
|
- chapter,
|
|
|
- f"Chapter {chapter}",
|
|
|
- path, # <<-- correct: this is always the real file path
|
|
|
- )
|
|
|
- log_msg(
|
|
|
- book_id,
|
|
|
- f"[AUDIO] Task queued (SKIPPED) for chapter {chapter} in {volume_name}",
|
|
|
- )
|
|
|
- except Exception as audio_exc:
|
|
|
- log_msg(
|
|
|
- book_id,
|
|
|
- f"[AUDIO] ERROR queueing (SKIPPED) chapter {chapter}: {audio_exc}",
|
|
|
- )
|
|
|
-
|
|
|
- return {
|
|
|
- "book_id": book_id, # <<< FIXED
|
|
|
- "chapter": chapter,
|
|
|
- "path": path,
|
|
|
- "skipped": True,
|
|
|
- }
|
|
|
-
|
|
|
- # ------------------------------------------------------------
|
|
|
- # NORMAL SAVE CASE
|
|
|
- # ------------------------------------------------------------
|
|
|
- try:
|
|
|
- text = parsed.get("text", "")
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Abort BEFORE the task starts
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ if abort_requested(book_idx) and not chapter_started(book_idx, chapter_num):
|
|
|
+ log_msg(book_idx, f"[ABORT] Skip SAVE {chapter_num}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, "abort_before_start")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
- if chapter is None:
|
|
|
- raise ValueError("Missing chapter number in parsed payload")
|
|
|
+ # Mark chapter as started
|
|
|
+ mark_chapter_started(book_idx, chapter_num)
|
|
|
|
|
|
- # Ensure chapter folder exists
|
|
|
- os.makedirs(base_path, exist_ok=True)
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Ensure parsed text exists
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ if not os.path.exists(text_path):
|
|
|
+ log_msg(book_idx, f"[SAVE] SKIP {chapter_num} (missing text file)")
|
|
|
|
|
|
- # Build chapter file path
|
|
|
- path = get_save_path(chapter, base_path)
|
|
|
+ save_skip_reason(book_idx, chapter_num, "no_text_file")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
- # Save chapter text to disk
|
|
|
- with open(path, "w", encoding="utf-8") as f:
|
|
|
- f.write(text)
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Read parsed text
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ try:
|
|
|
+ with open(text_path, "r", encoding="utf-8") as f:
|
|
|
+ text = f.read()
|
|
|
+ except Exception as exc:
|
|
|
+ log_msg(book_idx, f"[SAVE] ERROR reading text {chapter_num}: {exc}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, f"text_read_error: {exc}")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
+
|
|
|
+ if not text.strip():
|
|
|
+ log_msg(book_idx, f"[SAVE] SKIP {chapter_num} (text empty)")
|
|
|
|
|
|
- log_msg(book_id, f"[SAVE] Saved chapter {chapter} → {path}")
|
|
|
+ save_skip_reason(book_idx, chapter_num, "text_empty")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
- inc_completed(book_id)
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Build JSON chapter representation
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ chapter_obj = {
|
|
|
+ "chapter_num": chapter_num,
|
|
|
+ "text": text,
|
|
|
+ }
|
|
|
+
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ # Write JSON output
|
|
|
+ # -----------------------------------------------------------
|
|
|
+ try:
|
|
|
+ os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
|
|
|
|
|
- # Determine volume name
|
|
|
- volume_name = os.path.basename(base_path.rstrip("/"))
|
|
|
+ with open(json_path, "w", encoding="utf-8") as f:
|
|
|
+ json.dump(chapter_obj, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
- # Queue audio task (always use the saved file path)
|
|
|
- try:
|
|
|
- generate_audio.delay(
|
|
|
- book_id,
|
|
|
- volume_name,
|
|
|
- chapter,
|
|
|
- f"Chapter {chapter}",
|
|
|
- path,
|
|
|
- )
|
|
|
- log_msg(
|
|
|
- book_id, f"[AUDIO] Task queued for chapter {chapter} in {volume_name}"
|
|
|
- )
|
|
|
- except Exception as audio_exc:
|
|
|
- log_msg(book_id, f"[AUDIO] ERROR queueing chapter {chapter}: {audio_exc}")
|
|
|
+ log_msg(book_idx, f"[SAVE] OK {chapter_num}: {json_path}")
|
|
|
|
|
|
- return {"book_id": book_id, "chapter": chapter, "path": path}
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
|
|
|
|
|
|
except Exception as exc:
|
|
|
- log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter}: {exc}")
|
|
|
+ log_msg(book_idx, f"[SAVE] ERROR writing JSON {chapter_num}: {exc}")
|
|
|
+
|
|
|
+ save_skip_reason(book_idx, chapter_num, f"json_write_error: {exc}")
|
|
|
+ inc_chapter_done(book_idx)
|
|
|
+ return None
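The chapter JSON written above stays deliberately small, so any later stage only has to read it back; a usage sketch with a placeholder path:

# Reading back a chapter JSON written by save_chapter (placeholder path).
import json

with open("/data/output/book-0001/volume_01/json/12.json", encoding="utf-8") as f:
    chapter = json.load(f)

print(chapter["chapter_num"], len(chapter["text"]))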
|
|
|
diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py
|
|
|
index 0694089..70198d0 100644
|
|
|
--- a/bookscraper/scraper/tasks/scraping.py
|
|
|
+++ b/bookscraper/scraper/tasks/scraping.py
|
|
|
@@ -12,11 +12,11 @@ import redis
|
|
|
from scraper.sites import BookSite
|
|
|
from scraper.book_scraper import BookScraper
|
|
|
from scraper.abort import clear_abort # no circular deps
|
|
|
-from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
|
|
|
+from scraper.ui_log import reset_ui_logs # NEW
|
|
|
|
|
|
print(">>> [IMPORT] scraping.py loaded")
|
|
|
|
|
|
-# Redis connection (same as Celery broker)
|
|
|
+# Redis = same URL as Celery broker
|
|
|
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
|
|
|
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
|
|
|
|
|
|
@@ -26,24 +26,24 @@ def start_scrape_book(self, url: str):
|
|
|
"""Scrapes metadata + chapters and prepares download tracking."""
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
- # NEW: clear UI log buffer at start of new run
|
|
|
+ # Clear UI logs for a fresh run
|
|
|
# ------------------------------------------------------------
|
|
|
reset_ui_logs()
|
|
|
|
|
|
log(f"[SCRAPING] Start scraping for: {url}")
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
- # Book scrape
|
|
|
+ # Scrape metadata + chapters
|
|
|
# ------------------------------------------------------------
|
|
|
site = BookSite()
|
|
|
scraper = BookScraper(site, url)
|
|
|
- result = scraper.execute() # returns dict with metadata + chapters
|
|
|
+ result = scraper.execute() # dict with metadata + chapters list
|
|
|
|
|
|
chapters = result.get("chapters", [])
|
|
|
full_count = len(chapters)
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
- # DRY RUN
|
|
|
+ # DRY RUN (limit number of chapters)
|
|
|
# ------------------------------------------------------------
|
|
|
DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
|
|
|
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
|
|
|
@@ -51,34 +51,45 @@ def start_scrape_book(self, url: str):
|
|
|
if DRY_RUN:
|
|
|
log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
|
|
|
chapters = chapters[:TEST_LIMIT]
|
|
|
- result["chapters"] = chapters
|
|
|
+
|
|
|
+ # ------------------------------------------------------------
|
|
|
+ # NORMALISE OUTPUT FORMAT
|
|
|
+ # - chapters = INT
|
|
|
+ # - chapter_list = LIST
|
|
|
+ # ------------------------------------------------------------
|
|
|
+ result["chapter_list"] = chapters
|
|
|
+ result["chapters"] = len(chapters)
|
|
|
|
|
|
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
- # BOOK RUN ID (using title as ID)
|
|
|
+ # Ensure book_id exists
|
|
|
# ------------------------------------------------------------
|
|
|
- title = result.get("title") or "UnknownBook"
|
|
|
- book_id = title # user requirement
|
|
|
+ book_idx = result.get("book_idx")
|
|
|
+ if not book_idx:
|
|
|
+ raise ValueError("BookScraper did not return book_idx")
|
|
|
|
|
|
+ book_id = book_idx
|
|
|
result["book_id"] = book_id
|
|
|
|
|
|
- log(f"[SCRAPING] Assigned book_id = '{book_id}'")
|
|
|
+ log(f"[SCRAPING] Assigned book_id = {book_id}")
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
# RESET ABORT + INITIALISE PROGRESS
|
|
|
# ------------------------------------------------------------
|
|
|
clear_abort(book_id)
|
|
|
|
|
|
- r.set(f"progress:{book_id}:total", len(chapters))
|
|
|
+ r.set(f"progress:{book_id}:total", result["chapters"])
|
|
|
r.set(f"progress:{book_id}:done", 0)
|
|
|
- r.delete(f"logs:{book_id}") # clear old logs if any
|
|
|
+
|
|
|
+ # clear legacy logs
|
|
|
+ r.delete(f"logs:{book_id}")
|
|
|
|
|
|
r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
|
|
|
- r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")
|
|
|
+ r.rpush(f"logs:{book_id}", f":: Found {result['chapters']} chapters")
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
- # DISPATCH DOWNLOAD CONTROLLER
|
|
|
+ # DISPATCH CONTROLLER (book_idx + primitive metadata)
|
|
|
# ------------------------------------------------------------
|
|
|
celery_app.send_task(
|
|
|
"scraper.tasks.controller_tasks.launch_downloads",
|
|
|
@@ -86,11 +97,11 @@ def start_scrape_book(self, url: str):
|
|
|
queue="controller",
|
|
|
)
|
|
|
|
|
|
- log(f"[SCRAPING] Dispatched download controller for '{book_id}'")
|
|
|
+ log(f"[SCRAPING] Dispatched download controller for {book_id}")
|
|
|
|
|
|
return {
|
|
|
"book_id": book_id,
|
|
|
"title": result.get("title"),
|
|
|
"author": result.get("author"),
|
|
|
- "chapters": len(chapters),
|
|
|
+ "chapters": result["chapters"], # integer
|
|
|
}
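After the normalisation step above, result["chapters"] is an integer count while result["chapter_list"] holds the entries themselves. A placeholder example of that shape; the per-chapter fields are an assumption, since the scraper's real entry format is not shown here.

# Placeholder dict shaped like start_scrape_book's normalised result.
result = {
    "title": "Example Book",
    "author": "Example Author",
    "chapters": 3,                               # integer count
    "chapter_list": [                            # the actual entries
        {"num": 1, "url": "https://example.com/chapter-1.html"},
        {"num": 2, "url": "https://example.com/chapter-2.html"},
        {"num": 3, "url": "https://example.com/chapter-3.html"},
    ],
}

assert result["chapters"] == len(result["chapter_list"])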
|
|
|
diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html
|
|
|
index a8a4b76..a751f12 100644
|
|
|
--- a/bookscraper/templates/index.html
|
|
|
+++ b/bookscraper/templates/index.html
|
|
|
@@ -1,34 +1,28 @@
|
|
|
-<!DOCTYPE html>
|
|
|
-<html lang="nl">
|
|
|
-<head>
|
|
|
- <meta charset="UTF-8">
|
|
|
- <title>BookScraper</title>
|
|
|
- <style>
|
|
|
- body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
|
|
|
- h1 { margin-bottom: 20px; }
|
|
|
- input[type="text"] {
|
|
|
- width: 100%; padding: 12px; font-size: 16px;
|
|
|
- border: 1px solid #ccc; border-radius: 6px;
|
|
|
- }
|
|
|
- button {
|
|
|
- margin-top: 20px;
|
|
|
- padding: 12px 20px;
|
|
|
- background: #007bff; color: white;
|
|
|
- border: none; border-radius: 6px;
|
|
|
- font-size: 16px; cursor: pointer;
|
|
|
- }
|
|
|
- button:hover { background: #0056b3; }
|
|
|
- </style>
|
|
|
-</head>
|
|
|
-<body>
|
|
|
+{% extends "base.html" %} {% block content %}
|
|
|
|
|
|
-<h1>BookScraper WebGUI</h1>
|
|
|
+<h1>BookScraper</h1>
|
|
|
+
|
|
|
+<div class="card">
|
|
|
+ <h3>Start new scrape</h3>
|
|
|
+
|
|
|
+ <form action="/start" method="POST">
|
|
|
+ <label for="url">Book URL:</label>
|
|
|
+ <input
|
|
|
+ type="text"
|
|
|
+ id="url"
|
|
|
+ name="url"
|
|
|
+ placeholder="https://www.example.com/book/123"
|
|
|
+ required
|
|
|
+ />
|
|
|
|
|
|
-<form action="/start" method="POST">
|
|
|
- <label for="url">Geef een boek-URL op:</label><br><br>
|
|
|
- <input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
|
|
|
<button type="submit">Start Scraping</button>
|
|
|
-</form>
|
|
|
+ </form>
|
|
|
+</div>
|
|
|
+
|
|
|
+<div class="card">
|
|
|
+ <h3>Library</h3>
|
|
|
+ <p>View all previously scraped books.</p>
|
|
|
+ <a href="/library">Open Library</a>
|
|
|
+</div>
|
|
|
|
|
|
-</body>
|
|
|
-</html>
|
|
|
+{% endblock %}
|
|
|
diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html
|
|
|
index 57aabf9..379cf6b 100644
|
|
|
--- a/bookscraper/templates/result.html
|
|
|
+++ b/bookscraper/templates/result.html
|
|
|
@@ -1,239 +1,33 @@
|
|
|
-<!DOCTYPE html>
|
|
|
-<html lang="nl">
|
|
|
- <head>
|
|
|
- <meta charset="UTF-8" />
|
|
|
- <title>BookScraper – Resultaat</title>
|
|
|
+{% extends "base.html" %} {% block content %}
|
|
|
|
|
|
- <style>
|
|
|
- body {
|
|
|
- font-family: Arial, sans-serif;
|
|
|
- padding: 30px;
|
|
|
- max-width: 900px;
|
|
|
- margin: auto;
|
|
|
- }
|
|
|
- h1 {
|
|
|
- margin-bottom: 10px;
|
|
|
- }
|
|
|
- .box {
|
|
|
- padding: 15px;
|
|
|
- border: 1px solid #ddd;
|
|
|
- background: #f8f8f8;
|
|
|
- border-radius: 6px;
|
|
|
- margin-bottom: 20px;
|
|
|
- }
|
|
|
- .logbox {
|
|
|
- background: #000;
|
|
|
- color: #0f0;
|
|
|
- padding: 12px;
|
|
|
- height: 70vh;
|
|
|
- overflow-y: auto;
|
|
|
- font-family: monospace;
|
|
|
- border-radius: 6px;
|
|
|
- font-size: 13px;
|
|
|
- }
|
|
|
+<h1>Scraping started</h1>
|
|
|
|
|
|
- /* NEW: Clear button */
|
|
|
- #clearLogBtn {
|
|
|
- margin-bottom: 10px;
|
|
|
- padding: 8px 16px;
|
|
|
- background: #777;
|
|
|
- color: white;
|
|
|
- border: none;
|
|
|
- border-radius: 6px;
|
|
|
- cursor: pointer;
|
|
|
- }
|
|
|
- #clearLogBtn:hover {
|
|
|
- background: #555;
|
|
|
- }
|
|
|
+<div class="card">
|
|
|
+ {% if error %}
|
|
|
+ <div class="alert alert-error"><strong>Fout:</strong> {{ error }}</div>
|
|
|
|
|
|
- #abortBtn {
|
|
|
- padding: 12px 20px;
|
|
|
- background: #d9534f;
|
|
|
- color: white;
|
|
|
- border: none;
|
|
|
- border-radius: 6px;
|
|
|
- cursor: pointer;
|
|
|
- margin-top: 10px;
|
|
|
- }
|
|
|
- #abortBtn:hover {
|
|
|
- background: #c9302c;
|
|
|
- }
|
|
|
- #statusLine {
|
|
|
- font-size: 18px;
|
|
|
- font-weight: bold;
|
|
|
- }
|
|
|
- .hidden {
|
|
|
- display: none;
|
|
|
- }
|
|
|
- </style>
|
|
|
- </head>
|
|
|
+ <div class="actions">
|
|
|
+ <a href="/" class="button">Terug</a>
|
|
|
+ </div>
|
|
|
|
|
|
- <body>
|
|
|
- <a href="/">← Terug</a>
|
|
|
- <h1>Scrape Resultaat--</h1>
|
|
|
+ {% else %} {% if message %}
|
|
|
+ <p>{{ message }}</p>
|
|
|
+ {% endif %} {% if scraping_task_id %}
|
|
|
+ <p><strong>Task ID:</strong> {{ scraping_task_id }}</p>
|
|
|
+ {% endif %}
|
|
|
|
|
|
- {% if error %}
|
|
|
- <div
|
|
|
- class="box"
|
|
|
- style="background: #ffdddd; border-left: 5px solid #ff4444"
|
|
|
- >
|
|
|
- <strong>Fout:</strong> {{ error }}
|
|
|
- </div>
|
|
|
- {% endif %} {% if message %}
|
|
|
- <div class="box">{{ message }}</div>
|
|
|
- {% endif %}
|
|
|
+ <p>
|
|
|
+ Your scrape has started.<br />
|
|
|
+ As soon as the first result is available, the book will automatically appear
|
|
|
+ in the <strong>Library</strong>.
|
|
|
+ </p>
|
|
|
|
|
|
- <!-- COVER -->
|
|
|
- {% if book_title %}
|
|
|
- <div class="box">
|
|
|
- <strong>Cover:</strong><br />
|
|
|
- <img
|
|
|
- src="/output/{{ book_title }}/cover.jpg"
|
|
|
- alt="Cover"
|
|
|
- style="
|
|
|
- margin-top: 10px;
|
|
|
- max-width: 250px;
|
|
|
- border: 1px solid #ccc;
|
|
|
- border-radius: 4px;
|
|
|
- "
|
|
|
- onerror="this.style.display='none'"
|
|
|
- />
|
|
|
- </div>
|
|
|
- {% endif %}
|
|
|
+ <div class="actions">
|
|
|
+ <a href="/library" class="button">Ga naar Library</a>
|
|
|
+ <a href="/" class="button">Nieuwe scrape</a>
|
|
|
+ </div>
|
|
|
|
|
|
- <div id="statusBox" class="box hidden">
|
|
|
- <div id="statusLine">Status: bezig…</div>
|
|
|
- <div id="progressText"></div>
|
|
|
- <button id="abortBtn" class="hidden">ABORT</button>
|
|
|
- </div>
|
|
|
+ {% endif %}
|
|
|
+</div>
|
|
|
|
|
|
- <!-- FAILED LIST -->
|
|
|
- <div
|
|
|
- id="failedBox"
|
|
|
- class="box hidden"
|
|
|
- style="background: #ffefef; border-left: 5px solid #cc0000"
|
|
|
- >
|
|
|
- <strong>Failed chapters:</strong>
|
|
|
- <ul id="failedList" style="margin-top: 10px"></ul>
|
|
|
- </div>
|
|
|
-
|
|
|
- <div class="box">
|
|
|
- <strong>Live log:</strong><br />
|
|
|
-
|
|
|
- <!-- NEW BUTTON -->
|
|
|
- <button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
|
|
|
-
|
|
|
- <div id="logbox" class="logbox"></div>
|
|
|
- </div>
|
|
|
-
|
|
|
- <script>
|
|
|
- const scrapingTaskId = "{{ scraping_task_id or '' }}";
|
|
|
-
|
|
|
- let bookId = null;
|
|
|
- let polling = true;
|
|
|
-
|
|
|
- if (scrapingTaskId) pollForBookId();
|
|
|
-
|
|
|
- function pollForBookId() {
|
|
|
- fetch(`/celery-result/${scrapingTaskId}`)
|
|
|
- .then((r) => r.json())
|
|
|
- .then((data) => {
|
|
|
- if (data.ready && data.result && data.result.book_id) {
|
|
|
- bookId = data.result.book_id;
|
|
|
- startLiveUI();
|
|
|
- } else setTimeout(pollForBookId, 800);
|
|
|
- })
|
|
|
- .catch(() => setTimeout(pollForBookId, 1200));
|
|
|
- }
|
|
|
-
|
|
|
- function startLiveUI() {
|
|
|
- document.getElementById("statusBox").classList.remove("hidden");
|
|
|
- document.getElementById("abortBtn").classList.remove("hidden");
|
|
|
-
|
|
|
- document.getElementById("abortBtn").onclick = () => {
|
|
|
- fetch(`/abort/${bookId}`, { method: "POST" });
|
|
|
- };
|
|
|
-
|
|
|
- pollProgress();
|
|
|
- pollLogs();
|
|
|
- }
|
|
|
-
|
|
|
- function pollProgress() {
|
|
|
- if (!bookId) return;
|
|
|
-
|
|
|
- fetch(`/progress/${bookId}`)
|
|
|
- .then((r) => r.json())
|
|
|
- .then((p) => {
|
|
|
- const done = p.completed || 0;
|
|
|
- const total = p.total || 0;
|
|
|
-
|
|
|
- document.getElementById(
|
|
|
- "progressText"
|
|
|
- ).innerText = `Completed: ${done} / ${total} | Skipped: ${
|
|
|
- p.skipped || 0
|
|
|
- } | Failed: ${p.failed || 0}`;
|
|
|
-
|
|
|
- const failedBox = document.getElementById("failedBox");
|
|
|
- const failedList = document.getElementById("failedList");
|
|
|
-
|
|
|
- if (p.failed_list && p.failed_list.length > 0) {
|
|
|
- failedBox.classList.remove("hidden");
|
|
|
- failedList.innerHTML = "";
|
|
|
- p.failed_list.forEach((entry) => {
|
|
|
- const li = document.createElement("li");
|
|
|
- li.textContent = entry;
|
|
|
- failedList.appendChild(li);
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- if (p.abort) {
|
|
|
- document.getElementById("statusLine").innerText = "ABORTED";
|
|
|
- polling = false;
|
|
|
- } else if (done >= total && total > 0) {
|
|
|
- document.getElementById("statusLine").innerText = "KLAAR ✔";
|
|
|
- polling = false;
|
|
|
- } else {
|
|
|
- document.getElementById("statusLine").innerText = "Bezig…";
|
|
|
- }
|
|
|
-
|
|
|
- if (polling) setTimeout(pollProgress, 1000);
|
|
|
- })
|
|
|
- .catch(() => {
|
|
|
- if (polling) setTimeout(pollProgress, 1500);
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- function pollLogs() {
|
|
|
- if (!polling) return;
|
|
|
-
|
|
|
- fetch(`/logs`)
|
|
|
- .then((r) => r.json())
|
|
|
- .then((data) => {
|
|
|
- const logbox = document.getElementById("logbox");
|
|
|
- logbox.innerHTML = "";
|
|
|
-
|
|
|
- data.logs.forEach((line) => {
|
|
|
- const div = document.createElement("div");
|
|
|
- div.textContent = line;
|
|
|
- logbox.appendChild(div);
|
|
|
- });
|
|
|
-
|
|
|
- logbox.scrollTop = logbox.scrollHeight;
|
|
|
- setTimeout(pollLogs, 1000);
|
|
|
- })
|
|
|
- .catch(() => setTimeout(pollLogs, 1500));
|
|
|
- }
|
|
|
-
|
|
|
- // =========================================================
|
|
|
- // NEW: Clear logs button handler
|
|
|
- // =========================================================
|
|
|
- function clearLogs() {
|
|
|
- fetch("/clear-logs", { method: "POST" })
|
|
|
- .then(() => {
|
|
|
- document.getElementById("logbox").innerHTML = "";
|
|
|
- })
|
|
|
- .catch((e) => console.error("Clear logs failed:", e));
|
|
|
- }
|
|
|
- </script>
|
|
|
- </body>
|
|
|
-</html>
|
|
|
+{% endblock %}
|