Cover download + controller async fix + robust cover parsing

feat/cover-download
peter.fong 2 weeks ago
parent ed341c727a
commit 9a774c4955

@@ -21,9 +21,25 @@ from scraper.ui_log import get_ui_logs
 from celery.result import AsyncResult
+# ⬇⬇⬇ ADDED for cover serving
+from flask import send_from_directory
+import os
 
 app = Flask(__name__)
 
+# =====================================================
+# STATIC FILE SERVING FOR OUTPUT  ← ADDED
+# =====================================================
+OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
+
+@app.route("/output/<path:filename>")
+def serve_output(filename):
+    """Serve output files such as cover.jpg and volumes."""
+    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
+
 # =====================================================
 # HOME PAGE
 # =====================================================
@@ -54,6 +70,8 @@ def start_scraping():
         "result.html",
         message="Scraping gestart.",
         scraping_task_id=async_result.id,
+        # for result.html cover rendering
+        book_title=None,
     )
@@ -103,8 +121,6 @@ def celery_result(task_id):
 # RUN FLASK
 # =====================================================
 if __name__ == "__main__":
-    import os
     debug = os.getenv("FLASK_DEBUG", "0") == "1"
     host = os.getenv("HOST", "0.0.0.0")
     port = int(os.getenv("PORT", "5000"))

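Side note on the new route: send_from_directory resolves filename inside OUTPUT_ROOT and rejects path traversal on its own. A minimal smoke test, assuming the app runs locally on port 5000 and that an output/MyBook/cover.jpg exists (both the port and the book folder name are illustrative):

import requests

# Hypothetical book folder; substitute a real title under output/.
url = "http://localhost:5000/output/MyBook/cover.jpg"

resp = requests.get(url)
print(resp.status_code)                   # 200 when the file exists
print(resp.headers.get("Content-Type"))   # typically image/jpeg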
@@ -3,6 +3,7 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+import re
 
 from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
@@ -53,7 +54,7 @@ class BookScraper:
             "title": self.book_title,
             "author": self.book_author,
             "description": self.book_description,
-            "cover_url": self.cover_url,
+            "cover_url": self.cover_url,  # ← used by DownloadController
             "book_url": self.url,
             "chapters": [
                 {"num": ch.number, "title": ch.title, "url": ch.url}
@@ -106,12 +107,62 @@ class BookScraper:
     # ------------------------------------------------------------
     def _parse_cover(self, soup):
-        img = soup.find("img", src=lambda v: v and "files/article/image" in v)
-        if not img:
-            log_debug("[BookScraper] No cover found")
+        """
+        Extract correct cover based on book_id path logic.
+        1. primary:  match "/files/article/image/{vol}/{book_id}/"
+        2. fallback: endswith "/{book_id}s.jpg"
+        """
+        # Extract book_id from URL
+        m = re.search(r"/(\d+)\.html$", self.url)
+        if not m:
+            log_debug("[BookScraper] No book_id found in URL → cannot match cover")
             return
-        self.cover_url = urljoin(self.site.root, img.get("src"))
+        book_id = m.group(1)
+
+        # Extract vol folder from URL (bookinfo/<vol>/<id>.html)
+        m2 = re.search(r"/bookinfo/(\d+)/", self.url)
+        volume = m2.group(1) if m2 else None
+        log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
+
+        imgs = soup.find_all("img", src=True)
+        chosen = None
+
+        # --------------------------------------------------------
+        # PRIORITY 1: Path-match
+        #   /files/article/image/{vol}/{book_id}/
+        # --------------------------------------------------------
+        if volume:
+            target_path = f"/files/article/image/{volume}/{book_id}/"
+            for img in imgs:
+                src = img["src"]
+                if target_path in src:
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by PATH: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # PRIORITY 2: endswith "/{book_id}s.jpg"
+        # --------------------------------------------------------
+        if not chosen:
+            target_suffix = f"/{book_id}s.jpg"
+            for img in imgs:
+                src = img["src"]
+                if src.endswith(target_suffix):
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # No match
+        # --------------------------------------------------------
+        if not chosen:
+            log_debug("[BookScraper] No matching cover found")
+            return
+
+        self.cover_url = urljoin(self.site.root, chosen)
         log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
     # ------------------------------------------------------------

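For context, the new _parse_cover logic hinges on pulling book_id and the volume folder out of the book URL before scanning img tags. A standalone sketch of that extraction, with a made-up piaotia-style URL (the URL shape is inferred from the regexes above):

import re

url = "https://www.piaotia.com/bookinfo/12/12345.html"  # illustrative

book_id = re.search(r"/(\d+)\.html$", url).group(1)   # -> "12345"
m = re.search(r"/bookinfo/(\d+)/", url)
volume = m.group(1) if m else None                    # -> "12"

# PRIORITY 1 path fragment, then the PRIORITY 2 suffix fallback:
print(f"/files/article/image/{volume}/{book_id}/")
print(f"/{book_id}s.jpg")

The {book_id}s.jpg suffix appears to be the site's thumbnail naming convention, which is why it only serves as the fallback when no volume-scoped path matches.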
@@ -3,12 +3,15 @@
 # Purpose:
 #   Build Celery pipelines for all chapters
 #   and pass book_id for abort/progress/log functionality.
+#   + Download and replicate cover image to all volume folders
 # =========================================================
 from celery import group
 from scraper.tasks.pipeline import build_chapter_pipeline
 from logbus.publisher import log
 import os
+import requests
+import shutil
 
 class DownloadController:
@@ -18,6 +21,7 @@ class DownloadController:
     - volume splitting
     - consistent meta propagation
     - book_id-based abort + progress tracking
+    - cover download + volume replication
     """
 
     def __init__(self, book_id: str, scrape_result: dict):
@@ -27,6 +31,7 @@ class DownloadController:
         # Core metadata
         self.title = scrape_result.get("title", "UnknownBook")
         self.chapters = scrape_result.get("chapters", []) or []
+        self.cover_url = scrape_result.get("cover_url")
 
         # Output base dir
         root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@@ -46,6 +51,62 @@ class DownloadController:
             "book_url": scrape_result.get("book_url"),
         }
 
+    # ---------------------------------------------------------
+    # Cover Download
+    # ---------------------------------------------------------
+    def download_cover(self):
+        """Download one cover image into the root of the book folder."""
+        if not self.cover_url:
+            log(f"[CTRL] No cover URL found for '{self.title}'")
+            return
+
+        cover_path = os.path.join(self.book_base, "cover.jpg")
+
+        # HEADERS that bypass 403 hotlink protection
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
+                "Gecko/20100101 Firefox/118.0"
+            ),
+            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
+        }
+
+        try:
+            log(f"[CTRL] Downloading cover: {self.cover_url}")
+            resp = requests.get(self.cover_url, timeout=10, headers=headers)
+            resp.raise_for_status()
+            with open(cover_path, "wb") as f:
+                f.write(resp.content)
+            log(f"[CTRL] Cover saved to: {cover_path}")
+        except Exception as e:
+            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
+
+    # ---------------------------------------------------------
+    # Cover Replication to Volumes
+    # ---------------------------------------------------------
+    def replicate_cover_to_volumes(self):
+        """Copy cover.jpg into each existing Volume_xxx directory."""
+        src = os.path.join(self.book_base, "cover.jpg")
+        if not os.path.exists(src):
+            log("[CTRL] No cover.jpg found, replication skipped")
+            return
+
+        try:
+            for entry in os.listdir(self.book_base):
+                if entry.lower().startswith("volume_"):
+                    vol_dir = os.path.join(self.book_base, entry)
+                    dst = os.path.join(vol_dir, "cover.jpg")
+                    shutil.copyfile(src, dst)
+                    log(f"[CTRL] Cover replicated into: {dst}")
+        except Exception as e:
+            log(f"[CTRL] Cover replication failed: {e}")
+
     # ---------------------------------------------------------
     # Volume isolation
     # ---------------------------------------------------------
@@ -69,6 +130,11 @@ class DownloadController:
         )
         log(f"[CTRL] Output root: {self.book_base}")
 
+        # -------------------------------------
+        # 1) Download cover before any pipelines
+        # -------------------------------------
+        self.download_cover()
+
         tasks = []
         for ch in self.chapters:
@@ -94,4 +160,9 @@ class DownloadController:
             f"(book_id={self.book_id}, group_id={async_result.id})"
         )
 
+        # -------------------------------------------------------
+        # 2) AFTER dispatch: cover replication to volume folders
+        # -------------------------------------------------------
+        self.replicate_cover_to_volumes()
+
         return async_result

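replicate_cover_to_volumes is a plain copy into every Volume_* directory that exists when it runs; since start() calls it right after dispatching the asynchronous chapter pipelines, volume folders created later by workers are not covered, which matches the docstring's "each existing Volume_xxx directory". An equivalent standalone sketch using pathlib (the output/MyBook path is illustrative; the controller's version also accepts a lowercase volume_ prefix):

import shutil
from pathlib import Path

book_base = Path("output/MyBook")   # illustrative book folder
src = book_base / "cover.jpg"

if src.exists():
    for vol_dir in book_base.glob("Volume_*"):
        if vol_dir.is_dir():
            shutil.copyfile(src, vol_dir / "cover.jpg")
            print(f"copied cover into {vol_dir}")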
@@ -3,7 +3,7 @@
 # Purpose:
 #   Start the download → parse → save pipeline for a scraped book,
 #   including progress/abort tracking via book_id.
-#   ONLY THE CONTROLLER UPDATES PROGRESS.
+#   ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
 # ============================================================
 
 from celery_app import celery_app
@@ -12,9 +12,6 @@ from logbus.publisher import log
 from scraper.download_controller import DownloadController
 from scraper.progress import (
     set_total,
-    inc_completed,
-    inc_skipped,
-    inc_failed,
 )
 from scraper.abort import abort_requested
@@ -25,11 +22,10 @@ print(">>> [IMPORT] controller_tasks.py loaded")
 def launch_downloads(self, book_id: str, scrape_result: dict):
     """
     Launch the entire pipeline (download → parse → save),
-    AND maintain progress counters.
+    AND initialize progress counters.
 
-    EXPECTS:
-        book_id: ID generated in scraping.start_scrape_book
-        scrape_result: dict with title, author, url, chapters[]
+    Chapter-level progress is updated INSIDE the download/parse/save tasks.
+    This task MUST NOT call .get() on async subtasks (Celery restriction).
     """
 
     title = scrape_result.get("title", "UnknownBook")
@@ -50,46 +46,36 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
     ctl = DownloadController(book_id, scrape_result)
 
     # ------------------------------------------------------------
-    # RUN PIPELINE IN SYNC LOOP
-    # (DownloadController.start() returns per-chapter generator)
+    # START PIPELINES (ASYNC)
+    # Returns a celery group AsyncResult. We DO NOT iterate or get().
+    # Progress & failures are handled by the worker subtasks.
     # ------------------------------------------------------------
     try:
-        for result in ctl.start():  # new generator mode
-            ch = result.get("chapter")
-
-            if result.get("skipped"):
-                inc_skipped(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] SKIPPED chapter {ch}")
-                continue
-
-            if result.get("failed"):
-                inc_failed(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] FAILED chapter {ch}")
-                continue
-
-            # Normal success
-            inc_completed(book_id)
-            log(f"[CTRL] DONE chapter {ch}")
-
-            # Abort requested mid-run?
-            if abort_requested(book_id):
-                log(f"[CTRL] ABORT after chapter {ch}")
-                break
+        group_result = ctl.start()
+        log(
+            f"[CTRL] Pipelines dispatched for '{title}' "
+            f"(book_id={book_id}, group_id={group_result.id})"
+        )
+
+        # Abort flag set BEFORE tasks start?
+        if abort_requested(book_id):
+            log(f"[CTRL] ABORT requested before tasks start")
+            return {"book_id": book_id, "aborted": True}
     except Exception as exc:
-        log(f"[CTRL] ERROR while processing pipeline: {exc}")
-        inc_failed(book_id)
+        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
         raise
 
     # ------------------------------------------------------------
-    # FINISHED
+    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
+    # (Download/parse/save tasks update progress themselves)
     # ------------------------------------------------------------
-    log(f"[CTRL] Pipeline finished for book_id={book_id}")
+    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
 
     return {
         "book_id": book_id,
         "total": total,
-        "completed": int(total),  # For safety
+        "started": True,
+        "group_id": group_result.id,
     }

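The controller rewrite follows from a Celery rule: calling .get() on a subtask result from inside a running task raises a RuntimeError ("Never call result.get() within a task!") unless synchronous subtasks are explicitly allowed. A minimal sketch of the dispatch-only pattern used here; the task names are illustrative, not the project's own:

from celery import group, shared_task

@shared_task
def process_chapter(num):
    # Worker-side task: download/parse/save would happen here,
    # and the worker bumps its own progress counters.
    return num

@shared_task
def launch(chapter_nums):
    # Fire-and-forget: apply_async returns a GroupResult immediately.
    result = group(process_chapter.s(n) for n in chapter_nums).apply_async()
    # Do NOT call result.get() here; just hand back the group id.
    return {"group_id": result.id}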
@@ -68,6 +68,24 @@
 <div class="box">{{ message }}</div>
 {% endif %}
 
+<!-- COVER DISPLAY (added) -->
+{% if book_title %}
+<div class="box">
+  <strong>Cover:</strong><br />
+  <img
+    src="/output/{{ book_title }}/cover.jpg"
+    alt="Cover"
+    style="
+      margin-top: 10px;
+      max-width: 250px;
+      border: 1px solid #ccc;
+      border-radius: 4px;
+    "
+    onerror="this.style.display='none'"
+  />
+</div>
+{% endif %}
+
 <div id="statusBox" class="box hidden">
   <div id="statusLine">Status: bezig…</div>
   <div id="progressText"></div>
@@ -122,7 +140,6 @@
       }
 
       function pollProgress() {
-        // FIX → UI keeps rendering, polling only stops repeating
         if (!bookId) return;
 
         fetch(`/progress/${bookId}`)
@@ -137,7 +154,6 @@
               p.skipped || 0
             } | Failed: ${p.failed || 0}`;
 
-            // FAILED LIST
             const failedBox = document.getElementById("failedBox");
             const failedList = document.getElementById("failedList");
@@ -151,7 +167,6 @@
               });
             }
 
-            // STATUS
             if (p.abort) {
               document.getElementById("statusLine").innerText = "ABORTED";
               polling = false;
@@ -162,7 +177,6 @@
               document.getElementById("statusLine").innerText = "Bezig…";
             }
 
-            // STOP repetitive polling, but keep rendering
             if (polling) setTimeout(pollProgress, 1000);
           })
           .catch(() => {

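One interaction worth noting: start_scraping above passes book_title=None, so the {% if book_title %} guard keeps the cover box hidden on the start page; it only renders once a view supplies a real title. A hypothetical sketch of such a view (route name and parameters are illustrative):

from flask import Flask, render_template

app = Flask(__name__)

@app.route("/book/<title>")
def show_book(title):
    # A truthy book_title makes result.html render the cover box,
    # pointing at /output/<title>/cover.jpg served by serve_output().
    return render_template("result.html", message=None, book_title=title)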