Cover download + controller async fix + robust cover parsing

feat/cover-download
peter.fong 2 weeks ago
parent ed341c727a
commit 9a774c4955

@ -21,9 +21,25 @@ from scraper.ui_log import get_ui_logs
from celery.result import AsyncResult
# ⬇⬇⬇ ADDED for cover serving
from flask import send_from_directory
import os
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING FOR OUTPUT ← ADDED
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@app.route("/output/<path:filename>")
def serve_output(filename):
"""Serve output files such as cover.jpg and volumes."""
return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
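A quick way to sanity-check the new route is Flask's test client; the book folder name below is hypothetical and only exists once a scrape has produced it:
# Illustrative check of the /output/<path:filename> route (folder name is made up).
with app.test_client() as client:
    resp = client.get("/output/MyBook/cover.jpg")
    print(resp.status_code)  # 200 if output/MyBook/cover.jpg exists, 404 otherwise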
# =====================================================
# HOME PAGE
# =====================================================
@ -54,6 +70,8 @@ def start_scraping():
"result.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
# for result.html cover rendering
book_title=None,
)
@ -103,8 +121,6 @@ def celery_result(task_id):
# RUN FLASK
# =====================================================
if __name__ == "__main__":
import os
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "5000"))

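With these environment variables, a local run might look like FLASK_DEBUG=1 HOST=127.0.0.1 PORT=8080 python app.py (the entrypoint filename is an assumption; it is not shown in this diff).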
@ -3,6 +3,7 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
@ -53,7 +54,7 @@ class BookScraper:
"title": self.book_title,
"author": self.book_author,
"description": self.book_description,
"cover_url": self.cover_url,
"cover_url": self.cover_url, # ← used by DownloadController
"book_url": self.url,
"chapters": [
{"num": ch.number, "title": ch.title, "url": ch.url}
@ -106,12 +107,62 @@ class BookScraper:
# ------------------------------------------------------------
def _parse_cover(self, soup):
img = soup.find("img", src=lambda v: v and "files/article/image" in v)
if not img:
log_debug("[BookScraper] No cover found")
"""
Extract correct cover based on book_id path logic.
1. primary: match "/files/article/image/{vol}/{book_id}/"
2. fallback: endswith "/{book_id}s.jpg"
"""
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", self.url)
if not m:
log_debug("[BookScraper] No book_id found in URL → cannot match cover")
return
self.cover_url = urljoin(self.site.root, img.get("src"))
book_id = m.group(1)
# Extract vol folder from URL (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", self.url)
volume = m2.group(1) if m2 else None
log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
imgs = soup.find_all("img", src=True)
chosen = None
# --------------------------------------------------------
# PRIORITY 1: Path-match
# /files/article/image/{vol}/{book_id}/
# --------------------------------------------------------
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
log_debug(f"[BookScraper] Cover matched by PATH: {src}")
break
# --------------------------------------------------------
# PRIORITY 2: endswith "/{book_id}s.jpg"
# --------------------------------------------------------
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
break
# --------------------------------------------------------
# No match
# --------------------------------------------------------
if not chosen:
log_debug("[BookScraper] No matching cover found")
return
self.cover_url = urljoin(self.site.root, chosen)
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------

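To make the cover matching concrete, here is how the two regexes and both priorities resolve against a piaotia-style URL (the URL and image path are invented for illustration):

import re
from urllib.parse import urljoin

url = "https://www.piaotia.com/bookinfo/14/14767.html"      # hypothetical book URL
book_id = re.search(r"/(\d+)\.html$", url).group(1)          # "14767"
volume = re.search(r"/bookinfo/(\d+)/", url).group(1)        # "14"
target_path = f"/files/article/image/{volume}/{book_id}/"    # PRIORITY 1: substring match
target_suffix = f"/{book_id}s.jpg"                           # PRIORITY 2: suffix match
src = "/files/article/image/14/14767/14767s.jpg"             # sample <img src> value
assert target_path in src and src.endswith(target_suffix)
print(urljoin("https://www.piaotia.com/", src))
# -> https://www.piaotia.com/files/article/image/14/14767/14767s.jpg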
@ -3,12 +3,15 @@
# Purpose:
# Build Celery pipelines for all chapters
# and pass book_id for abort/progress/log functionality.
# + Download the cover image and replicate it to all volume folders
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from logbus.publisher import log
import os
import requests
import shutil
class DownloadController:
@ -18,6 +21,7 @@ class DownloadController:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
- cover download + volume replication
"""
def __init__(self, book_id: str, scrape_result: dict):
@ -27,6 +31,7 @@ class DownloadController:
# Core metadata
self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url")
# Output base dir
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@ -46,6 +51,62 @@ class DownloadController:
"book_url": scrape_result.get("book_url"),
}
# ---------------------------------------------------------
# Cover Download
# ---------------------------------------------------------
def download_cover(self):
"""Download one cover image into the root of the book folder."""
if not self.cover_url:
log(f"[CTRL] No cover URL found for '{self.title}'")
return
cover_path = os.path.join(self.book_base, "cover.jpg")
# HEADERS that bypass 403 hotlink protection
headers = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
),
"Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
}
try:
log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status()
with open(cover_path, "wb") as f:
f.write(resp.content)
log(f"[CTRL] Cover saved to: {cover_path}")
except Exception as e:
log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
# ---------------------------------------------------------
# Cover Replication to Volumes
# ---------------------------------------------------------
def replicate_cover_to_volumes(self):
"""Copy cover.jpg into each existing Volume_xxx directory."""
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
log("[CTRL] No cover.jpg found, replication skipped")
return
try:
for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"):
vol_dir = os.path.join(self.book_base, entry)
dst = os.path.join(vol_dir, "cover.jpg")
shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated into: {dst}")
except Exception as e:
log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
# Volume isolation
# ---------------------------------------------------------
@ -69,6 +130,11 @@ class DownloadController:
)
log(f"[CTRL] Output root: {self.book_base}")
# -------------------------------------
# 1) Download cover before any pipelines
# -------------------------------------
self.download_cover()
tasks = []
for ch in self.chapters:
@ -94,4 +160,9 @@ class DownloadController:
f"(book_id={self.book_id}, group_id={async_result.id})"
)
# -------------------------------------------------------
# 2) AFTER dispatch: cover replication to volume folders
# -------------------------------------------------------
self.replicate_cover_to_volumes()
return async_result

@ -3,7 +3,7 @@
# Purpose:
# Start the download → parse → save pipeline for a scraped book,
# including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS.
# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
# ============================================================
from celery_app import celery_app
@ -12,9 +12,6 @@ from logbus.publisher import log
from scraper.download_controller import DownloadController
from scraper.progress import (
set_total,
inc_completed,
inc_skipped,
inc_failed,
)
from scraper.abort import abort_requested
@ -25,11 +22,10 @@ print(">>> [IMPORT] controller_tasks.py loaded")
def launch_downloads(self, book_id: str, scrape_result: dict):
"""
Launch the entire pipeline (download → parse → save),
AND maintain progress counters.
AND initialize progress counters.
EXPECTS:
book_id: ID generated in scraping.start_scrape_book
scrape_result: dict with title, author, url, chapters[]
Chapter-level progress is updated INSIDE the download/parse/save tasks.
This task MUST NOT call .get() on async subtasks (Celery restriction).
"""
title = scrape_result.get("title", "UnknownBook")
@ -50,46 +46,36 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
ctl = DownloadController(book_id, scrape_result)
# ------------------------------------------------------------
# RUN PIPELINE IN SYNC LOOP
# (DownloadController.start() returns per-chapter generator)
# START PIPELINES (ASYNC)
# Returns a celery group AsyncResult. We DO NOT iterate or get().
# Progress & failures are handled by the worker subtasks.
# ------------------------------------------------------------
try:
for result in ctl.start(): # new generator mode
ch = result.get("chapter")
if result.get("skipped"):
inc_skipped(book_id)
inc_completed(book_id)
log(f"[CTRL] SKIPPED chapter {ch}")
continue
if result.get("failed"):
inc_failed(book_id)
inc_completed(book_id)
log(f"[CTRL] FAILED chapter {ch}")
continue
# Normal success
inc_completed(book_id)
log(f"[CTRL] DONE chapter {ch}")
# Abort requested mid-run?
if abort_requested(book_id):
log(f"[CTRL] ABORT after chapter {ch}")
break
group_result = ctl.start()
log(
f"[CTRL] Pipelines dispatched for '{title}' "
f"(book_id={book_id}, group_id={group_result.id})"
)
# Abort flag set BEFORE tasks start?
if abort_requested(book_id):
log(f"[CTRL] ABORT requested before tasks start")
return {"book_id": book_id, "aborted": True}
except Exception as exc:
log(f"[CTRL] ERROR while processing pipeline: {exc}")
inc_failed(book_id)
log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
raise
# ------------------------------------------------------------
# FINISHED
# CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
# (Download/parse/save tasks update progress themselves)
# ------------------------------------------------------------
log(f"[CTRL] Pipeline finished for book_id={book_id}")
log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
return {
"book_id": book_id,
"total": total,
"completed": int(total), # For safety
"started": True,
"group_id": group_result.id,
}

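For background on why the synchronous loop had to go: Celery raises an error when a task calls .get() on its own subtasks, so the controller can only dispatch and return. A generic sketch, with names that are not from this repo:

from celery import Celery, group

celery_app = Celery("sketch", broker="memory://", backend="cache+memory://")

@celery_app.task
def child(i):
    return i

@celery_app.task(bind=True)
def parent(self):
    res = group(child.s(i) for i in range(3)).apply_async()
    # Calling res.get() here raises RuntimeError("Never call result.get() within a task!")
    # unless disable_sync_subtasks=False is passed, which risks deadlocking the worker pool.
    return {"group_id": res.id}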
@ -68,6 +68,24 @@
<div class="box">{{ message }}</div>
{% endif %}
<!-- COVER DISPLAY (added) -->
{% if book_title %}
<div class="box">
<strong>Cover:</strong><br />
<img
src="/output/{{ book_title }}/cover.jpg"
alt="Cover"
style="
margin-top: 10px;
max-width: 250px;
border: 1px solid #ccc;
border-radius: 4px;
"
onerror="this.style.display='none'"
/>
</div>
{% endif %}
<div id="statusBox" class="box hidden">
<div id="statusLine">Status: bezig…</div>
<div id="progressText"></div>
@ -122,7 +140,6 @@
}
function pollProgress() {
// FIX → UI keeps rendering; only the polling repeat stops
if (!bookId) return;
fetch(`/progress/${bookId}`)
@ -137,7 +154,6 @@
p.skipped || 0
} | Failed: ${p.failed || 0}`;
// FAILED LIST
const failedBox = document.getElementById("failedBox");
const failedList = document.getElementById("failedList");
@ -151,7 +167,6 @@
});
}
// STATUS
if (p.abort) {
document.getElementById("statusLine").innerText = "ABORTED";
polling = false;
@ -162,7 +177,6 @@
document.getElementById("statusLine").innerText = "Bezig…";
}
// STOP repeated polling, but keep rendering
if (polling) setTimeout(pollProgress, 1000);
})
.catch(() => {

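For reference, the polling script assumes /progress/<book_id> returns JSON with at least the fields below (values are invented; the real schema lives in scraper.progress and the Flask progress route, neither of which appears in this diff):

{
  "total": 120,
  "completed": 87,
  "skipped": 2,
  "failed": 1,
  "abort": false
}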