diff --git a/bookscraper/app.py b/bookscraper/app.py
index 72d5da9..9840714 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -21,9 +21,25 @@ from scraper.ui_log import get_ui_logs
 from celery.result import AsyncResult
 
+# ⬇⬇⬇ ADDED for cover serving
+from flask import send_from_directory
+import os
+
 app = Flask(__name__)
 
+# =====================================================
+# STATIC FILE SERVING FOR OUTPUT   ← ADDED
+# =====================================================
+OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
+
+
+@app.route("/output/<path:filename>")
+def serve_output(filename):
+    """Serve output files such as cover.jpg and volumes."""
+    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
+
+
 # =====================================================
 # HOME PAGE
 # =====================================================
@@ -54,6 +70,8 @@ def start_scraping():
         "result.html",
         message="Scraping gestart.",
         scraping_task_id=async_result.id,
+        # for result.html cover rendering
+        book_title=None,
     )
 
@@ -103,8 +121,6 @@ def celery_result(task_id):
 # RUN FLASK
 # =====================================================
 if __name__ == "__main__":
-    import os
-
     debug = os.getenv("FLASK_DEBUG", "0") == "1"
     host = os.getenv("HOST", "0.0.0.0")
     port = int(os.getenv("PORT", "5000"))
diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py
index 51a396d..922d0c7 100644
--- a/bookscraper/scraper/book_scraper.py
+++ b/bookscraper/scraper/book_scraper.py
@@ -3,6 +3,7 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+import re
 
 from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
@@ -53,7 +54,7 @@ class BookScraper:
             "title": self.book_title,
             "author": self.book_author,
             "description": self.book_description,
-            "cover_url": self.cover_url,
+            "cover_url": self.cover_url,  # ← used by DownloadController
             "book_url": self.url,
             "chapters": [
                 {"num": ch.number, "title": ch.title, "url": ch.url}
@@ -106,12 +107,62 @@ class BookScraper:
     # ------------------------------------------------------------
     def _parse_cover(self, soup):
-        img = soup.find("img", src=lambda v: v and "files/article/image" in v)
-        if not img:
-            log_debug("[BookScraper] No cover found")
+        """
+        Extract the correct cover based on book_id path logic.
+        1. primary:  match "/files/article/image/{vol}/{book_id}/"
+        2. fallback: endswith "/{book_id}s.jpg"
+        """
+        # Extract book_id from URL
+        m = re.search(r"/(\d+)\.html$", self.url)
+        if not m:
+            log_debug("[BookScraper] No book_id found in URL → cannot match cover")
             return
 
-        self.cover_url = urljoin(self.site.root, img.get("src"))
+        book_id = m.group(1)
+
+        # Extract the volume folder from the URL (bookinfo/<vol>/<book_id>.html)
+        m2 = re.search(r"/bookinfo/(\d+)/", self.url)
+        volume = m2.group(1) if m2 else None
+
+        log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
+
+        imgs = soup.find_all("img", src=True)
+
+        chosen = None
+
+        # --------------------------------------------------------
+        # PRIORITY 1: Path match
+        #   /files/article/image/{vol}/{book_id}/
+        # --------------------------------------------------------
+        if volume:
+            target_path = f"/files/article/image/{volume}/{book_id}/"
+            for img in imgs:
+                src = img["src"]
+                if target_path in src:
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by PATH: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # PRIORITY 2: endswith "/{book_id}s.jpg"
+        # --------------------------------------------------------
+        if not chosen:
+            target_suffix = f"/{book_id}s.jpg"
+            for img in imgs:
+                src = img["src"]
+                if src.endswith(target_suffix):
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # No match
+        # --------------------------------------------------------
+        if not chosen:
+            log_debug("[BookScraper] No matching cover found")
+            return
+
+        self.cover_url = urljoin(self.site.root, chosen)
         log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
 
     # ------------------------------------------------------------
diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py
index 1b74ffd..f93bb33 100644
--- a/bookscraper/scraper/download_controller.py
+++ b/bookscraper/scraper/download_controller.py
@@ -3,12 +3,15 @@
 # Purpose:
 #   Build Celery pipelines for all chapters
 #   and pass book_id for abort/progress/log functionality.
+#   + Download and replicate cover image to all volume folders
 # =========================================================
 
 from celery import group
 from scraper.tasks.pipeline import build_chapter_pipeline
 from logbus.publisher import log
 import os
+import requests
+import shutil
 
 
 class DownloadController:
@@ -18,6 +21,7 @@ class DownloadController:
     - volume splitting
     - consistent meta propagation
     - book_id-based abort + progress tracking
+    - cover download + volume replication
     """
 
     def __init__(self, book_id: str, scrape_result: dict):
@@ -27,6 +31,7 @@ class DownloadController:
         # Core metadata
         self.title = scrape_result.get("title", "UnknownBook")
         self.chapters = scrape_result.get("chapters", []) or []
+        self.cover_url = scrape_result.get("cover_url")
 
         # Output base dir
         root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@@ -46,6 +51,62 @@ class DownloadController:
             "book_url": scrape_result.get("book_url"),
         }
 
+    # ---------------------------------------------------------
+    # Cover Download
+    # ---------------------------------------------------------
+    def download_cover(self):
+        """Download one cover image into the root of the book folder."""
+        if not self.cover_url:
+            log(f"[CTRL] No cover URL found for '{self.title}'")
+            return
+
+        cover_path = os.path.join(self.book_base, "cover.jpg")
+
+        # HEADERS that bypass 403 hotlink protection
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
+                "Gecko/20100101 Firefox/118.0"
+            ),
+            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
+        }
+
+        try:
+            log(f"[CTRL] Downloading cover: {self.cover_url}")
+
+            resp = requests.get(self.cover_url, timeout=10, headers=headers)
+            resp.raise_for_status()
+
+            with open(cover_path, "wb") as f:
+                f.write(resp.content)
+
+            log(f"[CTRL] Cover saved to: {cover_path}")
+
+        except Exception as e:
+            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
+
+    # ---------------------------------------------------------
+    # Cover Replication to Volumes
+    # ---------------------------------------------------------
+    def replicate_cover_to_volumes(self):
+        """Copy cover.jpg into each existing Volume_xxx directory."""
+        src = os.path.join(self.book_base, "cover.jpg")
+        if not os.path.exists(src):
+            log("[CTRL] No cover.jpg found, replication skipped")
+            return
+
+        try:
+            for entry in os.listdir(self.book_base):
+                if entry.lower().startswith("volume_"):
+                    vol_dir = os.path.join(self.book_base, entry)
+                    dst = os.path.join(vol_dir, "cover.jpg")
+
+                    shutil.copyfile(src, dst)
+                    log(f"[CTRL] Cover replicated into: {dst}")
+
+        except Exception as e:
+            log(f"[CTRL] Cover replication failed: {e}")
+
     # ---------------------------------------------------------
     # Volume isolation
     # ---------------------------------------------------------
@@ -69,6 +130,11 @@ class DownloadController:
         )
         log(f"[CTRL] Output root: {self.book_base}")
 
+        # -------------------------------------
+        # 1) Download cover before any pipelines
+        # -------------------------------------
+        self.download_cover()
+
         tasks = []
 
         for ch in self.chapters:
@@ -94,4 +160,9 @@ class DownloadController:
             f"(book_id={self.book_id}, group_id={async_result.id})"
         )
 
+        # -------------------------------------------------------
+        # 2) AFTER dispatch: cover replication to volume folders
+        # -------------------------------------------------------
+        self.replicate_cover_to_volumes()
+
         return async_result
diff --git a/bookscraper/scraper/tasks/controller_tasks.py b/bookscraper/scraper/tasks/controller_tasks.py
index 0f5d0ea..0f06405 100644
--- a/bookscraper/scraper/tasks/controller_tasks.py
+++ b/bookscraper/scraper/tasks/controller_tasks.py
@@ -3,7 +3,7 @@
 # Purpose:
 #   Start the download → parse → save pipeline for a scraped book,
 #   including progress/abort tracking via book_id.
-#   ONLY THE CONTROLLER UPDATES PROGRESS.
+#   ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
 # ============================================================
 
 from celery_app import celery_app
@@ -12,9 +12,6 @@ from logbus.publisher import log
 from scraper.download_controller import DownloadController
 from scraper.progress import (
     set_total,
-    inc_completed,
-    inc_skipped,
-    inc_failed,
 )
 from scraper.abort import abort_requested
 
@@ -25,11 +22,10 @@ print(">>> [IMPORT] controller_tasks.py loaded")
 def launch_downloads(self, book_id: str, scrape_result: dict):
     """
     Launch the entire pipeline (download → parse → save),
-    AND maintain progress counters.
+    AND initialize progress counters.
 
-    EXPECTS:
-    - book_id: ID generated in scraping.start_scrape_book
-    - scrape_result: dict with title, author, url, chapters[]
+    Chapter-level progress is updated INSIDE the download/parse/save tasks.
+    This task MUST NOT call .get() on async subtasks (Celery restriction).
     """
 
     title = scrape_result.get("title", "UnknownBook")
@@ -50,46 +46,36 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
     ctl = DownloadController(book_id, scrape_result)
 
     # ------------------------------------------------------------
-    # RUN PIPELINE IN SYNC LOOP
-    # (DownloadController.start() returns per-chapter generator)
+    # START PIPELINES (ASYNC)
+    # Returns a Celery group AsyncResult. We DO NOT iterate or get().
+    # Progress & failures are handled by the worker subtasks.
     # ------------------------------------------------------------
     try:
-        for result in ctl.start():  # new generator mode
-            ch = result.get("chapter")
-
-            if result.get("skipped"):
-                inc_skipped(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] SKIPPED chapter {ch}")
-                continue
-
-            if result.get("failed"):
-                inc_failed(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] FAILED chapter {ch}")
-                continue
-
-            # Normal success
-            inc_completed(book_id)
-            log(f"[CTRL] DONE chapter {ch}")
-
-            # Abort requested mid-run?
-            if abort_requested(book_id):
-                log(f"[CTRL] ABORT after chapter {ch}")
-                break
+        group_result = ctl.start()
+
+        log(
+            f"[CTRL] Pipelines dispatched for '{title}' "
+            f"(book_id={book_id}, group_id={group_result.id})"
+        )
+
+        # Abort flag set BEFORE tasks start?
+        if abort_requested(book_id):
+            log("[CTRL] ABORT requested before tasks start")
+            return {"book_id": book_id, "aborted": True}
 
     except Exception as exc:
-        log(f"[CTRL] ERROR while processing pipeline: {exc}")
-        inc_failed(book_id)
+        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
         raise
 
     # ------------------------------------------------------------
-    # FINISHED
+    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
+    # (Download/parse/save tasks update progress themselves)
     # ------------------------------------------------------------
-    log(f"[CTRL] Pipeline finished for book_id={book_id}")
+    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
 
     return {
         "book_id": book_id,
         "total": total,
-        "completed": int(total),  # For safety
+        "started": True,
+        "group_id": group_result.id,
     }
diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html
index 1673c5a..81a12c8 100644
--- a/bookscraper/templates/result.html
+++ b/bookscraper/templates/result.html
@@ -68,6 +68,24 @@
     {{ message }}
   {% endif %}
+
+  {% if book_title %}
+    Cover:<br>
+    <img src="/output/{{ book_title }}/cover.jpg" alt="Cover">
+  {% endif %}
+