# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
#   + Download and replicate cover image to all volume folders
#   + Generate scripts (allinone.txt, makebook, say)
#   + Initialize Redis Book State Model (status + counters)
# =========================================================
import os
import shutil

import requests
from celery import group

from db.repository import create_or_update_book
from logbus.publisher import log
from scraper.abort import abort_requested  # DEBUG allowed
from scraper.scriptgen import generate_all_scripts
from scraper.tasks.pipeline import build_chapter_pipeline

# NEW: Redis State Model (C&U)
from scraper.progress import (
    init_book_state,
    set_status,
    set_chapter_total,
)


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save), including:
      - volume splitting
      - consistent meta propagation
      - book_id-based abort + progress tracking
      - cover download + volume replication
      - script generation (allinone.txt, makebook, say)
      - Redis book state initialisation and status updates
    """

    def __init__(self, book_id: str, scrape_result: dict):
        """
        Args:
            book_id: Unique identifier used for abort checks, progress
                tracking and Redis state keys.
            scrape_result: Dict produced by the scraper; expected keys:
                ``title``, ``chapters`` (list of dicts with ``num``/``url``/
                ``title``), ``cover_url``, ``author``, ``description``,
                ``book_url``.
        """
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        # "or []" also guards against an explicit None value for "chapters".
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output base dir (overridable via environment)
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Max chapters per volume folder
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base folder for the whole book
        # NOTE(review): self.title is used verbatim as a directory name —
        # presumably already sanitized upstream; verify against the scraper.
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        # -------------------------------------------------
        # DEBUG — confirm the controller sees the correct book_id
        # -------------------------------------------------
        log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
        try:
            abort_state = abort_requested(book_id)
            log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
        except Exception as e:
            # Best-effort debug probe; never let it break construction.
            log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")

        # -------------------------------------------------
        # NEW: Initialize Redis Book State Model
        # -------------------------------------------------
        try:
            init_book_state(
                book_id=self.book_id,
                title=self.title,
                url=self.scrape_result.get("book_url"),
                chapters_total=len(self.chapters),
            )
            log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
        except Exception as e:
            # State tracking is auxiliary; a Redis failure must not stop
            # the download itself.
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    # Cover Download
    # ---------------------------------------------------------
    def download_cover(self):
        """Download one cover image into the root of the book folder.

        Saves to ``<book_base>/cover.jpg``. Failures are logged, never
        raised. No-op when the scrape result carried no cover URL.
        """
        if not self.cover_url:
            log(f"[CTRL] No cover URL found for '{self.title}'")
            return

        cover_path = os.path.join(self.book_base, "cover.jpg")

        # Browser-like headers: the source site rejects bare clients
        # and checks the Referer for hotlinking.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
                "Gecko/20100101 Firefox/118.0"
            ),
            "Referer": self.scrape_result.get("book_url")
            or "https://www.piaotia.com/",
        }

        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()
            with open(cover_path, "wb") as f:
                f.write(resp.content)
            log(f"[CTRL] Cover saved to: {cover_path}")
        except Exception as e:
            # Cover art is optional; log and continue.
            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")

    # ---------------------------------------------------------
    # Cover Replication to Volumes
    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        """Copy cover.jpg into each existing Volume_xxx directory.

        Scans ``book_base`` for entries whose name starts with "volume_"
        (case-insensitive). Skips silently when no cover.jpg exists.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, replication skipped")
            return

        try:
            for entry in os.listdir(self.book_base):
                if entry.lower().startswith("volume_"):
                    vol_dir = os.path.join(self.book_base, entry)
                    dst = os.path.join(vol_dir, "cover.jpg")
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated into: {dst}")
        except Exception as e:
            log(f"[CTRL] Cover replication failed: {e}")

    def store_cover_in_static(self):
        """
        Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
        This allows the Flask web UI to serve the cover directly.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, cannot store in static/covers")
            return

        # static/covers/<book_id>.jpg
        # NOTE(review): relative path — assumes the process CWD is the
        # Flask app root; confirm against deployment layout.
        static_dir = os.path.join("static", "covers")
        os.makedirs(static_dir, exist_ok=True)

        dst = os.path.join(static_dir, f"{self.book_id}.jpg")
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed to store cover in static: {e}")

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """Returns the correct volume directory for a chapter.

        Chapters are grouped into volumes of ``max_vol`` chapters each
        (chapter 1..max_vol → Volume_001, etc.). Creates the directory
        on first use.
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        """Dispatch one Celery pipeline per chapter and run side tasks.

        Sequence: update Redis state → download cover → dispatch the
        chapter pipeline group (async) → replicate/store cover →
        generate scripts. Returns the Celery ``GroupResult`` for the
        dispatched pipelines.
        """
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        # -------------------------------------
        # NEW: Redis state update
        # -------------------------------------
        try:
            set_status(self.book_id, "downloading")
            set_chapter_total(self.book_id, total)
            log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
        except Exception as e:
            log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")

        # -------------------------------------
        # 1) Download cover
        # -------------------------------------
        self.download_cover()

        tasks = []
        for ch in self.chapters:
            # Build chapter_dict (NEW)
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            chapter_title = ch.get("title")
            # Side effect: creates the volume directory, so it exists
            # before the cover replication below runs.
            volume_path = self.get_volume_path(chapter_num)

            chapter_dict = {
                "num": chapter_num,
                "url": chapter_url,
                "title": chapter_title,
                "volume_path": volume_path,
            }

            # Dispatch pipeline with chapter_dict
            tasks.append(
                build_chapter_pipeline(
                    self.book_id,
                    chapter_dict,
                    self.meta,
                )
            )

        async_result = group(tasks).apply_async()

        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )

        # Debug abort state
        try:
            abort_state = abort_requested(self.book_id)
            log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
        except Exception as e:
            log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")

        # -------------------------------------------------------
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # -------------------------------------------------------
        try:
            generate_all_scripts(
                self.book_base,
                self.title,
                self.meta.get("author"),
            )
            log(f"[CTRL] Scripts generated for '{self.title}'")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result