# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters using book_idx
#   Handles:
#     • volume assignment
#     • cover download + replication
#     • script generation
#     • Redis Book State Model init
#     • abort tracking
# =========================================================
import os
import shutil

import requests
from celery import group

from db.repository import set_status, set_chapters_total
from db.state_redis import init_book_state
from logbus.publisher import log
from scraper.abort import abort_requested
from scraper.tasks.pipeline import build_chapter_pipeline

# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController back,
# or we get a circular import. The module-level import below is
# fine as long as scriptgen stays clean of controller imports.
from scraper import scriptgen


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save).
    """

    def __init__(self, book_idx: str, scrape_result: dict):
        self.book_idx = str(book_idx)
        self.scrape_result = scrape_result

        # Metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output folder
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed downstream
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")

        # Init Redis Book State Model
        try:
            init_book_state(
                book_id=self.book_idx,
                title=self.title,
                url=self.meta["book_url"],
                chapters_total=len(self.chapters),
            )
        except Exception as e:
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    def download_cover(self):
        if not self.cover_url:
            log(f"[CTRL] No cover URL for '{self.title}'")
            return

        cover_path = os.path.join(self.book_base, "cover.jpg")
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": self.scrape_result.get("book_url") or "",
        }

        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()
            with open(cover_path, "wb") as f:
                f.write(resp.content)
            log(f"[CTRL] Cover saved: {cover_path}")
        except Exception as e:
            log(f"[CTRL] Cover download failed: {e}")

    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        for entry in os.listdir(self.book_base):
            # Only copy into actual volume directories.
            if not entry.lower().startswith("volume_"):
                continue
            if not os.path.isdir(os.path.join(self.book_base, entry)):
                continue
            dst = os.path.join(self.book_base, entry, "cover.jpg")
            try:
                shutil.copyfile(src, dst)
                log(f"[CTRL] Cover replicated → {dst}")
            except Exception as e:
                log(f"[CTRL] Cover replication failed: {e}")

    # ---------------------------------------------------------
    def store_cover_in_static(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        os.makedirs("static/covers", exist_ok=True)
        dst = os.path.join("static/covers", f"{self.book_idx}.jpg")
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed storing cover: {e}")

    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        # Chapters are binned into fixed-size volumes, e.g. with
        # MAX_VOL_SIZE=200: chapters 1-200 → Volume_001,
        # chapters 201-400 → Volume_002, and so on.
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")

        # Update Redis/SQLite state
        try:
            set_status(self.book_idx, "downloading")
            set_chapters_total(self.book_idx, total)
        except Exception as e:
            log(f"[CTRL_STATE] Unable to set state: {e}")

        # Download cover
        self.download_cover()

        # Build one pipeline per chapter; get_volume_path also
        # creates the volume directory as a side effect.
        tasks = []
        for ch in self.chapters:
            num = ch["num"]
            chapter_info = {
                "num": num,
                "url": ch["url"],
                "title": ch.get("title"),
                "volume_path": self.get_volume_path(num),
            }
            tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))

        async_result = group(tasks).apply_async()

        # Replicate cover + place in static (the volume dirs already
        # exist, so this is safe before the workers finish).
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # Generate scripts (scriptgen must not import this module back)
        try:
            scriptgen.generate_all_scripts(
                self.book_base, self.title, self.meta["author"]
            )
            log("[CTRL] Scripts generated")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result
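
# ---------------------------------------------------------
# Usage sketch (illustrative only; the scrape_result fields and
# values below are inferred from the accesses above, not a
# documented contract):
#
#   from scraper.download_controller import DownloadController
#
#   scrape_result = {
#       "title": "Example Book",
#       "author": "Jane Doe",
#       "description": "An example.",
#       "book_url": "https://example.com/book/42",
#       "cover_url": "https://example.com/book/42/cover.jpg",
#       "chapters": [
#           {"num": 1, "url": "https://example.com/book/42/1", "title": "Chapter 1"},
#       ],
#   }
#
#   controller = DownloadController("42", scrape_result)
#   async_result = controller.start()   # Celery GroupResult
#   async_result.get()                  # block until all chapter pipelines finish
# ---------------------------------------------------------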