# scraper/download_controller.py
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from logbus.publisher import log
import os


class DownloadController:
    """Coordinates parallel chapter pipelines, with optional volume splitting."""

    def __init__(self, scrape_result: dict):
        self.scrape_result = scrape_result
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", [])

        # Base output dir from .env
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Max chapters per volume (from .env)
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base directory for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # ------------------------------------------
        # FIXED: meta now includes book_url
        # ------------------------------------------
        self.meta = {
            "title": self.scrape_result.get("title"),
            "author": self.scrape_result.get("author"),
            "description": self.scrape_result.get("description"),
            "book_url": self.scrape_result.get("book_url"),
        }

    def get_volume_path(self, chapter_num: int) -> str:
        """Returns the correct volume directory based on chapter number."""
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    def start(self):
        log(f"[CTRL] Starting download pipeline for {self.title}")
        log(f"[CTRL] Chapters: {len(self.chapters)}")
        log(f"[CTRL] Output root: {self.book_base}")
        log(f"[CTRL] MAX_VOL_SIZE = {self.max_vol}")

        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            vol_path = self.get_volume_path(chapter_num)
            tasks.append(
                build_chapter_pipeline(
                    chapter_num,
                    chapter_url,
                    vol_path,
                    self.meta,
                )
            )

        job_group = group(tasks)
        async_result = job_group.apply_async()
        log("[CTRL] Pipelines launched.")
        return async_result
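

# ------------------------------------------------------------------
# Usage sketch. Assumptions (not part of the original file): a Celery
# worker is running with a result backend configured, and the sample
# scrape_result values below are hypothetical, chosen only to match
# the shape this class consumes (title/author/description/book_url
# plus a chapters list of {"num", "url"} dicts).
# ------------------------------------------------------------------
if __name__ == "__main__":
    sample_result = {
        "title": "ExampleBook",                    # hypothetical
        "author": "Jane Doe",                      # hypothetical
        "description": "A sample book.",           # hypothetical
        "book_url": "https://example.com/book",    # hypothetical
        "chapters": [
            {"num": 1, "url": "https://example.com/book/ch1"},  # hypothetical
            {"num": 2, "url": "https://example.com/book/ch2"},  # hypothetical
        ],
    }

    controller = DownloadController(sample_result)
    result = controller.start()

    # start() returns a celery GroupResult; block on completion only
    # if a result backend is configured.
    # result.get()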