# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
# =========================================================

import os

from celery import group

from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save), including:
      - volume splitting
      - consistent meta propagation
      - book_id-based abort + progress tracking
    """

    def __init__(self, book_id: str, scrape_result: dict):
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Volume size (chapters per volume)
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """Return the correct volume directory for a chapter, creating it if needed."""
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            volume_path = self.get_volume_path(chapter_num)

            tasks.append(
                build_chapter_pipeline(
                    self.book_id,  # ← UUID from scraping.py
                    chapter_num,
                    chapter_url,
                    volume_path,
                    self.meta,
                )
            )

        async_result = group(tasks).apply_async()

        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )
        return async_result
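

# ---------------------------------------------------------
# Usage sketch (illustrative only, not part of the module):
# a minimal example of how a caller such as scraping.py might
# hand a scrape result to this controller. The sample payload,
# book_id value, and broker availability are assumptions; the
# real scrape_result comes from the scraping stage.
# ---------------------------------------------------------
if __name__ == "__main__":
    import uuid

    # Hypothetical scrape result; field names mirror what __init__ reads above.
    sample_result = {
        "title": "Example Book",
        "author": "Jane Doe",
        "description": "Demo payload for a dry run.",
        "book_url": "https://example.com/book/1",
        "chapters": [
            {"num": 1, "url": "https://example.com/book/1/ch/1"},
            {"num": 2, "url": "https://example.com/book/1/ch/2"},
        ],
    }

    controller = DownloadController(
        book_id=str(uuid.uuid4()),
        scrape_result=sample_result,
    )
    # Dispatching requires a running Celery broker and workers that
    # consume the chapter pipeline tasks.
    result = controller.start()
    print(f"Dispatched group {result.id}")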