kmftools/bookscraper/scraper/download_controller.py

# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
# =========================================================
import os

from celery import group

from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline
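

# group() dispatches the per-chapter pipelines in parallel;
# build_chapter_pipeline() assembles the download → parse → save chain
# for a single chapter; log() is the project's log/progress publisher.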
class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
      - volume splitting
      - consistent meta propagation
      - book_id-based abort + progress tracking
    """

    def __init__(self, book_id: str, scrape_result: dict):
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Maximum number of chapters per volume
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)
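        # NOTE: this assumes the scraped title is filesystem-safe; titles
        # containing path separators or reserved characters would need to
        # be sanitised upstream.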

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """Return the volume directory for a chapter, creating it if needed."""
        vol_index = (chapter_num - 1) // self.max_vol + 1
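        # Worked example with max_vol=200: chapters 1-200 → vol_index 1
        # (Volume_001), chapter 201 → vol_index 2 (Volume_002), and so on.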
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            volume_path = self.get_volume_path(chapter_num)
            tasks.append(
                build_chapter_pipeline(
                    self.book_id,  # UUID from scraping.py
                    chapter_num,
                    chapter_url,
                    volume_path,
                    self.meta,
                )
            )
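
        # Fan the chapter pipelines out in parallel as one Celery group;
        # apply_async() returns a GroupResult whose .id is logged below.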
        async_result = group(tasks).apply_async()
        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )
        return async_result
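

# ---------------------------------------------------------
# Usage sketch (illustrative only, not part of the module).
# Assumes scraping.py supplies a UUID book_id and a scrape_result dict
# shaped like the one below; the chapter keys "num" and "url" match what
# start() reads. Actually running this needs a Celery broker and worker.
# ---------------------------------------------------------
if __name__ == "__main__":
    import uuid

    scrape_result = {
        "title": "ExampleBook",
        "author": "Jane Doe",
        "description": "A short demo record.",
        "book_url": "https://example.com/book/1",
        "chapters": [
            {"num": i, "url": f"https://example.com/book/1/chapter/{i}"}
            for i in range(1, 5)
        ],
    }
    controller = DownloadController(str(uuid.uuid4()), scrape_result)
    controller.start()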