kmftools/bookscraper/scraper/download_controller.py

# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters using book_idx
#   Handles:
#     • volume assignment
#     • cover download + replication
#     • script generation
#     • Redis Book State Model init
#     • abort tracking
# =========================================================

from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline

# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen

from logbus.publisher import log
import os
import requests
import shutil

from scraper.abort import abort_requested
from db.state_redis import init_book_state
from db.repository import set_status, set_chapters_total


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save).
    """

    def __init__(self, book_idx: str, scrape_result: dict):
        self.book_idx = str(book_idx)
        self.scrape_result = scrape_result

        # Metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output folder
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed downstream
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")

        # Init Redis Book State Model
        try:
            init_book_state(
                book_id=self.book_idx,
                title=self.title,
                url=self.meta["book_url"],
                chapters_total=len(self.chapters),
            )
        except Exception as e:
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    def download_cover(self):
        if not self.cover_url:
            return log(f"[CTRL] No cover URL for '{self.title}'")

        cover_path = os.path.join(self.book_base, "cover.jpg")

        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": self.scrape_result.get("book_url") or "",
        }

        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()

            with open(cover_path, "wb") as f:
                f.write(resp.content)

            log(f"[CTRL] Cover saved: {cover_path}")
        except Exception as e:
            log(f"[CTRL] Cover download failed: {e}")

    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        for entry in os.listdir(self.book_base):
            if entry.lower().startswith("volume_"):
                dst = os.path.join(self.book_base, entry, "cover.jpg")
                try:
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated → {dst}")
                except Exception as e:
                    log(f"[CTRL] Cover replication failed: {e}")

    # ---------------------------------------------------------
    def store_cover_in_static(self):
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        os.makedirs("static/covers", exist_ok=True)
        dst = os.path.join("static/covers", f"{self.book_idx}.jpg")

        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed storing cover: {e}")

    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")

        # Update Redis/SQLite state
        try:
            set_status(self.book_idx, "downloading")
            set_chapters_total(self.book_idx, total)
        except Exception as e:
            log(f"[CTRL_STATE] Unable to set state: {e}")

        # Download cover
        self.download_cover()

        # Build pipeline tasks
        tasks = []
        for ch in self.chapters:
            num = ch["num"]
            chapter_info = {
                "num": num,
                "url": ch["url"],
                "title": ch.get("title"),
                "volume_path": self.get_volume_path(num),
            }
            tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))

        async_result = group(tasks).apply_async()

        # Replicate cover + place in static
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # Generate scripts (LATE IMPORT to avoid circular)
        try:
            scriptgen.generate_all_scripts(
                self.book_base, self.title, self.meta["author"]
            )
            log("[CTRL] Scripts generated")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result