You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/download_controller.py

266 lines
9.2 KiB

# =========================================================
# File: scraper/download_controller.py
# Purpose:
# Build Celery pipelines for all chapters
# and pass book_id for abort/progress/log functionality.
# + Download and replicate cover image to all volume folders
# + Generate scripts (allinone.txt, makebook, say)
# + Initialize Redis Book State Model (status + counters)
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from scraper.scriptgen import generate_all_scripts
from logbus.publisher import log
import os
import requests
import shutil
from scraper.abort import abort_requested # DEBUG allowed
from db.repository import create_or_update_book
# NEW: Redis State Model (C&U)
from scraper.progress import (
init_book_state,
set_status,
set_chapter_total,
)
class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
    - volume splitting
    - consistent meta propagation
    - book_id-based abort + progress tracking
    - cover download + volume replication
    - script generation (allinone.txt, makebook, say)
    - Redis book state initialisation and status updates
    """

    def __init__(self, book_id: str, scrape_result: dict):
        """
        Prepare the output directory, metadata and Redis book state.

        Args:
            book_id: Unique identifier used for abort/progress/log tracking.
            scrape_result: Dict produced by the scraper; expected keys are
                "title", "chapters", "cover_url", "author", "description",
                "book_url" (all optional — defaults are applied).
        """
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Volume size.
        # Guard against a non-numeric MAX_VOL_SIZE (would raise ValueError
        # here) and against 0/negative values (would cause ZeroDivisionError
        # or bogus indices in get_volume_path()). Fall back to the default.
        try:
            self.max_vol = max(1, int(os.getenv("MAX_VOL_SIZE", "200")))
        except ValueError:
            self.max_vol = 200

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        # -------------------------------------------------
        # DEBUG — confirm the controller sees the correct book_id
        # -------------------------------------------------
        log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
        try:
            abort_state = abort_requested(book_id)
            log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
        except Exception as e:
            # Abort check is diagnostic only; never block construction on it.
            log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")

        # -------------------------------------------------
        # NEW: Initialize Redis Book State Model
        # -------------------------------------------------
        try:
            init_book_state(
                book_id=self.book_id,
                title=self.title,
                url=self.scrape_result.get("book_url"),
                chapters_total=len(self.chapters),
            )
            log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
        except Exception as e:
            # State init failure must not prevent the download itself.
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    # Cover Download
    # ---------------------------------------------------------
    def download_cover(self):
        """Download one cover image into the root of the book folder."""
        if not self.cover_url:
            log(f"[CTRL] No cover URL found for '{self.title}'")
            return

        cover_path = os.path.join(self.book_base, "cover.jpg")

        # Browser-like headers: the source site rejects bare clients and
        # checks the Referer for hotlink protection.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
                "Gecko/20100101 Firefox/118.0"
            ),
            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
        }
        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()
            with open(cover_path, "wb") as f:
                f.write(resp.content)
            log(f"[CTRL] Cover saved to: {cover_path}")
        except Exception as e:
            # Best-effort: a missing cover should not abort the book download.
            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")

    # ---------------------------------------------------------
    # Cover Replication to Volumes
    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        """Copy cover.jpg into each existing Volume_xxx directory."""
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, replication skipped")
            return
        try:
            for entry in os.listdir(self.book_base):
                vol_dir = os.path.join(self.book_base, entry)
                # Only copy into real directories; a stray *file* named
                # "volume_..." would otherwise make copyfile() raise.
                if entry.lower().startswith("volume_") and os.path.isdir(vol_dir):
                    dst = os.path.join(vol_dir, "cover.jpg")
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated into: {dst}")
        except Exception as e:
            log(f"[CTRL] Cover replication failed: {e}")

    def store_cover_in_static(self):
        """
        Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
        This allows the Flask web UI to serve the cover directly.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, cannot store in static/covers")
            return

        # static/covers/<book_id>.jpg
        static_dir = os.path.join("static", "covers")
        os.makedirs(static_dir, exist_ok=True)
        dst = os.path.join(static_dir, f"{self.book_id}.jpg")
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed to store cover in static: {e}")

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """
        Return (and create if needed) the volume directory for a chapter.

        Chapters are grouped into folders of `max_vol` chapters each,
        named Volume_001, Volume_002, ... (1-based).
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        """
        Dispatch one Celery pipeline per chapter and return the group result.

        Side effects: updates Redis status/counters, downloads and
        replicates the cover, and generates the helper scripts.
        """
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        # -------------------------------------
        # NEW: Redis state update
        # -------------------------------------
        try:
            set_status(self.book_id, "downloading")
            set_chapter_total(self.book_id, total)
            log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
        except Exception as e:
            log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")

        # -------------------------------------
        # 1) Download cover
        # -------------------------------------
        self.download_cover()

        tasks = []
        for ch in self.chapters:
            # Build chapter_dict (NEW)
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            chapter_title = ch.get("title")
            # Creating the volume dir here (before dispatch) also guarantees
            # it exists when replicate_cover_to_volumes() runs below.
            volume_path = self.get_volume_path(chapter_num)

            chapter_dict = {
                "num": chapter_num,
                "url": chapter_url,
                "title": chapter_title,
                "volume_path": volume_path,
            }

            # Dispatch pipeline with chapter_dict
            tasks.append(
                build_chapter_pipeline(
                    self.book_id,
                    chapter_dict,
                    self.meta,
                )
            )

        async_result = group(tasks).apply_async()
        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )

        # Debug abort state
        try:
            abort_state = abort_requested(self.book_id)
            log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
        except Exception as e:
            log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")

        # -------------------------------------------------------
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # -------------------------------------------------------
        try:
            generate_all_scripts(
                self.book_base,
                self.title,
                self.meta.get("author"),
            )
            log(f"[CTRL] Scripts generated for '{self.title}'")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result