You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/download_controller.py

266 lines
9.2 KiB

# =========================================================
# File: scraper/download_controller.py
# Purpose:
# Build Celery pipelines for all chapters
# and pass book_id for abort/progress/log functionality.
# + Download and replicate cover image to all volume folders
# + Generate scripts (allinone.txt, makebook, say)
# + Initialize Redis Book State Model (status + counters)
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from scraper.scriptgen import generate_all_scripts
from logbus.publisher import log
import os
import requests
import shutil
from scraper.abort import abort_requested # DEBUG allowed
from db.repository import create_or_update_book
# NEW: Redis State Model (C&U)
from scraper.progress import (
init_book_state,
set_status,
set_chapter_total,
)
class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
    - volume splitting
    - consistent meta propagation
    - book_id-based abort + progress tracking
    - cover download + volume replication
    - script generation (allinone.txt, makebook, say)
    - Redis book state initialisation and status updates
    """

    def __init__(self, book_id: str, scrape_result: dict):
        """
        Prepare the output directory, metadata and Redis book state.

        Args:
            book_id: Unique identifier used for abort/progress/log tracking.
            scrape_result: Dict produced by the scraper; expected keys are
                "title", "chapters", "cover_url", "author", "description",
                "book_url" (all optional — defaults are applied).
        """
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Volume size.
        # Guard against a non-numeric MAX_VOL_SIZE (would raise ValueError
        # here) and against 0/negative values (would cause ZeroDivisionError
        # or bogus indices in get_volume_path()). Fall back to the default.
        try:
            self.max_vol = max(1, int(os.getenv("MAX_VOL_SIZE", "200")))
        except ValueError:
            self.max_vol = 200

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        # -------------------------------------------------
        # DEBUG — confirm the controller sees the correct book_id
        # -------------------------------------------------
        log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")
        try:
            abort_state = abort_requested(book_id)
            log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
        except Exception as e:
            # Abort check is diagnostic only; never block construction on it.
            log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")

        # -------------------------------------------------
        # NEW: Initialize Redis Book State Model
        # -------------------------------------------------
        try:
            init_book_state(
                book_id=self.book_id,
                title=self.title,
                url=self.scrape_result.get("book_url"),
                chapters_total=len(self.chapters),
            )
            log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
        except Exception as e:
            # State init failure must not prevent the download itself.
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    # Cover Download
    # ---------------------------------------------------------
    def download_cover(self):
        """Download one cover image into the root of the book folder."""
        if not self.cover_url:
            log(f"[CTRL] No cover URL found for '{self.title}'")
            return

        cover_path = os.path.join(self.book_base, "cover.jpg")

        # Browser-like headers: the source site rejects bare clients and
        # checks the Referer for hotlink protection.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
                "Gecko/20100101 Firefox/118.0"
            ),
            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
        }
        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()
            with open(cover_path, "wb") as f:
                f.write(resp.content)
            log(f"[CTRL] Cover saved to: {cover_path}")
        except Exception as e:
            # Best-effort: a missing cover should not abort the book download.
            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")

    # ---------------------------------------------------------
    # Cover Replication to Volumes
    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        """Copy cover.jpg into each existing Volume_xxx directory."""
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, replication skipped")
            return
        try:
            for entry in os.listdir(self.book_base):
                vol_dir = os.path.join(self.book_base, entry)
                # Only copy into real directories; a stray *file* named
                # "volume_..." would otherwise make copyfile() raise.
                if entry.lower().startswith("volume_") and os.path.isdir(vol_dir):
                    dst = os.path.join(vol_dir, "cover.jpg")
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated into: {dst}")
        except Exception as e:
            log(f"[CTRL] Cover replication failed: {e}")

    def store_cover_in_static(self):
        """
        Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
        This allows the Flask web UI to serve the cover directly.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, cannot store in static/covers")
            return

        # static/covers/<book_id>.jpg
        static_dir = os.path.join("static", "covers")
        os.makedirs(static_dir, exist_ok=True)
        dst = os.path.join(static_dir, f"{self.book_id}.jpg")
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed to store cover in static: {e}")

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """
        Return (and create if needed) the volume directory for a chapter.

        Chapters are grouped into folders of `max_vol` chapters each,
        named Volume_001, Volume_002, ... (1-based).
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        """
        Dispatch one Celery pipeline per chapter and return the group result.

        Side effects: updates Redis status/counters, downloads and
        replicates the cover, and generates the helper scripts.
        """
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        # -------------------------------------
        # NEW: Redis state update
        # -------------------------------------
        try:
            set_status(self.book_id, "downloading")
            set_chapter_total(self.book_id, total)
            log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
        except Exception as e:
            log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")

        # -------------------------------------
        # 1) Download cover
        # -------------------------------------
        self.download_cover()

        tasks = []
        for ch in self.chapters:
            # Build chapter_dict (NEW)
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            chapter_title = ch.get("title")
            # Creating the volume dir here (before dispatch) also guarantees
            # it exists when replicate_cover_to_volumes() runs below.
            volume_path = self.get_volume_path(chapter_num)

            chapter_dict = {
                "num": chapter_num,
                "url": chapter_url,
                "title": chapter_title,
                "volume_path": volume_path,
            }

            # Dispatch pipeline with chapter_dict
            tasks.append(
                build_chapter_pipeline(
                    self.book_id,
                    chapter_dict,
                    self.meta,
                )
            )

        async_result = group(tasks).apply_async()
        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )

        # Debug abort state
        try:
            abort_state = abort_requested(self.book_id)
            log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
        except Exception as e:
            log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")

        # -------------------------------------------------------
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # -------------------------------------------------------
        try:
            generate_all_scripts(
                self.book_base,
                self.title,
                self.meta.get("author"),
            )
            log(f"[CTRL] Scripts generated for '{self.title}'")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result