You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
266 lines
9.2 KiB
266 lines
9.2 KiB
# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
#   + Download and replicate cover image to all volume folders
#   + Generate scripts (allinone.txt, makebook, say)
#   + Initialize Redis Book State Model (status + counters)
# =========================================================
|
|
|
|
from celery import group
|
|
from scraper.tasks.pipeline import build_chapter_pipeline
|
|
from scraper.scriptgen import generate_all_scripts
|
|
from logbus.publisher import log
|
|
import os
|
|
import requests
|
|
import shutil
|
|
from scraper.abort import abort_requested # DEBUG allowed
|
|
from db.repository import create_or_update_book
|
|
|
|
# NEW: Redis State Model (C&U)
|
|
from scraper.progress import (
|
|
init_book_state,
|
|
set_status,
|
|
set_chapter_total,
|
|
)
|
|
|
|
|
|
class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
    - volume splitting
    - consistent meta propagation
    - book_id-based abort + progress tracking
    - cover download + volume replication
    - script generation (allinone.txt, makebook, say)
    - Redis book state initialisation and status updates

    Environment variables read in ``__init__``:
    - ``BOOKSCRAPER_OUTPUT_DIR``: output root directory (default ``"output"``)
    - ``MAX_VOL_SIZE``: chapters per volume folder (default 200; values that
      are not a positive integer fall back to the default)
    """

    # Chapters per volume when MAX_VOL_SIZE is unset or unusable.
    DEFAULT_MAX_VOL = 200

    def __init__(self, book_id: str, scrape_result: dict):
        """
        Prepare output folders, shared metadata and the Redis book state.

        Args:
            book_id: Identifier used for abort checks, progress and logging.
            scrape_result: Scraper output; the keys read here are ``title``,
                ``chapters``, ``cover_url``, ``author``, ``description`` and
                ``book_url``.

        Raises:
            OSError: If the book base directory cannot be created.
        """
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata.
        # `or` (not a .get default) so an explicit None/"" title also falls
        # back; a None title would make os.path.join() below raise TypeError.
        self.title = scrape_result.get("title") or "UnknownBook"
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Volume size: must be a positive integer, otherwise
        # get_volume_path() would divide by zero or yield negative indices.
        env_max_vol = os.getenv("MAX_VOL_SIZE")
        parsed_max_vol = self._parse_positive_int(env_max_vol)
        if parsed_max_vol is None:
            if env_max_vol is not None:
                log(
                    f"[CTRL] Invalid MAX_VOL_SIZE={env_max_vol!r}; "
                    f"falling back to {self.DEFAULT_MAX_VOL}"
                )
            parsed_max_vol = self.DEFAULT_MAX_VOL
        self.max_vol = parsed_max_vol

        # Base folder for the whole book.
        # NOTE(review): the title comes from scraped content and is used
        # unsanitised as a directory name — confirm upstream strips path
        # separators before changing anything here.
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        # -------------------------------------------------
        # DEBUG — confirm the controller sees the correct book_id
        # -------------------------------------------------
        log(f"[CTRL_DEBUG] Controller init book_id={book_id} title='{self.title}'")

        try:
            abort_state = abort_requested(book_id)
            log(f"[CTRL_DEBUG] abort_requested(book_id={book_id}) → {abort_state}")
        except Exception as e:
            log(f"[CTRL_DEBUG] abort_requested ERROR: {e}")

        # -------------------------------------------------
        # Initialize Redis Book State Model (best effort: a failure is
        # logged and does not prevent the controller from being used)
        # -------------------------------------------------
        try:
            init_book_state(
                book_id=self.book_id,
                title=self.title,
                url=self.scrape_result.get("book_url"),
                chapters_total=len(self.chapters),
            )
            log(f"[CTRL_STATE] init_book_state() completed for {self.title}")
        except Exception as e:
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    @staticmethod
    def _parse_positive_int(raw) -> "int | None":
        """Return ``raw`` as an int when it parses to a value >= 1, else None."""
        try:
            value = int(raw)
        except (TypeError, ValueError):
            return None
        return value if value > 0 else None

    # ---------------------------------------------------------
    # Cover Download
    # ---------------------------------------------------------
    def download_cover(self):
        """
        Download one cover image into the root of the book folder.

        Best effort: a missing cover URL or any download/write error is
        logged and the method returns without raising.
        """
        if not self.cover_url:
            log(f"[CTRL] No cover URL found for '{self.title}'")
            return

        cover_path = os.path.join(self.book_base, "cover.jpg")

        # Browser-like headers; Referer is the book page when known.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
                "Gecko/20100101 Firefox/118.0"
            ),
            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
        }

        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")

            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()

            with open(cover_path, "wb") as f:
                f.write(resp.content)

            log(f"[CTRL] Cover saved to: {cover_path}")

        except Exception as e:
            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")

    # ---------------------------------------------------------
    # Cover Replication to Volumes
    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self):
        """
        Copy cover.jpg into each existing Volume_xxx directory.

        Best effort: does nothing when no cover.jpg exists in the book
        folder; copy errors are logged, not raised.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, replication skipped")
            return

        try:
            for entry in os.listdir(self.book_base):
                if not entry.lower().startswith("volume_"):
                    continue

                vol_dir = os.path.join(self.book_base, entry)
                # A stray *file* named volume_* would make the copy raise and
                # abort replication for every remaining volume.
                if not os.path.isdir(vol_dir):
                    continue

                dst = os.path.join(vol_dir, "cover.jpg")
                shutil.copyfile(src, dst)
                log(f"[CTRL] Cover replicated into: {dst}")

        except Exception as e:
            log(f"[CTRL] Cover replication failed: {e}")

    def store_cover_in_static(self):
        """
        Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
        This allows the Flask web UI to serve the cover directly.

        Best effort: does nothing when no cover.jpg exists; directory
        creation and copy errors are logged, not raised.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            log("[CTRL] No cover.jpg found, cannot store in static/covers")
            return

        # static/covers/<book_id>.jpg (relative to the working directory)
        static_dir = os.path.join("static", "covers")
        dst = os.path.join(static_dir, f"{self.book_id}.jpg")

        try:
            # Inside the try: this runs after the pipelines were dispatched,
            # so a makedirs failure must not escape start().
            os.makedirs(static_dir, exist_ok=True)
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed to store cover in static: {e}")

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """
        Returns the correct volume directory for a chapter.

        Chapters 1..max_vol map to Volume_001, the next max_vol chapters to
        Volume_002, and so on. The directory is created if it is missing.
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        """
        Dispatch one pipeline per chapter as a Celery group.

        Steps, in order: update the Redis status, download the cover, build
        and dispatch the chapter pipelines, replicate the cover into the
        volume folders and static/covers, then generate the scripts. All
        steps except building/dispatching the pipelines are best effort.

        Returns:
            The result object returned by ``group(tasks).apply_async()``.

        Raises:
            KeyError: If a chapter entry lacks ``"num"`` or ``"url"``.
        """
        total = len(self.chapters)

        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        # -------------------------------------
        # Redis state update
        # -------------------------------------
        try:
            set_status(self.book_id, "downloading")
            set_chapter_total(self.book_id, total)
            log(f"[CTRL_STATE] Status set to 'downloading' for {self.book_id}")
        except Exception as e:
            log(f"[CTRL_STATE] set_status/set_chapter_total FAILED: {e}")

        # -------------------------------------
        # 1) Download cover
        # -------------------------------------
        self.download_cover()

        # -------------------------------------
        # 2) Build one pipeline per chapter
        # -------------------------------------
        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]

            # get_volume_path() also creates the volume folder, which is why
            # cover replication further down finds the directories.
            chapter_dict = {
                "num": chapter_num,
                "url": ch["url"],
                "title": ch.get("title"),
                "volume_path": self.get_volume_path(chapter_num),
            }

            tasks.append(
                build_chapter_pipeline(
                    self.book_id,
                    chapter_dict,
                    self.meta,
                )
            )

        async_result = group(tasks).apply_async()

        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )

        # Debug abort state
        try:
            abort_state = abort_requested(self.book_id)
            log(f"[CTRL_DEBUG] After-dispatch abort state: {abort_state}")
        except Exception as e:
            log(f"[CTRL_DEBUG] abort_requested error after dispatch: {e}")

        # -------------------------------------
        # 3) Cover replication + UI copy
        # -------------------------------------
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # -------------------------------------
        # 4) Script generation
        # -------------------------------------
        try:
            generate_all_scripts(
                self.book_base,
                self.title,
                self.meta.get("author"),
            )
            log(f"[CTRL] Scripts generated for '{self.title}'")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result
|