# =========================================================
# File: scraper/download_controller.py
#
# Purpose:
#   Build Celery pipelines for all chapters using book_idx
#
# Handles:
#   • volume assignment
#   • cover download + replication
#   • script generation
#   • Redis Book State Model init
#   • abort tracking
# =========================================================

import os
import shutil

import requests
from celery import group

from db.repository import set_status, set_chapters_total
from db.state_redis import init_book_state
from logbus.publisher import log
from scraper.abort import abort_requested
from scraper.tasks.pipeline import build_chapter_pipeline

# ❗ IMPORTANT:
# generate_all_scripts MUST NOT import DownloadController, otherwise circular import.
# We keep the import, but scriptgen must be clean.
from scraper import scriptgen


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save).

    One controller instance drives a single book: it initializes the Redis
    book-state model, downloads and replicates the cover art, assigns
    chapters to Volume_* folders, dispatches one Celery pipeline per
    chapter, and triggers post-download script generation.
    """

    def __init__(self, book_idx: str, scrape_result: dict) -> None:
        """
        Capture scrape metadata, prepare the output folder, and initialize
        the Redis book state.

        Args:
            book_idx: Unique book identifier; coerced to ``str`` so it can
                be used consistently in Redis keys and file names.
            scrape_result: Dict produced by the scraper. Keys read here:
                "title", "chapters", "cover_url", "author", "description",
                "book_url" — all accessed defensively via ``.get``.
        """
        self.book_idx = str(book_idx)
        self.scrape_result = scrape_result

        # Metadata
        self.title = scrape_result.get("title", "UnknownBook")
        # "or []" also guards against an explicit chapters=None value
        self.chapters = scrape_result.get("chapters", []) or []
        self.cover_url = scrape_result.get("cover_url")

        # Output folder
        # NOTE(review): self.title is used verbatim as a directory name —
        # a title containing path separators or OS-illegal characters would
        # misbehave here; confirm upstream sanitization.
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        # Max chapters per volume folder (see get_volume_path)
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta passed downstream to every chapter pipeline task
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

        log(f"[CTRL_DEBUG] Controller init book_idx={self.book_idx}")

        # Init Redis Book State Model
        # Best-effort: a failure is logged, not raised, so construction
        # still succeeds even when Redis is unavailable.
        try:
            init_book_state(
                book_id=self.book_idx,
                title=self.title,
                url=self.meta["book_url"],
                chapters_total=len(self.chapters),
            )
        except Exception as e:
            log(f"[CTRL_STATE] init_book_state FAILED: {e}")

    # ---------------------------------------------------------
    def download_cover(self) -> None:
        """
        Fetch the cover image into <book_base>/cover.jpg (best-effort).

        Never raises: any network/HTTP/IO failure is logged and swallowed,
        since cover art is optional for the download run.
        """
        if not self.cover_url:
            # Early exit with a message; the value of log() (presumably
            # None — TODO confirm) is what gets returned here.
            return log(f"[CTRL] No cover URL for '{self.title}'")

        cover_path = os.path.join(self.book_base, "cover.jpg")

        # Browser-like UA + referer, since some image hosts reject
        # bare scripted requests.
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": self.scrape_result.get("book_url") or "",
        }

        try:
            log(f"[CTRL] Downloading cover: {self.cover_url}")
            resp = requests.get(self.cover_url, timeout=10, headers=headers)
            resp.raise_for_status()

            with open(cover_path, "wb") as f:
                f.write(resp.content)

            log(f"[CTRL] Cover saved: {cover_path}")
        except Exception as e:
            log(f"[CTRL] Cover download failed: {e}")

    # ---------------------------------------------------------
    def replicate_cover_to_volumes(self) -> None:
        """
        Copy <book_base>/cover.jpg into every existing Volume_* subfolder.

        No-op when the cover was never downloaded; per-volume copy failures
        are logged and do not stop the remaining copies.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        for entry in os.listdir(self.book_base):
            # Case-insensitive match on the Volume_NNN naming scheme
            if entry.lower().startswith("volume_"):
                dst = os.path.join(self.book_base, entry, "cover.jpg")
                try:
                    shutil.copyfile(src, dst)
                    log(f"[CTRL] Cover replicated → {dst}")
                except Exception as e:
                    log(f"[CTRL] Cover replication failed: {e}")

    # ---------------------------------------------------------
    def store_cover_in_static(self) -> None:
        """
        Publish the cover as static/covers/<book_idx>.jpg for the web UI.

        No-op when the cover was never downloaded; copy failures are
        logged, not raised.
        """
        src = os.path.join(self.book_base, "cover.jpg")
        if not os.path.exists(src):
            return

        # NOTE(review): relative path — resolves against the process CWD,
        # presumably the app root; confirm for worker processes.
        os.makedirs("static/covers", exist_ok=True)
        dst = os.path.join("static/covers", f"{self.book_idx}.jpg")

        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover stored for UI: {dst}")
        except Exception as e:
            log(f"[CTRL] Failed storing cover: {e}")

    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """
        Return (and create) the volume folder for a 1-based chapter number.

        Chapters are grouped max_vol per volume: chapters 1..max_vol go to
        Volume_001, the next max_vol to Volume_002, and so on.
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    def start(self):
        """
        Build and dispatch one Celery pipeline per chapter.

        Side-effect order matters: state is marked "downloading", the cover
        is fetched, volume folders are created while building the per-chapter
        task list, the Celery group is dispatched asynchronously, and only
        then are the cover replicated/published and the helper scripts
        generated (the chapter tasks run independently in workers).

        Returns:
            The Celery async result of ``group(...).apply_async()``.
        """
        total = len(self.chapters)
        log(f"[CTRL] Starting pipeline for '{self.title}' ({total} chapters)")

        # Update Redis/SQLite state (best-effort)
        try:
            set_status(self.book_idx, "downloading")
            set_chapters_total(self.book_idx, total)
        except Exception as e:
            log(f"[CTRL_STATE] Unable to set state: {e}")

        # Download cover
        self.download_cover()

        # Build pipeline tasks; get_volume_path() creates each volume
        # folder as a side effect, so they exist before replication below.
        tasks = []
        for ch in self.chapters:
            num = ch["num"]
            chapter_info = {
                "num": num,
                "url": ch["url"],
                "title": ch.get("title"),
                "volume_path": self.get_volume_path(num),
            }
            tasks.append(build_chapter_pipeline(self.book_idx, chapter_info, self.meta))

        # Fire-and-continue: workers process chapters while we finish up.
        async_result = group(tasks).apply_async()

        # Replicate cover + place in static
        self.replicate_cover_to_volumes()
        self.store_cover_in_static()

        # Generate scripts (LATE IMPORT to avoid circular)
        try:
            scriptgen.generate_all_scripts(
                self.book_base, self.title, self.meta["author"]
            )
            log("[CTRL] Scripts generated")
        except Exception as e:
            log(f"[CTRL] Script generation failed: {e}")

        return async_result