kmftools/bookscraper/scraper/download_controller.py

# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
# =========================================================
import os

from celery import group

from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline
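

# group() dispatches the per-chapter pipelines in parallel;
# build_chapter_pipeline() assembles the download → parse → save chain
# for a single chapter; log() is the project's log/progress publisher.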
class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
      - volume splitting
      - consistent meta propagation
      - book_id-based abort + progress tracking
    """

    def __init__(self, book_id: str, scrape_result: dict):
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []

        # Output base dir
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Maximum number of chapters per volume
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)
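        # NOTE: this assumes the scraped title is filesystem-safe; titles
        # containing path separators or reserved characters would need to
        # be sanitised upstream.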

        # Meta passed to parse/save stage
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """Return the volume directory for a chapter, creating it if needed."""
        vol_index = (chapter_num - 1) // self.max_vol + 1
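        # Worked example with max_vol=200: chapters 1-200 → vol_index 1
        # (Volume_001), chapter 201 → vol_index 2 (Volume_002), and so on.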
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)
        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            volume_path = self.get_volume_path(chapter_num)
            tasks.append(
                build_chapter_pipeline(
                    self.book_id,  # UUID from scraping.py
                    chapter_num,
                    chapter_url,
                    volume_path,
                    self.meta,
                )
            )
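
        # Fan the chapter pipelines out in parallel as one Celery group;
        # apply_async() returns a GroupResult whose .id is logged below.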
        async_result = group(tasks).apply_async()
        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )
        return async_result
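

# ---------------------------------------------------------
# Usage sketch (illustrative only, not part of the module).
# Assumes scraping.py supplies a UUID book_id and a scrape_result dict
# shaped like the one below; the chapter keys "num" and "url" match what
# start() reads. Actually running this needs a Celery broker and worker.
# ---------------------------------------------------------
if __name__ == "__main__":
    import uuid

    scrape_result = {
        "title": "ExampleBook",
        "author": "Jane Doe",
        "description": "A short demo record.",
        "book_url": "https://example.com/book/1",
        "chapters": [
            {"num": i, "url": f"https://example.com/book/1/chapter/{i}"}
            for i in range(1, 5)
        ],
    }
    controller = DownloadController(str(uuid.uuid4()), scrape_result)
    controller.start()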