# =========================================================
# File: scraper/download_controller.py
# Purpose:
#   Build Celery pipelines for all chapters
#   and pass book_id for abort/progress/log functionality.
# =========================================================

import os

from celery import group

from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline


class DownloadController:
    """
    Coordinates all chapter pipelines (download → parse → save),
    including:
    - volume splitting
    - consistent meta propagation
    - book_id-based abort + progress tracking
    """
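
    # Configuration is read from the environment (see __init__ below):
    #   BOOKSCRAPER_OUTPUT_DIR - root output directory (default "output")
    #   MAX_VOL_SIZE           - chapters per volume (default 200)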
    def __init__(self, book_id: str, scrape_result: dict):
        self.book_id = book_id
        self.scrape_result = scrape_result

        # Core metadata
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", []) or []

        # Output base directory
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")

        # Maximum number of chapters per volume
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))

        # Base folder for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)

        # Meta propagated to the parse/save stage of every chapter pipeline
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
            "book_url": scrape_result.get("book_url"),
        }

    # ---------------------------------------------------------
    # Volume isolation
    # ---------------------------------------------------------
    def get_volume_path(self, chapter_num: int) -> str:
        """Return the volume directory for a chapter, creating it if needed."""
        vol_index = (chapter_num - 1) // self.max_vol + 1
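        # e.g. with max_vol=200: chapters 1-200 → Volume_001,
        # chapters 201-400 → Volume_002, and so on.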
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    # ---------------------------------------------------------
    # Pipeline launcher
    # ---------------------------------------------------------
    def start(self):
        total = len(self.chapters)

        log(
            f"[CTRL] Initialising pipeline for '{self.title}' "
            f"(book_id={self.book_id}, chapters={total}, max_vol={self.max_vol})"
        )
        log(f"[CTRL] Output root: {self.book_base}")

        tasks = []

        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]

            volume_path = self.get_volume_path(chapter_num)

            tasks.append(
                build_chapter_pipeline(
                    self.book_id,  # UUID issued by scraping.py
                    chapter_num,
                    chapter_url,
                    volume_path,
                    self.meta,
                )
            )
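
        # Each pipeline is assumed to be a Celery signature (the
        # download → parse → save chain built by build_chapter_pipeline),
        # so the whole batch can be dispatched as a single group.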
        async_result = group(tasks).apply_async()
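        # group(...).apply_async() launches all chains in parallel and
        # returns a GroupResult; its .id identifies the whole batch.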

        log(
            f"[CTRL] Pipelines dispatched for '{self.title}' "
            f"(book_id={self.book_id}, group_id={async_result.id})"
        )

        return async_result
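
# ---------------------------------------------------------
# Example usage (illustrative sketch; the book_id and the
# scrape_result values below are hypothetical, with field
# names taken from the accesses above)
# ---------------------------------------------------------
#
#   controller = DownloadController(
#       book_id="6f1c9b2e-...",  # UUID issued by scraping.py
#       scrape_result={
#           "title": "Example Book",
#           "author": "Jane Doe",
#           "description": "A sample description.",
#           "book_url": "https://example.com/book/123",
#           "chapters": [
#               {"num": 1, "url": "https://example.com/book/123/chapter-1"},
#               {"num": 2, "url": "https://example.com/book/123/chapter-2"},
#           ],
#       },
#   )
#   result = controller.start()  # Celery GroupResult for the whole book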