You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/download_controller.py

73 lines
2.4 KiB

# scraper/download_controller.py
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from logbus.publisher import log
import os
class DownloadController:
    """Coordinates parallel chapter pipelines, with optional volume splitting.

    Reads configuration from the environment:
      - BOOKSCRAPER_OUTPUT_DIR: root output directory (default: "output")
      - MAX_VOL_SIZE: chapters per volume (default: 200)

    Side effects: creates the book's base directory on construction and
    volume subdirectories on demand.
    """

    def __init__(self, scrape_result: dict):
        """Prepare output directories and shared chapter metadata.

        Args:
            scrape_result: dict with optional keys "title", "author",
                "description", and "chapters" (list of {"num", "url"} dicts).
        """
        self.scrape_result = scrape_result
        self.title = scrape_result.get("title", "UnknownBook")
        self.chapters = scrape_result.get("chapters", [])
        # Base output dir from .env
        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        # Volume size (chapters per volume directory)
        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
        # Base directory for the whole book
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)
        # Constant metadata shared by all chapter pipelines.
        # NOTE: use self.title (not scrape_result.get("title")) so the
        # metadata title matches the "UnknownBook" fallback used for the
        # output directory instead of silently becoming None.
        self.meta = {
            "title": self.title,
            "author": scrape_result.get("author"),
            "description": scrape_result.get("description"),
        }

    def get_volume_path(self, chapter_num: int) -> str:
        """Return (and create if needed) the volume directory for a chapter.

        Chapters 1..max_vol go to Volume_001, the next max_vol to
        Volume_002, and so on (1-based, inclusive boundaries).
        """
        vol_index = (chapter_num - 1) // self.max_vol + 1
        vol_name = f"Volume_{vol_index:03d}"
        vol_path = os.path.join(self.book_base, vol_name)
        os.makedirs(vol_path, exist_ok=True)
        return vol_path

    def start(self):
        """Build one pipeline per chapter and launch them all in parallel.

        Returns:
            The Celery GroupResult from apply_async(), so callers can
            track or join the running pipelines.
        """
        log(f"[CTRL] Starting download pipeline for {self.title}")
        log(f"[CTRL] Chapters: {len(self.chapters)}")
        log(f"[CTRL] Output root: {self.book_base}")
        log(f"[CTRL] MAX_VOL_SIZE = {self.max_vol}")
        tasks = []
        for ch in self.chapters:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            # compute volume directory
            vol_path = self.get_volume_path(chapter_num)
            # build the pipeline for this chapter
            tasks.append(
                build_chapter_pipeline(
                    chapter_num,
                    chapter_url,
                    vol_path,  # correct volume path for this chapter
                    self.meta,  # pass metadata once
                )
            )
        # parallel processing
        job_group = group(tasks)
        async_result = job_group.apply_async()
        log("[CTRL] Pipelines launched.")
        return async_result