kmftools/bookscraper/scraper/tasks/controller_tasks.py


# ============================================================
# File: scraper/tasks/controller_tasks.py
# Purpose:
# Start the download → parse → save pipeline for a scraped book,
# including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS.
# ============================================================
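# NOTE (reading of the rule above, stated explicitly as an assumption): the
# downstream download/parse/save tasks are expected NOT to touch
# scraper.progress themselves; every set_total / inc_* call happens in
# launch_downloads below.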
from celery_app import celery_app
from logbus.publisher import log
from scraper.download_controller import DownloadController
from scraper.progress import (
    set_total,
    inc_completed,
    inc_skipped,
    inc_failed,
)
from scraper.abort import abort_requested

print(">>> [IMPORT] controller_tasks.py loaded")


@celery_app.task(bind=True, queue="controller", ignore_result=False)
def launch_downloads(self, book_id: str, scrape_result: dict):
"""
Launch the entire pipeline (download → parse → save),
AND maintain progress counters.
EXPECTS:
book_id: ID generated in scraping.start_scrape_book
scrape_result: dict with title, author, url, chapters[]
"""
    title = scrape_result.get("title", "UnknownBook")
    chapters = scrape_result.get("chapters", []) or []
    total = len(chapters)

    log(f"[CTRL] Book '{title}': {total} chapters (book_id={book_id})")
    # ------------------------------------------------------------
    # INIT PROGRESS
    # ------------------------------------------------------------
    set_total(book_id, total)
    log(f"[CTRL] Progress initialized for {book_id}: total={total}")
    # ------------------------------------------------------------
    # BUILD CONTROLLER
    # ------------------------------------------------------------
    ctl = DownloadController(book_id, scrape_result)
    # ------------------------------------------------------------
    # RUN PIPELINE IN SYNC LOOP
    # (DownloadController.start() returns a per-chapter generator)
    # ------------------------------------------------------------
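    # Each yielded result is read only via .get("chapter"), .get("skipped") and
    # .get("failed") below, i.e. it is assumed to look roughly like:
    #   {"chapter": <chapter ref>, "skipped": <bool>, "failed": <bool>}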
    try:
        for result in ctl.start():  # new generator mode
            ch = result.get("chapter")

            if result.get("skipped"):
                inc_skipped(book_id)
                inc_completed(book_id)
                log(f"[CTRL] SKIPPED chapter {ch}")
                continue

            if result.get("failed"):
                inc_failed(book_id)
                inc_completed(book_id)
                log(f"[CTRL] FAILED chapter {ch}")
                continue

            # Normal success
            inc_completed(book_id)
            log(f"[CTRL] DONE chapter {ch}")

            # Abort requested mid-run?
            if abort_requested(book_id):
                log(f"[CTRL] ABORT after chapter {ch}")
                break
    except Exception as exc:
        log(f"[CTRL] ERROR while processing pipeline: {exc}")
        inc_failed(book_id)
        raise
    # ------------------------------------------------------------
    # FINISHED
    # ------------------------------------------------------------
    log(f"[CTRL] Pipeline finished for book_id={book_id}")

    return {
        "book_id": book_id,
        "total": total,
        "completed": int(total),  # reported as the full total; exact per-chapter counts live in scraper.progress
    }
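

# Usage sketch (illustrative): the scraping task that produced book_id
# (scraping.start_scrape_book per the docstring) would hand off to this
# controller with the standard Celery call, which routes to the
# "controller" queue configured on the task decorator:
#
#   launch_downloads.delay(book_id, scrape_result)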