# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose: Construct Celery chains for chapter processing.
#
# Pipeline:
#   download_chapter(book_id, chapter_num, url, base_path)
#     → parse_chapter(download_result, meta)
#     → save_chapter(parsed_result, base_path)
#
# Abort behavior:
#   - download_chapter uses book_id to decide skip vs execute
#   - parse/save always run once download has started
# =========================================================
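
# Abort sketch (hypothetical; the real check lives in download_chapter,
# not in this module — the helper name below is an assumption, shown only
# to make the header's "skip vs execute" note concrete):
#
#   def download_chapter(book_id, chapter_num, url, base_path):
#       if abort_requested(book_id):   # e.g. a flag keyed by book_id
#           return {"skipped": True, "book_id": book_id}
#       ...                            # otherwise download as normal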
"""
|
|
Build the pipeline for a single chapter:
|
|
download → parse → save
|
|
|
|
This module must NOT import scraping.py or controllers,
|
|
otherwise Celery will hit circular imports on worker startup.
|
|
Only import task functions here.
|
|
"""
|
|
|
|

from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter


def build_chapter_pipeline(
    book_id: str,
    chapter_number: int,
    chapter_url: str,
    base_path: str,
    meta: dict,
):
    """
    Construct a Celery chain for one chapter:

        1. download_chapter(book_id, chapter_number, chapter_url, base_path)
        2. parse_chapter(download_result, meta)
        3. save_chapter(parsed_result, base_path)
    """
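    # In a Celery chain, each partial signature (.s) receives the previous
    # task's return value prepended to its own arguments, so parse_chapter
    # runs as parse_chapter(download_result, meta) and save_chapter as
    # save_chapter(parsed_result, base_path).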
    return chain(
        # download_chapter needs: book_id, chapter_num, url, base_path
        download_chapter.s(book_id, chapter_number, chapter_url, base_path),
        # parse_chapter gets output of download_chapter + meta
        parse_chapter.s(meta),
        # save_chapter gets parsed result + base_path
        save_chapter.s(base_path),
    )
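

# Usage sketch (illustrative; the argument values below are hypothetical
# and not part of this module):
#
#   pipeline = build_chapter_pipeline(
#       book_id="book-42",
#       chapter_number=3,
#       chapter_url="https://example.com/book-42/chapter-3",
#       base_path="/data/books/book-42",
#       meta={"book_title": "Example Book"},
#   )
#   async_result = pipeline.apply_async()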