kmftools/bookscraper/scraper/tasks/pipeline.py

# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose:
# Build Celery chains for chapter processing using payload dict.
#
# Pipeline v3:
# download_chapter(payload)
# → parse_chapter(payload)
# → save_chapter(payload)
#
# NOTE:
# - book_idx is the single authoritative key for all tasks
# - payload travels unchanged through the entire pipeline
# =========================================================
from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
from scraper.logger_decorators import logcall


@logcall
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
    """
    Build the payload dict for one chapter and return the Celery chain
    that carries it through download → parse → save.

    Consistent with the chapter_dict-based task signatures.
    """
    payload = {
        "book_idx": book_idx,
        "chapter": chapter_dict,
        "book_meta": book_meta,
        # Will be filled by download_chapter
        "html": None,
        # Will be filled by parse_chapter
        "parsed": None,
        # Set by download or parse on skip/404/etc.
        "skipped": False,
        # Final path written by save_chapter
        "path": None,
    }
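    # In a Celery chain, each task's return value is passed as the first
    # positional argument to the next task, so parse_chapter and save_chapter
    # receive the payload dict produced by the step before them.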
    return chain(
        download_chapter.s(payload),
        parse_chapter.s(),
        save_chapter.s(),
    )
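
A minimal usage sketch. The chapter dict and book_meta values below are illustrative
placeholders, not the real structures produced elsewhere in bookscraper; the calls on
the returned chain are standard Celery, not part of this file:

    # Hypothetical caller with example values only; real keys come from the scraper's own data.
    chapter = {"idx": "0001", "title": "Chapter 1", "url": "https://example.com/ch/1"}
    book_meta = {"title": "Example Book", "author": "Unknown"}

    pipeline = build_chapter_pipeline("book-42", chapter, book_meta)
    result = pipeline.apply_async()          # queue download -> parse -> save
    final_payload = result.get(timeout=60)   # save_chapter's payload, including "path"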