# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose:
#   Build Celery chains for chapter processing using a payload dict.
#
# Pipeline v3:
#   download_chapter(payload)
#     → parse_chapter(payload)
#       → save_chapter(payload)
#
# NOTE:
#   - book_idx is the single authoritative key for all tasks
#   - the payload keeps the same keys through the entire pipeline;
#     each task only fills in the values it owns
# =========================================================

from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
from scraper.logger_decorators import logcall


@logcall
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
    """
    Build the payload dict and return a Celery chain that passes it
    through download → parse → save.
    Consistent with the chapter_dict-based task signature.
    """
    payload = {
        "book_idx": book_idx,
        "chapter": chapter_dict,
        "book_meta": book_meta,

        # Will be filled by download_chapter
        "html": None,

        # Will be filled by parse_chapter
        "parsed": None,

        # Set by download or parse on skip/404/etc.
        "skipped": False,

        # Final path written by save_chapter
        "path": None,
    }

    return chain(
        download_chapter.s(payload),
        parse_chapter.s(),
        save_chapter.s(),
    )
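

# Usage sketch (assumption, not part of the original module): dispatching the
# chain requires a configured Celery app and a running broker/worker; the
# chapter_dict and book_meta shapes below are illustrative only.
#
#   pipeline = build_chapter_pipeline(
#       book_idx="bk_0001",
#       chapter_dict={"index": 3, "url": "https://example.com/ch/3"},
#       book_meta={"title": "Example Book"},
#   )
#   result = pipeline.apply_async()   # runs download → parse → save
#   final_payload = result.get()      # payload returned by save_chapter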