# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose: Construct Celery chains for chapter processing.
#
# Pipeline:
#     download_chapter(book_id, chapter_num, url, base_path)
#       → parse_chapter(download_result, meta)
#       → save_chapter(parsed_result, base_path)
#
# Abort behavior:
#   - download_chapter uses book_id to decide whether to skip or execute
#   - parse/save always run once the download has started
# =========================================================
"""
Build the pipeline for a single chapter:

    download → parse → save

This module must NOT import scraping.py or controllers;
otherwise Celery hits circular imports on worker startup.
Only import task functions here.
"""

from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter


def build_chapter_pipeline(
    book_id: str,
    chapter_number: int,
    chapter_url: str,
    base_path: str,
    meta: dict,
):
    """
    Construct a Celery chain for one chapter:

    1. download_chapter(book_id, chapter_number, chapter_url, base_path)
    2. parse_chapter(download_result, meta)
    3. save_chapter(parsed_result, base_path)
    """
    return chain(
        # download_chapter needs: book_id, chapter_num, url, base_path
        download_chapter.s(book_id, chapter_number, chapter_url, base_path),
        # parse_chapter receives download_chapter's output, plus meta
        parse_chapter.s(meta),
        # save_chapter receives the parsed result, plus base_path
        save_chapter.s(base_path),
    )
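
# ---------------------------------------------------------
# Usage sketch (illustrative only; the book id, URL, paths,
# and meta values below are hypothetical, not taken from
# this repo). Building the chain does not enqueue anything;
# calling apply_async() on the resulting signature does:
#
#     sig = build_chapter_pipeline(
#         book_id="book-42",
#         chapter_number=3,
#         chapter_url="https://example.com/book-42/chapter-3",
#         base_path="/data/books/book-42",
#         meta={"book_title": "Example Book"},
#     )
#     async_result = sig.apply_async()  # returns an AsyncResult
#
# Each task's return value is prepended to the next task's
# arguments, which is why parse_chapter.s(meta) above ends up
# invoked as parse_chapter(download_result, meta).
# ---------------------------------------------------------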