kmftools/bookscraper/scraper/tasks/pipeline.py

# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose: Construct Celery chains for chapter processing.
#
# Pipeline:
# download_chapter(book_id, chapter_num, url, base_path)
# → parse_chapter(download_result, meta)
# → save_chapter(parsed_result, base_path)
#
# Abort behavior:
# - download_chapter uses book_id to decide skip vs execute
# - parse/save always run once download has started
# =========================================================
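# Illustrative sketch only: the skip-vs-execute decision described
# above lives inside download_chapter, not in this module, and
# abort_requested is an assumed helper that does not appear in this
# repo. The check keyed on book_id might look roughly like:
#
#     if abort_requested(book_id):
#         # Return a skip marker instead of raising, so the
#         # downstream parse/save tasks still run, as stated above.
#         return {"book_id": book_id, "chapter": chapter_num, "skipped": True}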
"""
Build the pipeline for a single chapter:
download → parse → save
This module must NOT import scraping.py or controllers,
otherwise Celery will hit circular imports on worker startup.
Only import task functions here.
"""
from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter


def build_chapter_pipeline(
    book_id: str,
    chapter_number: int,
    chapter_url: str,
    base_path: str,
    meta: dict,
):
    """
    Construct a Celery chain for one chapter:
      1. download_chapter(book_id, chapter_number, chapter_url, base_path)
      2. parse_chapter(download_result, meta)
      3. save_chapter(parsed_result, base_path)
    """
    return chain(
        # download_chapter needs: book_id, chapter_num, url, base_path
        download_chapter.s(book_id, chapter_number, chapter_url, base_path),
        # parse_chapter gets the output of download_chapter, plus meta
        parse_chapter.s(meta),
        # save_chapter gets the parsed result, plus base_path
        save_chapter.s(base_path),
    )
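

# =========================================================
# Usage (hypothetical sketch, kept commented out so this
# module stays import-only; the argument values and the meta
# schema are assumptions, not taken from this repo):
#
#     pipeline = build_chapter_pipeline(
#         book_id="abc123",
#         chapter_number=7,
#         chapter_url="https://example.com/book/abc123/chapter-7",
#         base_path="/data/books/abc123",
#         meta={"book_id": "abc123", "title": "Example Book"},
#     )
#     async_result = pipeline.apply_async()
# =========================================================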