# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose: Construct Celery chains for chapter processing.
#
# Pipeline:
#   download_chapter(book_id, chapter_num, url, base_path)
#     → parse_chapter(download_result, meta)
#     → save_chapter(parsed_result, base_path)
#
# Abort behavior:
#   - download_chapter uses book_id to decide skip vs execute
#   - parse/save always run once download has started
# =========================================================
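
# Abort sketch (hypothetical; the real check lives in download_chapter,
# not in this module — the helper name below is an assumption, shown only
# to make the header's "skip vs execute" note concrete):
#
#   def download_chapter(book_id, chapter_num, url, base_path):
#       if abort_requested(book_id):   # e.g. a flag keyed by book_id
#           return {"skipped": True, "book_id": book_id}
#       ...                            # otherwise download as normal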
"""
|
|
Build the pipeline for a single chapter:
|
|
download → parse → save
|
|
|
|
This module must NOT import scraping.py or controllers,
|
|
otherwise Celery will hit circular imports on worker startup.
|
|
Only import task functions here.
|
|
"""
|
|
|
|

from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter


def build_chapter_pipeline(
    book_id: str,
    chapter_number: int,
    chapter_url: str,
    base_path: str,
    meta: dict,
):
    """
    Construct a Celery chain for one chapter:

        1. download_chapter(book_id, chapter_number, chapter_url, base_path)
        2. parse_chapter(download_result, meta)
        3. save_chapter(parsed_result, base_path)
    """
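    # In a Celery chain, each partial signature (.s) receives the previous
    # task's return value prepended to its own arguments, so parse_chapter
    # runs as parse_chapter(download_result, meta) and save_chapter as
    # save_chapter(parsed_result, base_path).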
    return chain(
        # download_chapter needs: book_id, chapter_num, url, base_path
        download_chapter.s(book_id, chapter_number, chapter_url, base_path),
        # parse_chapter gets output of download_chapter + meta
        parse_chapter.s(meta),
        # save_chapter gets parsed result + base_path
        save_chapter.s(base_path),
    )
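

# Usage sketch (illustrative; the argument values below are hypothetical
# and not part of this module):
#
#   pipeline = build_chapter_pipeline(
#       book_id="book-42",
#       chapter_number=3,
#       chapter_url="https://example.com/book-42/chapter-3",
#       base_path="/data/books/book-42",
#       meta={"book_title": "Example Book"},
#   )
#   async_result = pipeline.apply_async()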