kmftools/bookscraper/scraper/tasks/pipeline.py

# =========================================================
# File: scraper/tasks/pipeline.py
# Purpose:
# Build Celery chains for chapter processing using payload dict.
#
# Pipeline v3:
# download_chapter(payload)
# → parse_chapter(payload)
# → save_chapter(payload)
#
# NOTE:
# - book_idx is the single authoritative key for all tasks
# - payload travels unchanged through the entire pipeline
# =========================================================
from celery import chain
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
from scraper.logger_decorators import logcall


@logcall
def build_chapter_pipeline(book_idx: str, chapter_dict: dict, book_meta: dict):
    """
    Build the payload dict for one chapter and return the Celery chain
    that carries it through download → parse → save.

    Consistent with the chapter_dict-based task signatures.
    """
    payload = {
        "book_idx": book_idx,
        "chapter": chapter_dict,
        "book_meta": book_meta,
        # Will be filled by download_chapter
        "html": None,
        # Will be filled by parse_chapter
        "parsed": None,
        # Set by download or parse on skip/404/etc.
        "skipped": False,
        # Final path written by save_chapter
        "path": None,
    }
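    # In a Celery chain, each task's return value is passed as the first
    # positional argument to the next task, so parse_chapter and save_chapter
    # receive the payload dict produced by the step before them.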
    return chain(
        download_chapter.s(payload),
        parse_chapter.s(),
        save_chapter.s(),
    )
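
A minimal usage sketch. The chapter dict and book_meta values below are illustrative
placeholders, not the real structures produced elsewhere in bookscraper; the calls on
the returned chain are standard Celery, not part of this file:

    # Hypothetical caller with example values only; real keys come from the scraper's own data.
    chapter = {"idx": "0001", "title": "Chapter 1", "url": "https://example.com/ch/1"}
    book_meta = {"title": "Example Book", "author": "Unknown"}

    pipeline = build_chapter_pipeline("book-42", chapter, book_meta)
    result = pipeline.apply_async()          # queue download -> parse -> save
    final_payload = result.get(timeout=60)   # save_chapter's payload, including "path"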