You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/pipeline.py

29 lines
883 B

# scraper/tasks/pipeline.py
from celery import chain
from logbus.publisher import log
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
def build_chapter_pipeline(chapter_number: int, chapter_url: str, base_path: str):
    """
    Assemble the Celery chain that handles one chapter.

    Three task signatures are linked in order:
    download_chapter -> parse_chapter -> save_chapter.
    The chain is only constructed and returned here; this function does
    not dispatch it.

    Args:
        chapter_number: Number of the chapter; passed to the download task
            and included in the log line.
        chapter_url: URL handed to the download task.
        base_path: Forwarded to the save task as the ``base_path`` keyword.

    Returns:
        The composed, not-yet-dispatched Celery chain.
    """
    log(f"[PIPELINE] Building chain for chapter {chapter_number}")

    # In a chain, each task's return value is passed as the first
    # positional argument of the next signature, so only the first stage
    # gets explicit positional arguments here.
    # NOTE(review): the original notes say download_chapter returns a dict
    # {chapter, url, html}, and that save_chapter works with
    # chapter_number, text and base_path -- those tasks live in other
    # modules, so confirm the payload shapes there.
    fetch_step = download_chapter.s(chapter_number, chapter_url)
    parse_step = parse_chapter.s()  # fed by the download stage's result
    persist_step = save_chapter.s(base_path=base_path)

    return chain(fetch_step, parse_step, persist_step)