You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
29 lines
883 B
29 lines
883 B
# scraper/tasks/pipeline.py
|
|
|
|
from celery import chain
|
|
from logbus.publisher import log
|
|
|
|
from scraper.tasks.download_tasks import download_chapter
|
|
from scraper.tasks.parse_tasks import parse_chapter
|
|
from scraper.tasks.save_tasks import save_chapter
|
|
|
|
|
|
def build_chapter_pipeline(chapter_number: int, chapter_url: str, base_path: str):
    """Assemble the Celery chain that processes a single chapter.

    The chain has three stages, executed in this order:

        download -> parse -> save

    In a Celery chain, each stage's return value is passed as the first
    positional argument of the next stage's signature.

    Args:
        chapter_number: Chapter number; passed to the download task and
            included in the log message.
        chapter_url: Chapter URL; passed to the download task.
        base_path: Forwarded to the save task as the ``base_path`` keyword.

    Returns:
        The assembled chain. It is returned without being dispatched.
    """
    log(f"[PIPELINE] Building chain for chapter {chapter_number}")

    # NOTE(review): the original notes say download returns a dict
    # {chapter, url, html}, parse consumes that result, and save expects
    # chapter_number, text, base_path. The task signatures are defined in
    # other modules -- confirm each stage's output matches the next
    # stage's first parameter.
    fetch_step = download_chapter.s(chapter_number, chapter_url)
    parse_step = parse_chapter.s()  # fed the download stage's result
    store_step = save_chapter.s(base_path=base_path)  # fed the parse stage's result

    return chain(fetch_step, parse_step, store_step)
|