You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/scraping.py

53 lines
1.5 KiB

# scraper/tasks/scraping.py
from celery_app import celery_app
from logbus.publisher import log
import os
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
# Import-time trace: runs once when this module is first imported, writing a
# marker to stdout so it is visible that the task module was loaded.
print(">>> [IMPORT] scraping.py loaded")
@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str):
    """Scrape metadata + chapter list via BookScraper.execute() and dispatch downloads.

    The scrape result is forwarded to the
    ``scraper.tasks.controller_tasks.launch_downloads`` task on the
    ``controller`` queue and is also returned as this task's result.

    Environment variables:
        DRY_RUN: when set to ``"1"``, the chapter list is truncated before
            the downloads are dispatched.
        TEST_LIMIT: maximum number of chapters kept in a dry run
            (default ``5``). Only read when ``DRY_RUN`` is enabled;
            negative values are treated as ``0``.

    Args:
        url: Book URL handed to ``BookScraper``.

    Returns:
        The object returned by ``BookScraper.execute()`` (used here as a
        dict with a ``"chapters"`` list), with ``"chapters"`` truncated
        when running in dry-run mode.

    Raises:
        ValueError: if ``DRY_RUN`` is enabled and ``TEST_LIMIT`` is not an
            integer.
    """
    log(f"[SCRAPING] Start scraping for: {url}")

    site = BookSite()
    scraper = BookScraper(site, url)

    # ----------------------------------------
    # NEW API (old: scraper.parse_book_info())
    # ----------------------------------------
    result = scraper.execute()
    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ----------------------------------------
    # DRY RUN logic
    # ----------------------------------------
    dry_run = os.getenv("DRY_RUN", "0") == "1"
    if dry_run:
        # Parse TEST_LIMIT only when it is actually needed, so a malformed
        # value cannot break a normal (non-dry-run) scrape.
        # Clamp to zero: a negative slice bound would drop chapters from the
        # end of the list instead of capping how many are kept.
        test_limit = max(0, int(os.getenv("TEST_LIMIT", "5")))
        log(f"[SCRAPING] DRY_RUN: limiting chapters to {test_limit}")
        chapters = chapters[:test_limit]
        result["chapters"] = chapters

    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")

    # ----------------------------------------
    # Dispatch download pipelines
    # ----------------------------------------
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[result],
        queue="controller",
    )
    return result