# scraper/tasks/scraping.py
#
from celery_app import celery_app
from logbus.publisher import log
import os

from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
# Kept for an explicit module dependency; the dispatch below goes through
# send_task by name, so this import is not strictly required at call time.
from scraper.tasks.controller_tasks import launch_downloads

# Import-time debug marker (useful for confirming the worker picked up this module).
print(">>> [IMPORT] scraping.py loaded")


@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str):
    """Scrape book metadata and the chapter list, then hand off to the controller."""
    log(f"[SCRAPING] Start scraping for: {url}")

    site = BookSite()
    scraper = BookScraper(site, url)
    scraper.parse_book_info()

    chapters = scraper.get_chapter_list()
    full_count = len(chapters)

    # Env-driven test switches: DRY_RUN=1 caps the chapter list at TEST_LIMIT.
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))

    if DRY_RUN:
        log(f"[SCRAPING] DRY_RUN: limiting chapters to first {TEST_LIMIT}")
        chapters = chapters[:TEST_LIMIT]

    # ---------------------------------------------------
    # FIX: add book_url so parse_chapter has the real url
    # ---------------------------------------------------
    result = {
        "title": scraper.book_title,
        "author": scraper.book_author,
        "description": scraper.book_description,
        "cover": scraper.cover_url,
        "book_url": url,
        "chapters": [
            {"num": ch.number, "title": ch.title, "url": ch.url}
            for ch in chapters
        ],
    }

    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")

    # Dispatch by task name so the controller worker handles the downloads
    # on its own queue.
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[result],
        queue="controller",
    )

    return result
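

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): how a producer might enqueue
# this task. The example URL and the direct __main__ invocation are
# illustrative assumptions; in the real pipeline the task is presumably
# dispatched by an upstream service. Requires a running broker configured in
# celery_app.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # apply_async with queue="scraping" matches the queue= declaration on the
    # task decorator above.
    async_result = start_scrape_book.apply_async(
        args=["https://example.com/book/123"],  # hypothetical book URL
        queue="scraping",
    )
    print(f"queued start_scrape_book: {async_result.id}")
    # Because the task is declared with ignore_result=False, the returned
    # metadata dict can be fetched (blocks until the task finishes):
    # print(async_result.get(timeout=600))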