# ============================================================
# File: scraper/tasks/scraping.py
# Purpose: Scrape metadata + chapter list and initialise
#          Redis progress tracking + launch download controller
# ============================================================

import os

import redis

from celery_app import celery_app
from logbus.publisher import log
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort  # no circular deps
from scraper.ui_log import reset_ui_logs  # <-- NEW IMPORT

print(">>> [IMPORT] scraping.py loaded")

# Redis connection (same as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)


@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str):
    """Scrape metadata + chapters and prepare download tracking."""

    # ------------------------------------------------------------
    # NEW: clear UI log buffer at start of new run
    # ------------------------------------------------------------
    reset_ui_logs()

    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # Book scrape
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # returns dict with metadata + chapters

    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ------------------------------------------------------------
    # DRY RUN
    # ------------------------------------------------------------
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))

    if DRY_RUN:
        log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
        chapters = chapters[:TEST_LIMIT]
        result["chapters"] = chapters

    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")

    # ------------------------------------------------------------
    # BOOK RUN ID (using title as ID)
    # ------------------------------------------------------------
    title = result.get("title") or "UnknownBook"
    book_id = title  # user requirement
    result["book_id"] = book_id

    log(f"[SCRAPING] Assigned book_id = '{book_id}'")

    # ------------------------------------------------------------
    # RESET ABORT + INITIALISE PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_id)

    r.set(f"progress:{book_id}:total", len(chapters))
    r.set(f"progress:{book_id}:done", 0)

    r.delete(f"logs:{book_id}")  # clear old logs if any
    r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
    r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")

    # ------------------------------------------------------------
    # DISPATCH DOWNLOAD CONTROLLER
    # ------------------------------------------------------------
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[book_id, result],
        queue="controller",
    )

    log(f"[SCRAPING] Dispatched download controller for '{book_id}'")

    return {
        "book_id": book_id,
        "title": result.get("title"),
        "author": result.get("author"),
        "chapters": len(chapters),
    }
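

# ------------------------------------------------------------
# Usage sketch (illustrative only, kept commented out so it has
# no effect at import time): one way a caller, e.g. a web
# endpoint, might enqueue this task. The URL below is a
# hypothetical placeholder, not taken from this project.
# ------------------------------------------------------------
# from scraper.tasks.scraping import start_scrape_book
#
# async_result = start_scrape_book.apply_async(
#     args=["https://example.com/book/123"],  # hypothetical book URL
#     queue="scraping",
# )
# print(async_result.id)  # Celery task id, useful for polling status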