# ============================================================
# File: scraper/tasks/scraping.py
# Purpose:
#   Scrape ONLY metadata + chapter list.
#   Does NOT launch download controller anymore.
#   Controller decides when pipelines start.
# ============================================================

from celery_app import celery_app
from logbus.publisher import log

import os
import redis

from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort
from scraper.ui_log import reset_ui_logs
from scraper.services.init_service import InitService

print(">>> [IMPORT] scraping.py loaded")

# Redis connection (same DB as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)


@celery_app.task(
    bind=True,
    queue="scraping",
    ignore_result=False,
    name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str):
    """
    Scrapes metadata + chapters.

    DOES NOT START download / pipeline controller.
    The controller_tasks.start_full_scrape() task will call this one.
    """

    # ------------------------------------------------------------
    # CLEAR UI LOG BUFFER
    # ------------------------------------------------------------
    reset_ui_logs()

    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # SCRAPE (old engine)
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # → { title, author, chapters, cover_url, ... }

    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ------------------------------------------------------------
    # Compute unified book_idx
    # ------------------------------------------------------------
    book_idx = InitService.derive_book_id(url)
    result["book_idx"] = book_idx

    log(f"[SCRAPING] Assigned book_idx = {book_idx}")

    # ------------------------------------------------------------
    # DRY RUN TEST LIMIT
    # ------------------------------------------------------------
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))

    if DRY_RUN:
        log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}")
        result["chapters"] = chapters[:TEST_LIMIT]

    # ------------------------------------------------------------
    # LOG RESULTS
    # ------------------------------------------------------------
    log(
        f"[SCRAPING] Completed scrape: "
        f"{len(result['chapters'])}/{full_count} chapters"
    )

    # ------------------------------------------------------------
    # RESET ABORT + INITIALIZE LEGACY PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_idx)

    r.set(f"progress:{book_idx}:total", len(result["chapters"]))
    r.set(f"progress:{book_idx}:done", 0)

    r.delete(f"logs:{book_idx}")
    r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}")
    r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")

    # ------------------------------------------------------------
    # IMPORTANT: DO NOT DISPATCH any pipelines here
    # Controller will receive scrape_result and continue.
    # ------------------------------------------------------------
    return result
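
# ------------------------------------------------------------
# Usage sketch (illustrative only, not part of this module):
# per the docstring above, controller_tasks.start_full_scrape()
# is expected to invoke this task and consume its returned
# result dict. A minimal Celery chain could look like the
# lines below; `handle_scrape_result` is a hypothetical
# placeholder for the controller's follow-up step, not an
# actual task defined in this codebase.
#
#   from celery import chain
#   from scraper.tasks.scraping import start_scrape_book
#
#   chain(
#       start_scrape_book.s(url),
#       handle_scrape_result.s(),  # hypothetical controller step
#   ).apply_async()
# ------------------------------------------------------------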