# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
#   Final status check after audio completion.
#
# Responsibilities:
#   - Verify Redis counters (sanity check)
#   - Verify filesystem (Audio files present)
#   - Queue m4btool task
#
# Design rules:
#   - Book-scope ONLY
#   - No direct Redis usage
#   - Repository is the single source of truth
#   - Idempotent, defensive, non-blocking
# ============================================================

import os

from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall

from db.repository import (
    get_audio_done,
    get_chapters_total,
    set_status,
    fetch_book,
)

from scraper.tasks.m4b_tasks import run_m4btool


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
# NOTE: this helper was previously decorated with ``@log``. ``log`` is the
# message publisher (called as ``log("...")`` throughout this module), not a
# decorator; ``@logcall`` is the decorator used by the other functions here.
@logcall
def _detect_volumes(book_base: str) -> list:
    """
    Return the sorted names of the ``Volume_XXX`` directories in *book_base*.

    An entry qualifies when its name starts with ``volume_``
    (case-insensitive) and it is a directory. Only the entry names are
    returned, not full paths.

    Raises:
        OSError: propagated from ``os.listdir`` when *book_base* cannot be
            listed (the caller checks ``os.path.isdir`` beforehand).
    """
    return sorted(
        name
        for name in os.listdir(book_base)
        if name.lower().startswith("volume_")
        and os.path.isdir(os.path.join(book_base, name))
    )


@logcall
def _count_audio_files(audio_dir: str) -> int:
    """
    Count the ``.m4b`` files (case-insensitive suffix) in *audio_dir*.

    Returns 0 when *audio_dir* is not an existing directory.
    """
    if not os.path.isdir(audio_dir):
        return 0

    return sum(
        1 for entry in os.listdir(audio_dir) if entry.lower().endswith(".m4b")
    )


# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
    """
    Final statuscheck before m4btool execution.

    Steps:
        1. Read the ``audio_done`` / ``chapters_total`` counters through the
           repository and abort when audio is not complete.
        2. Fetch the book record and resolve its output directory
           (``$BOOKSCRAPER_OUTPUT_DIR/<title>``, default root ``output``).
        3. Log the number of ``.m4b`` files per volume (informational only;
           the counts never block the next step).
        4. Set the book status to ``m4b_running`` and queue ``run_m4btool``.

    Every abort path logs the reason and returns ``None``.

    Per the original author's note, this task is meant to be triggered
    exactly once by the audio_completion quickcheck.

    Args:
        book_idx: Identifier of the book, passed to the repository helpers
            and forwarded to ``run_m4btool``.
    """
    log(f"[STATUSCHECK] START book={book_idx}")

    # --------------------------------------------------------
    # 1. Redis sanity check (via repository)
    # --------------------------------------------------------
    # Defensive: a missing counter (None) is treated as 0 so that the
    # comparisons below abort cleanly instead of raising TypeError.
    audio_done = get_audio_done(book_idx) or 0
    chapters_total = get_chapters_total(book_idx) or 0

    log(
        f"[STATUSCHECK] Counters book={book_idx} "
        f"audio_done={audio_done} chapters_total={chapters_total}"
    )

    if chapters_total <= 0:
        log("[STATUSCHECK] No chapters_total → abort")
        return

    if audio_done < chapters_total:
        # Defensive: should not happen, but never assume
        log(
            f"[STATUSCHECK] Audio not complete yet "
            f"({audio_done}/{chapters_total}) → abort"
        )
        return

    # --------------------------------------------------------
    # 2. Fetch book metadata (for paths & m4b meta)
    # --------------------------------------------------------
    book = fetch_book(book_idx)
    if not book:
        log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
        return

    # Fall back to the book index / "Unknown" when metadata is empty.
    title = book.get("title") or book_idx
    author = book.get("author") or "Unknown"

    # Base output directory
    root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(root, title)

    if not os.path.isdir(book_base):
        log(f"[STATUSCHECK] Book directory missing: {book_base}")
        return

    # --------------------------------------------------------
    # 3. Filesystem validation (light, non-blocking)
    # --------------------------------------------------------
    volumes = _detect_volumes(book_base)

    if not volumes:
        log(f"[STATUSCHECK] No volumes found for {book_idx}")
        # Still allow m4btool to decide (it will no-op)
    else:
        for vol in volumes:
            audio_dir = os.path.join(book_base, vol, "Audio")
            count = _count_audio_files(audio_dir)
            log(f"[STATUSCHECK] {vol}: " f"{count} audio files detected")

    # --------------------------------------------------------
    # 4. Queue m4btool (final pipeline step)
    # --------------------------------------------------------
    log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")

    # Status is written before the task is queued.
    set_status(book_idx, "m4b_running")

    run_m4btool.delay(
        book_idx=book_idx,
        book_base=book_base,
        meta={
            "title": title,
            "author": author,
        },
    )

    log(f"[STATUSCHECK] DONE book={book_idx}")