# kmftools/bookscraper/scraper/tasks/statuscheck.py

# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
#   Final status check after audio completion.
#
# Responsibilities:
#   - Verify Redis counters (sanity check)
#   - Verify filesystem (Audio files present)
#   - Queue m4btool task
#
# Design rules:
#   - Book-scope ONLY
#   - No direct Redis usage
#   - Repository is the single source of truth
#   - Idempotent, defensive, non-blocking
# ============================================================

import os
from celery_app import celery_app
from logbus.publisher import log

from scraper.logger_decorators import logcall

from db.repository import (
    get_audio_done,
    get_chapters_total,
    set_status,
    fetch_book,
)

from scraper.tasks.m4b_tasks import run_m4btool


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
# NOTE: decorated with @logcall (like the other functions in this module).
# `log` is used throughout this file as a message-publishing call taking a
# string, so applying it as a decorator would hand it the function object and
# rebind `_detect_volumes` to whatever `log` returns.
@logcall
def _detect_volumes(book_base: str) -> list:
    """
    Return the sorted names of the volume directories inside a book directory.

    An entry qualifies when its name starts with "volume_" (compared
    case-insensitively) and it is a directory.

    Args:
        book_base: Path of the book's base output directory.

    Returns:
        Directory names (not full paths), sorted with the default string
        ordering. Empty list when nothing matches.

    Raises:
        OSError: Propagated from os.listdir when book_base cannot be listed
            (e.g. it does not exist or is not a directory).
    """
    return sorted(
        name
        for name in os.listdir(book_base)
        if name.lower().startswith("volume_")
        and os.path.isdir(os.path.join(book_base, name))
    )


@logcall
def _count_audio_files(audio_dir: str) -> int:
    """
    Return how many entries in audio_dir have a name ending in ".m4b".

    The suffix comparison is case-insensitive. A path that is not an
    existing directory yields 0 instead of raising.
    """
    if os.path.isdir(audio_dir):
        return sum(
            1 for entry in os.listdir(audio_dir) if entry.lower().endswith(".m4b")
        )
    return 0


# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
    """
    Run the final status check for a book and hand it over to m4btool.

    Steps, in order:
      1. Read the audio_done / chapters_total counters through the
         repository and abort (log + return) when they do not indicate
         completed audio.
      2. Load the book record, derive title/author and the book's output
         directory; abort when the record or the directory is missing.
      3. Log the number of .m4b files per volume directory. This step is
         informational only and never aborts.
      4. Set the book status to "m4b_running" and enqueue run_m4btool.

    Per the original author's note, this task is meant to be triggered
    once by the audio_completion quickcheck; nothing in this function
    enforces that.

    Args:
        self: Task instance supplied because of bind=True; not used here.
        book_idx: Identifier of the book to check.

    Returns:
        None on every path.
    """
    log(f"[STATUSCHECK] START book={book_idx}")

    # -- 1. Counter sanity check (repository only, no direct Redis) ------
    done = get_audio_done(book_idx)
    total = get_chapters_total(book_idx)
    log(
        f"[STATUSCHECK] Counters book={book_idx} audio_done={done} chapters_total={total}"
    )

    if total <= 0:
        log("[STATUSCHECK] No chapters_total → abort")
        return

    if done < total:
        # Not expected at this point in the pipeline, but checked anyway.
        log(f"[STATUSCHECK] Audio not complete yet ({done}/{total}) → abort")
        return

    # -- 2. Book metadata and output location -----------------------------
    record = fetch_book(book_idx)
    if not record:
        log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
        return

    # Missing/empty fields fall back to the book id and "Unknown".
    title = record.get("title") or book_idx
    author = record.get("author") or "Unknown"

    output_root = os.environ.get("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(output_root, title)
    if not os.path.isdir(book_base):
        log(f"[STATUSCHECK] Book directory missing: {book_base}")
        return

    # -- 3. Filesystem report (informational, never blocks) ----------------
    volume_names = _detect_volumes(book_base)
    if not volume_names:
        # m4btool is still queued below; the original note expects it to
        # no-op in this case (not verifiable from this file).
        log(f"[STATUSCHECK] No volumes found for {book_idx}")

    for volume in volume_names:
        n_files = _count_audio_files(os.path.join(book_base, volume, "Audio"))
        log(f"[STATUSCHECK] {volume}: {n_files} audio files detected")

    # -- 4. Hand over to m4btool -------------------------------------------
    m4b_meta = {"title": title, "author": author}

    log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")

    # Status is written before the task is enqueued (same order as before).
    set_status(book_idx, "m4b_running")
    run_m4btool.delay(book_idx=book_idx, book_base=book_base, meta=m4b_meta)

    log(f"[STATUSCHECK] DONE book={book_idx}")