You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/statuscheck.py

150 lines
4.3 KiB

# ============================================================
# File: scraper/tasks/statuscheck.py
# Purpose:
# Final status check after audio completion.
#
# Responsibilities:
# - Verify Redis counters (sanity check)
# - Verify filesystem (Audio files present)
# - Queue m4btool task
#
# Design rules:
# - Book-scope ONLY
# - No direct Redis usage
# - Repository is the single source of truth
# - Idempotent, defensive, non-blocking
# ============================================================
import os
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import (
get_audio_done,
get_chapters_total,
set_status,
fetch_book,
)
from scraper.tasks.m4b_tasks import run_m4btool
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
@logcall
def _detect_volumes(book_base: str) -> list:
    """
    Return the sorted names of the Volume_XXX directories of a book.

    Args:
        book_base: Path of the book's output directory. The directory must
            exist; os.listdir raises OSError otherwise.

    Returns:
        The entry names (not full paths) that start with "volume_"
        (case-insensitive) and are directories, in default string order.
    """
    # Decorated with @logcall like the other functions in this module:
    # `log` is the message publisher (called as log("...")), not a decorator.
    return sorted(
        name
        for name in os.listdir(book_base)
        if name.lower().startswith("volume_")
        and os.path.isdir(os.path.join(book_base, name))
    )
@logcall
def _count_audio_files(audio_dir: str) -> int:
    """
    Return how many .m4b files sit directly inside ``audio_dir``.

    The extension is matched case-insensitively. A path that is not an
    existing directory yields 0.
    """
    if os.path.isdir(audio_dir):
        return sum(
            1
            for entry in os.listdir(audio_dir)
            if entry.lower().endswith(".m4b")
        )
    return 0
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="controller", ignore_result=True)
@logcall
def run_statuscheck(self, book_idx: str):
    """
    Run the last sanity pass for a book and hand it over to m4btool.

    Steps:
      1. Compare the audio_done / chapters_total counters read through the
         repository.
      2. Load the book record and resolve its output directory below
         BOOKSCRAPER_OUTPUT_DIR (default "output").
      3. Log how many audio files each volume holds (informational only).
      4. Set the status to "m4b_running" and queue run_m4btool.

    Any failed precondition is logged and ends the task early without
    queueing anything. Intended to be triggered once by the audio_completion
    quickcheck; that is not enforced here.
    """
    log(f"[STATUSCHECK] START book={book_idx}")

    # --------------------------------------------------------
    # 1. Counter sanity check (through the repository layer)
    # --------------------------------------------------------
    done = get_audio_done(book_idx)
    total = get_chapters_total(book_idx)
    counters_msg = (
        f"[STATUSCHECK] Counters book={book_idx} "
        f"audio_done={done} chapters_total={total}"
    )
    log(counters_msg)

    if total <= 0:
        log("[STATUSCHECK] No chapters_total → abort")
        return

    if done < total:
        # Not expected at this point, but checked rather than assumed.
        log(f"[STATUSCHECK] Audio not complete yet ({done}/{total}) → abort")
        return

    # --------------------------------------------------------
    # 2. Book metadata (paths and m4b tags)
    # --------------------------------------------------------
    record = fetch_book(book_idx)
    if not record:
        log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
        return

    # Fall back to the book index / "Unknown" when a field is missing or empty.
    title = record.get("title") or book_idx
    author = record.get("author") or "Unknown"
    m4b_meta = {"title": title, "author": author}

    output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(output_root, title)
    if not os.path.isdir(book_base):
        log(f"[STATUSCHECK] Book directory missing: {book_base}")
        return

    # --------------------------------------------------------
    # 3. Filesystem overview (logged only, never blocks the hand-over)
    # --------------------------------------------------------
    volume_names = _detect_volumes(book_base)
    if volume_names:
        for volume in volume_names:
            found = _count_audio_files(os.path.join(book_base, volume, "Audio"))
            log(f"[STATUSCHECK] {volume}: {found} audio files detected")
    else:
        # m4btool is still queued below and left to decide what to do.
        log(f"[STATUSCHECK] No volumes found for {book_idx}")

    # --------------------------------------------------------
    # 4. Hand over to m4btool (final pipeline step)
    # --------------------------------------------------------
    log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")
    set_status(book_idx, "m4b_running")
    run_m4btool.delay(book_idx=book_idx, book_base=book_base, meta=m4b_meta)
    log(f"[STATUSCHECK] DONE book={book_idx}")