You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/m4b_tasks.py

133 lines
3.9 KiB

# ============================================================
# File: scraper/tasks/m4b_tasks.py
# ============================================================
import os
import subprocess
from typing import List
from celery_app import celery_app
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.repository import fetch_book, store_m4b_error
from scraper.scriptgen import build_merge_block
# ------------------------------------------------------------
# Helper: detect volumes (UNCHANGED)
# ------------------------------------------------------------
def detect_volumes(book_base: str) -> List[str]:
    """Return the sorted names of the volume directories inside *book_base*.

    An entry counts as a volume when its name starts with ``volume_``
    (compared case-insensitively) and it is a directory. Only the entry
    names are returned, not full paths, in plain string sort order.
    """
    return sorted(
        entry
        for entry in os.listdir(book_base)
        if entry.lower().startswith("volume_")
        and os.path.isdir(os.path.join(book_base, entry))
    )
# ------------------------------------------------------------
# Celery task
# ------------------------------------------------------------
@celery_app.task(bind=True, queue="m4b", ignore_result=True)
@logcall
def run_m4btool(self, book_idx: str):
    """Run the generated merge command for every volume of one book.

    The book is looked up with ``fetch_book``; its directory is
    ``<BOOKSCRAPER_OUTPUT_DIR>/<title>`` (the environment variable defaults
    to ``output``). The block returned by ``build_merge_block`` is split on
    ``&&`` and the resulting commands are paired, in order, with the
    detected volume directories. Each command runs through the shell with
    the book directory as working directory.

    A volume without an ``Audio`` sub-directory is skipped. A failing
    command is logged and recorded via ``store_m4b_error``; processing then
    continues with the next volume.

    Args:
        self: Bound Celery task instance (not used by the body).
        book_idx: Identifier of the book, as accepted by ``fetch_book``.

    Returns:
        None. The task is declared with ``ignore_result=True``.
    """
    log(f"[M4B] START book_idx={book_idx}")

    book = fetch_book(book_idx)
    if not book:
        log(f"[M4B] Book not found in SQL: book_idx={book_idx}")
        return

    # Fall back when the value is missing *or* empty/None: a None title would
    # make os.path.join raise, and an empty title would resolve book_base to
    # the output root itself.
    title = book.get("title") or book_idx
    author = book.get("author") or "Unknown"

    output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
    book_base = os.path.join(output_root, title)
    log(f"[M4B] Book base directory: {book_base}")

    if not os.path.isdir(book_base):
        log(f"[M4B] Book directory missing: {book_base}")
        return

    volumes = detect_volumes(book_base)
    if not volumes:
        log(f"[M4B] No volumes found for book_idx={book_idx}")
        return
    log(f"[M4B] Volumes detected: {volumes}")

    # --------------------------------------------------------
    # Build canonical commands via scriptgen
    # --------------------------------------------------------
    merge_block = build_merge_block(
        title, author, [(i + 1, v) for i, v in enumerate(volumes)]
    )
    commands = [c.strip() for c in merge_block.split("&&") if c.strip()]

    if len(commands) != len(volumes):
        # zip() below stops at the shorter sequence; make that visible in the
        # log instead of silently dropping volumes or commands.
        log(
            f"[M4B] WARNING: {len(commands)} commands generated for "
            f"{len(volumes)} volumes; only the first "
            f"{min(len(commands), len(volumes))} pairs will run"
        )

    for volume, cmd in zip(volumes, commands):
        audio_dir = os.path.join(book_base, volume, "Audio")
        if not os.path.isdir(audio_dir):
            log(f"[M4B] SKIP {volume}: no Audio directory")
            continue

        log(f"[M4B] Running for volume={volume}")
        log(f"[M4B] CMD: {cmd}")

        try:
            # NOTE(review): cmd is a shell string produced by scriptgen from
            # title/author. shell=True relies on scriptgen quoting those
            # values safely - confirm before exposing to untrusted titles.
            result = subprocess.run(
                cmd,
                cwd=book_base,
                shell=True,
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as exc:
            # Non-zero exit status (check=True): keep both streams in the log
            # and persist stderr, falling back to the exception text.
            log(f"[M4B][FAILED] volume={volume}")
            if exc.stdout:
                log(f"[M4B][STDOUT] {exc.stdout}")
            if exc.stderr:
                log(f"[M4B][STDERR] {exc.stderr}")
            store_m4b_error(
                book_idx=book_idx,
                volume=volume,
                error_text=exc.stderr or str(exc),
            )
        except Exception as exc:
            # Deliberate best-effort boundary: one broken volume must not
            # prevent the remaining volumes from being processed.
            log(f"[M4B][UNEXPECTED ERROR] volume={volume}: {exc}")
            store_m4b_error(
                book_idx=book_idx,
                volume=volume,
                error_text=str(exc),
            )
        else:
            if result.stdout:
                log(f"[M4B][STDOUT] {result.stdout}")

    log(f"[M4B] FINISHED book_idx={book_idx}")
# ------------------------------------------------------------
# Orchestration helper (UNCHANGED)
# ------------------------------------------------------------
@logcall
def queue_m4b_for_book(book_idx: str):
    """Enqueue the m4b task for *book_idx* on the ``m4b`` queue.

    The task is sent by its dotted name through ``celery_app.send_task``
    with ``book_idx`` as its only positional argument. Nothing is returned.
    """
    task_name = "scraper.tasks.m4b_tasks.run_m4btool"
    task_args = [book_idx]

    log(f"[M4B] Queuing m4b-tool for book_idx={book_idx}")
    celery_app.send_task(task_name, args=task_args, queue="m4b")