Compare commits
No commits in common. 'main' and 'feature/bookstate-progress-fix' have entirely different histories.
main
...
feature/bo
@ -1,70 +0,0 @@
|
||||
FROM debian:12
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# System + PHP (PHP 8.2 native)
|
||||
# ----------------------------------------------------------
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ffmpeg \
|
||||
curl \
|
||||
ca-certificates \
|
||||
bash \
|
||||
php-cli \
|
||||
php-intl \
|
||||
php-json \
|
||||
php-mbstring \
|
||||
php-xml \
|
||||
php-curl \
|
||||
php-zip \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-venv \
|
||||
\
|
||||
# build deps for mp4v2
|
||||
git \
|
||||
build-essential \
|
||||
autoconf \
|
||||
automake \
|
||||
libtool \
|
||||
pkg-config \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Python venv (PEP 668 compliant)
|
||||
# ----------------------------------------------------------
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:/usr/local/bin:$PATH"
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Build & install mp4v2 (mp4info)
|
||||
# ----------------------------------------------------------
|
||||
WORKDIR /tmp
|
||||
|
||||
RUN git clone https://github.com/sandreas/mp4v2 \
|
||||
&& cd mp4v2 \
|
||||
&& ./configure \
|
||||
&& make -j$(nproc) \
|
||||
&& make install \
|
||||
&& echo "/usr/local/lib" > /etc/ld.so.conf.d/mp4v2.conf \
|
||||
&& ldconfig \
|
||||
&& cd / \
|
||||
&& rm -rf /tmp/mp4v2
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Install m4b-tool
|
||||
# ----------------------------------------------------------
|
||||
RUN curl -L https://github.com/sandreas/m4b-tool/releases/latest/download/m4b-tool.phar \
|
||||
-o /usr/local/bin/m4b-tool \
|
||||
&& chmod +x /usr/local/bin/m4b-tool
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# App
|
||||
# ----------------------------------------------------------
|
||||
WORKDIR /app
|
||||
COPY requirements.txt /app/requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY . /app
|
||||
|
||||
CMD ["bash"]
|
||||
@ -1 +0,0 @@
|
||||
Subproject commit 480a73324f53d0d24bea4931c3902097f8e2a663
|
||||
Binary file not shown.
@ -1,94 +0,0 @@
|
||||
# ============================================================
|
||||
# File: scraper/services/audio_completion.py
|
||||
# Purpose:
|
||||
# Orchestration hook after audio completion.
|
||||
#
|
||||
# Rules (STRICT):
|
||||
# - ALWAYS read via get_book_state()
|
||||
# - Use ONLY merged counters from repository
|
||||
# - NO usage of derived status field
|
||||
# - Completion rule:
|
||||
# audio_completed < chapters_total → NOT DONE
|
||||
# ============================================================
|
||||
|
||||
from logbus.publisher import log
|
||||
from scraper.logger_decorators import logcall
|
||||
|
||||
from db.repository import (
|
||||
get_book_state,
|
||||
try_trigger_statuscheck,
|
||||
)
|
||||
|
||||
from scraper.services.status_check_service import StatusCheckService
|
||||
from scraper.tasks.m4b_tasks import queue_m4b_for_book
|
||||
|
||||
|
||||
@logcall
|
||||
def trigger_audio_completion_check(book_idx: str):
|
||||
"""
|
||||
Called after inc_audio_done() OR inc_audio_skipped().
|
||||
|
||||
Flow:
|
||||
1. Fetch canonical merged state from repository
|
||||
2. Evaluate completion via merged counters ONLY
|
||||
3. Run filesystem validation (authoritative)
|
||||
4. Apply idempotency guard
|
||||
5. Queue m4b exactly once
|
||||
"""
|
||||
|
||||
try:
|
||||
# ----------------------------------------------------
|
||||
# STEP 1 — CANONICAL MERGED STATE
|
||||
# ----------------------------------------------------
|
||||
state = get_book_state(book_idx)
|
||||
|
||||
chapters_total = int(state.get("chapters_total", 0))
|
||||
audio_done = int(state.get("audio_done", 0))
|
||||
audio_skipped = int(state.get("audio_skipped", 0))
|
||||
audio_completed = audio_done + audio_skipped
|
||||
|
||||
log(
|
||||
f"[AUDIO-COMPLETION] book={book_idx} "
|
||||
f"audio_completed={audio_completed} chapters_total={chapters_total}"
|
||||
)
|
||||
|
||||
# ----------------------------------------------------
|
||||
# STEP 2 — FAST REJECT (MERGED COUNTERS ONLY)
|
||||
# ----------------------------------------------------
|
||||
if chapters_total <= 0 or audio_completed < chapters_total:
|
||||
log(f"[AUDIO-COMPLETION] not yet complete for book={book_idx}")
|
||||
return
|
||||
|
||||
# ----------------------------------------------------
|
||||
# STEP 3 — FILESYSTEM VALIDATION (AUTHORITATIVE)
|
||||
# ----------------------------------------------------
|
||||
result = StatusCheckService.run(book_idx)
|
||||
fs = result.get("filesystem", {})
|
||||
|
||||
audio_files = fs.get("audio_files", 0)
|
||||
chapters_txt = fs.get("chapters_txt", 0)
|
||||
effective_audio = audio_files + audio_skipped
|
||||
|
||||
if effective_audio < chapters_txt:
|
||||
log(
|
||||
f"[AUDIO-COMPLETION] FS validation failed "
|
||||
f"(audio_files={audio_files}, skipped={audio_skipped}, txt={chapters_txt})"
|
||||
)
|
||||
return
|
||||
|
||||
# ----------------------------------------------------
|
||||
# STEP 4 — IDEMPOTENCY GUARD (AFTER FS CONFIRMATION)
|
||||
# ----------------------------------------------------
|
||||
if not try_trigger_statuscheck(book_idx):
|
||||
log(f"[AUDIO-COMPLETION] statuscheck already triggered for {book_idx}")
|
||||
return
|
||||
|
||||
# ----------------------------------------------------
|
||||
# STEP 5 — FINAL ACTION
|
||||
# ----------------------------------------------------
|
||||
log(f"[AUDIO-COMPLETION] DONE → queue m4b for book={book_idx}")
|
||||
queue_m4b_for_book(book_idx)
|
||||
|
||||
except Exception as exc:
|
||||
# MUST NEVER break audio workers
|
||||
log(f"[AUDIO-COMPLETION][ERROR] book={book_idx} error={exc}")
|
||||
@ -1,135 +0,0 @@
|
||||
# ============================================================
|
||||
# File: scraper/services/status_check_service.py
|
||||
# Purpose:
|
||||
# Handmatige, idempotente statuscheck per boek.
|
||||
#
|
||||
# Bepaalt op basis van het filesystem:
|
||||
# - aantal gedownloade chapters (.txt)
|
||||
# - aantal gegenereerde audiofiles (.m4b)
|
||||
#
|
||||
# En schrijft deze gevalideerde werkelijkheid naar SQL.
|
||||
#
|
||||
# LET OP:
|
||||
# - Geen Redis
|
||||
# - Geen Celery
|
||||
# - Geen status-transities
|
||||
# - Geen pipeline-logica
|
||||
# ============================================================
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
from logbus.publisher import log
|
||||
from scraper.logger_decorators import logcall
|
||||
|
||||
from db.state_sql import sql_fetch_book, sql_update_book
|
||||
|
||||
|
||||
class StatusCheckService:
|
||||
"""
|
||||
Statuscheck op basis van filesystem.
|
||||
Single source of truth = disk.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@logcall
|
||||
def run(book_idx: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Voer statuscheck uit voor één boek.
|
||||
|
||||
Returns een inspecteerbaar dict met:
|
||||
- filesystem tellingen
|
||||
- SQL before / after snapshot
|
||||
"""
|
||||
|
||||
# ----------------------------------------------------
|
||||
# 1. SQL fetch (bestaat het boek?)
|
||||
# ----------------------------------------------------
|
||||
sql_before = sql_fetch_book(book_idx)
|
||||
|
||||
if not sql_before:
|
||||
raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")
|
||||
|
||||
# ----------------------------------------------------
|
||||
# 2. Bepaal filesystem root
|
||||
# ----------------------------------------------------
|
||||
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
|
||||
title = sql_before.get("title")
|
||||
book_dir = os.path.join(output_root, title)
|
||||
|
||||
if not os.path.isdir(book_dir):
|
||||
log(
|
||||
f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}')"
|
||||
)
|
||||
chapters_txt = 0
|
||||
audio_files = 0
|
||||
volumes = 0
|
||||
else:
|
||||
chapters_txt = 0
|
||||
audio_files = 0
|
||||
volumes = 0
|
||||
|
||||
# ------------------------------------------------
|
||||
# 3. Scan volumes
|
||||
# ------------------------------------------------
|
||||
for entry in os.listdir(book_dir):
|
||||
if not entry.lower().startswith("volume_"):
|
||||
continue
|
||||
|
||||
volumes += 1
|
||||
volume_path = os.path.join(book_dir, entry)
|
||||
|
||||
if not os.path.isdir(volume_path):
|
||||
continue
|
||||
|
||||
# ---- TXT chapters ----
|
||||
for fname in os.listdir(volume_path):
|
||||
if fname.lower().endswith(".txt"):
|
||||
chapters_txt += 1
|
||||
|
||||
# ---- Audio ----
|
||||
audio_dir = os.path.join(volume_path, "Audio")
|
||||
if os.path.isdir(audio_dir):
|
||||
for fname in os.listdir(audio_dir):
|
||||
if fname.lower().endswith(".m4b"):
|
||||
audio_files += 1
|
||||
|
||||
# ----------------------------------------------------
|
||||
# 4. SQL update (snapshot)
|
||||
# ----------------------------------------------------
|
||||
now = datetime.utcnow().isoformat(timespec="seconds")
|
||||
|
||||
update_fields = {
|
||||
"downloaded": chapters_txt,
|
||||
"audio_done": audio_files,
|
||||
"last_update": now,
|
||||
}
|
||||
|
||||
sql_update_book(book_idx, update_fields)
|
||||
|
||||
sql_after = sql_fetch_book(book_idx)
|
||||
|
||||
# ----------------------------------------------------
|
||||
# 5. Resultaat voor inspect/debug
|
||||
# ----------------------------------------------------
|
||||
result = {
|
||||
"book_idx": book_idx,
|
||||
"filesystem": {
|
||||
"book_dir": book_dir,
|
||||
"exists": os.path.isdir(book_dir),
|
||||
"volumes": volumes,
|
||||
"chapters_txt": chapters_txt,
|
||||
"audio_files": audio_files,
|
||||
},
|
||||
"sql_before": sql_before,
|
||||
"sql_after": sql_after,
|
||||
"notes": [],
|
||||
}
|
||||
|
||||
log(
|
||||
f"[STATUSCHECK] book_idx={book_idx} "
|
||||
f"chapters={chapters_txt} audio={audio_files}"
|
||||
)
|
||||
|
||||
return result
|
||||
@ -1,132 +0,0 @@
|
||||
# ============================================================
|
||||
# File: scraper/tasks/m4b_tasks.py
|
||||
# ============================================================
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from typing import List
|
||||
|
||||
from celery_app import celery_app
|
||||
from logbus.publisher import log
|
||||
from scraper.logger_decorators import logcall
|
||||
|
||||
from db.repository import fetch_book, store_m4b_error
|
||||
from scraper.scriptgen import build_merge_block
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Helper: detect volumes (UNCHANGED)
|
||||
# ------------------------------------------------------------
|
||||
def detect_volumes(book_base: str) -> List[str]:
|
||||
volumes = []
|
||||
for name in os.listdir(book_base):
|
||||
if name.lower().startswith("volume_"):
|
||||
full = os.path.join(book_base, name)
|
||||
if os.path.isdir(full):
|
||||
volumes.append(name)
|
||||
volumes.sort()
|
||||
return volumes
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Celery task
|
||||
# ------------------------------------------------------------
|
||||
@celery_app.task(bind=True, queue="m4b", ignore_result=True)
|
||||
@logcall
|
||||
def run_m4btool(self, book_idx: str):
|
||||
|
||||
log(f"[M4B] START book_idx={book_idx}")
|
||||
|
||||
book = fetch_book(book_idx)
|
||||
if not book:
|
||||
log(f"[M4B] Book not found in SQL: book_idx={book_idx}")
|
||||
return
|
||||
|
||||
title = book.get("title", book_idx)
|
||||
author = book.get("author", "Unknown")
|
||||
|
||||
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
|
||||
book_base = os.path.join(output_root, title)
|
||||
|
||||
log(f"[M4B] Book base directory: {book_base}")
|
||||
|
||||
if not os.path.isdir(book_base):
|
||||
log(f"[M4B] Book directory missing: {book_base}")
|
||||
return
|
||||
|
||||
volumes = detect_volumes(book_base)
|
||||
if not volumes:
|
||||
log(f"[M4B] No volumes found for book_idx={book_idx}")
|
||||
return
|
||||
|
||||
log(f"[M4B] Volumes detected: {volumes}")
|
||||
|
||||
# --------------------------------------------------------
|
||||
# Build canonical commands via scriptgen
|
||||
# --------------------------------------------------------
|
||||
merge_block = build_merge_block(
|
||||
title, author, [(i + 1, v) for i, v in enumerate(volumes)]
|
||||
)
|
||||
commands = [c.strip() for c in merge_block.split("&&") if c.strip()]
|
||||
|
||||
for volume, cmd in zip(volumes, commands):
|
||||
audio_dir = os.path.join(book_base, volume, "Audio")
|
||||
if not os.path.isdir(audio_dir):
|
||||
log(f"[M4B] SKIP {volume}: no Audio directory")
|
||||
continue
|
||||
|
||||
log(f"[M4B] Running for volume={volume}")
|
||||
log(f"[M4B] CMD: {cmd}")
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
cwd=book_base,
|
||||
shell=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
if result.stdout:
|
||||
log(f"[M4B][STDOUT] {result.stdout}")
|
||||
|
||||
except subprocess.CalledProcessError as exc:
|
||||
log(f"[M4B][FAILED] volume={volume}")
|
||||
|
||||
if exc.stdout:
|
||||
log(f"[M4B][STDOUT] {exc.stdout}")
|
||||
if exc.stderr:
|
||||
log(f"[M4B][STDERR] {exc.stderr}")
|
||||
|
||||
store_m4b_error(
|
||||
book_idx=book_idx,
|
||||
volume=volume,
|
||||
error_text=exc.stderr or str(exc),
|
||||
)
|
||||
continue
|
||||
|
||||
except Exception as exc:
|
||||
log(f"[M4B][UNEXPECTED ERROR] volume={volume}: {exc}")
|
||||
|
||||
store_m4b_error(
|
||||
book_idx=book_idx,
|
||||
volume=volume,
|
||||
error_text=str(exc),
|
||||
)
|
||||
continue
|
||||
|
||||
log(f"[M4B] FINISHED book_idx={book_idx}")
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Orchestration helper (UNCHANGED)
|
||||
# ------------------------------------------------------------
|
||||
@logcall
|
||||
def queue_m4b_for_book(book_idx: str):
|
||||
log(f"[M4B] Queuing m4b-tool for book_idx={book_idx}")
|
||||
celery_app.send_task(
|
||||
"scraper.tasks.m4b_tasks.run_m4btool",
|
||||
args=[book_idx],
|
||||
queue="m4b",
|
||||
)
|
||||
@ -1,149 +0,0 @@
|
||||
# ============================================================
|
||||
# File: scraper/tasks/statuscheck.py
|
||||
# Purpose:
|
||||
# Final status check after audio completion.
|
||||
#
|
||||
# Responsibilities:
|
||||
# - Verify Redis counters (sanity check)
|
||||
# - Verify filesystem (Audio files present)
|
||||
# - Queue m4btool task
|
||||
#
|
||||
# Design rules:
|
||||
# - Book-scope ONLY
|
||||
# - No direct Redis usage
|
||||
# - Repository is the single source of truth
|
||||
# - Idempotent, defensive, non-blocking
|
||||
# ============================================================
|
||||
|
||||
import os
|
||||
from celery_app import celery_app
|
||||
from logbus.publisher import log
|
||||
|
||||
from scraper.logger_decorators import logcall
|
||||
|
||||
from db.repository import (
|
||||
get_audio_done,
|
||||
get_chapters_total,
|
||||
set_status,
|
||||
fetch_book,
|
||||
)
|
||||
|
||||
from scraper.tasks.m4b_tasks import run_m4btool
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Helpers
|
||||
# ------------------------------------------------------------
|
||||
@log
|
||||
def _detect_volumes(book_base: str):
|
||||
"""
|
||||
Return sorted list of Volume_XXX directories.
|
||||
"""
|
||||
vols = []
|
||||
for name in os.listdir(book_base):
|
||||
if name.lower().startswith("volume_"):
|
||||
full = os.path.join(book_base, name)
|
||||
if os.path.isdir(full):
|
||||
vols.append(name)
|
||||
vols.sort()
|
||||
return vols
|
||||
|
||||
|
||||
@logcall
|
||||
def _count_audio_files(audio_dir: str) -> int:
|
||||
"""
|
||||
Count .m4b files in an Audio directory.
|
||||
"""
|
||||
if not os.path.isdir(audio_dir):
|
||||
return 0
|
||||
return len([f for f in os.listdir(audio_dir) if f.lower().endswith(".m4b")])
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Celery task
|
||||
# ------------------------------------------------------------
|
||||
@celery_app.task(bind=True, queue="controller", ignore_result=True)
|
||||
@logcall
|
||||
def run_statuscheck(self, book_idx: str):
|
||||
"""
|
||||
Final statuscheck before m4btool execution.
|
||||
|
||||
Triggered exactly once by audio_completion quickcheck.
|
||||
"""
|
||||
|
||||
log(f"[STATUSCHECK] START book={book_idx}")
|
||||
|
||||
# --------------------------------------------------------
|
||||
# 1. Redis sanity check (via repository)
|
||||
# --------------------------------------------------------
|
||||
audio_done = get_audio_done(book_idx)
|
||||
chapters_total = get_chapters_total(book_idx)
|
||||
|
||||
log(
|
||||
f"[STATUSCHECK] Counters book={book_idx} "
|
||||
f"audio_done={audio_done} chapters_total={chapters_total}"
|
||||
)
|
||||
|
||||
if chapters_total <= 0:
|
||||
log(f"[STATUSCHECK] No chapters_total → abort")
|
||||
return
|
||||
|
||||
if audio_done < chapters_total:
|
||||
# Defensive: should not happen, but never assume
|
||||
log(
|
||||
f"[STATUSCHECK] Audio not complete yet "
|
||||
f"({audio_done}/{chapters_total}) → abort"
|
||||
)
|
||||
return
|
||||
|
||||
# --------------------------------------------------------
|
||||
# 2. Fetch book metadata (for paths & m4b meta)
|
||||
# --------------------------------------------------------
|
||||
book = fetch_book(book_idx)
|
||||
if not book:
|
||||
log(f"[STATUSCHECK] Book not found in DB: {book_idx}")
|
||||
return
|
||||
|
||||
title = book.get("title") or book_idx
|
||||
author = book.get("author") or "Unknown"
|
||||
|
||||
# Base output directory
|
||||
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
|
||||
book_base = os.path.join(root, title)
|
||||
|
||||
if not os.path.isdir(book_base):
|
||||
log(f"[STATUSCHECK] Book directory missing: {book_base}")
|
||||
return
|
||||
|
||||
# --------------------------------------------------------
|
||||
# 3. Filesystem validation (light, non-blocking)
|
||||
# --------------------------------------------------------
|
||||
volumes = _detect_volumes(book_base)
|
||||
|
||||
if not volumes:
|
||||
log(f"[STATUSCHECK] No volumes found for {book_idx}")
|
||||
# Still allow m4btool to decide (it will no-op)
|
||||
else:
|
||||
for vol in volumes:
|
||||
audio_dir = os.path.join(book_base, vol, "Audio")
|
||||
count = _count_audio_files(audio_dir)
|
||||
|
||||
log(f"[STATUSCHECK] {vol}: " f"{count} audio files detected")
|
||||
|
||||
# --------------------------------------------------------
|
||||
# 4. Queue m4btool (final pipeline step)
|
||||
# --------------------------------------------------------
|
||||
log(f"[STATUSCHECK] Queue m4btool for book={book_idx}")
|
||||
|
||||
set_status(book_idx, "m4b_running")
|
||||
|
||||
run_m4btool.delay(
|
||||
book_idx=book_idx,
|
||||
book_base=book_base,
|
||||
meta={
|
||||
"title": title,
|
||||
"author": author,
|
||||
},
|
||||
)
|
||||
|
||||
log(f"[STATUSCHECK] DONE book={book_idx}")
|
||||
@ -1,13 +0,0 @@
|
||||
#!/bin/sh
|
||||
# mp4info shim for m4b-tool (ffprobe-based)
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
echo "Usage: mp4info <file>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# ffprobe outputs float seconds; m4b-tool expects an integer
|
||||
ffprobe -v error \
|
||||
-show_entries format=duration \
|
||||
-of default=noprint_wrappers=1:nokey=1 \
|
||||
"$1" | awk '{ printf "%d\n", ($1 + 0.5) }'
|
||||
Loading…
Reference in new issue