# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
#   Manual, idempotent status check per book.
#
#   Determines from the filesystem:
#     - number of downloaded chapters (.txt)
#     - number of generated audio files (.m4b)
#
#   And writes that validated reality to SQL.
#
# NOTE:
#   - No Redis
#   - No Celery
#   - No status transitions
#   - No pipeline logic
# ============================================================

import os
from datetime import datetime, timezone
from typing import Dict, Any

from logbus.publisher import log
from scraper.logger_decorators import logcall

from db.state_sql import sql_fetch_book, sql_update_book


class StatusCheckService:
    """
    Status check based on the filesystem.

    The counts found on disk are treated as the single source of truth
    and are written to the book's SQL record.
    """

    @staticmethod
    def _scan_book_dir(book_dir: str) -> Dict[str, int]:
        """
        Count volumes, chapter text files and audio files under *book_dir*.

        Only sub-directories whose name starts with ``volume_``
        (case-insensitive) are considered. Within each volume directory,
        entries ending in ``.txt`` are counted as chapters, and entries
        ending in ``.m4b`` inside an ``Audio`` sub-directory are counted
        as audio files.

        Args:
            book_dir: Path of an existing book output directory.

        Returns:
            A dict with the integer keys ``volumes``, ``chapters_txt``
            and ``audio_files``.
        """
        volumes = 0
        chapters_txt = 0
        audio_files = 0

        for entry in os.listdir(book_dir):
            if not entry.lower().startswith("volume_"):
                continue

            volume_path = os.path.join(book_dir, entry)
            # Check the entry is a directory *before* counting it, so a
            # stray file named "volume_..." is not reported as a volume.
            if not os.path.isdir(volume_path):
                continue
            volumes += 1

            # ---- TXT chapters ----
            chapters_txt += sum(
                1
                for fname in os.listdir(volume_path)
                if fname.lower().endswith(".txt")
            )

            # ---- Audio ----
            audio_dir = os.path.join(volume_path, "Audio")
            if os.path.isdir(audio_dir):
                audio_files += sum(
                    1
                    for fname in os.listdir(audio_dir)
                    if fname.lower().endswith(".m4b")
                )

        return {
            "volumes": volumes,
            "chapters_txt": chapters_txt,
            "audio_files": audio_files,
        }

    @staticmethod
    @logcall
    def run(book_idx: str) -> Dict[str, Any]:
        """
        Run the status check for a single book.

        Args:
            book_idx: Identifier passed to ``sql_fetch_book`` and
                ``sql_update_book``.

        Returns:
            An inspectable dict containing:
              - ``book_idx``
              - ``filesystem``: the book directory, whether it exists,
                and the volume / chapter / audio counts
              - ``sql_before`` / ``sql_after``: SQL snapshots taken
                before and after the update
              - ``notes``: an (empty) list

        Raises:
            ValueError: If the book is not found in SQL, or if its SQL
                record has no usable title to derive the directory from.
        """
        # ----------------------------------------------------
        # 1. SQL fetch (does the book exist?)
        # ----------------------------------------------------
        sql_before = sql_fetch_book(book_idx)

        if not sql_before:
            raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")

        # ----------------------------------------------------
        # 2. Determine filesystem root
        # ----------------------------------------------------
        output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        title = sql_before.get("title")

        # A missing title would make os.path.join raise TypeError, and an
        # empty one would resolve to the output root itself (scanning every
        # book's volumes). Fail early, before anything is written to SQL.
        if not title:
            raise ValueError(
                f"[STATUSCHECK] Book has no title in SQL: {book_idx}"
            )

        book_dir = os.path.join(output_root, title)
        # Evaluate existence once so the reported flag and the counts
        # always describe the same observation.
        exists = os.path.isdir(book_dir)

        # ----------------------------------------------------
        # 3. Scan volumes
        # ----------------------------------------------------
        if exists:
            counts = StatusCheckService._scan_book_dir(book_dir)
        else:
            log(
                f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}'"
            )
            counts = {"volumes": 0, "chapters_txt": 0, "audio_files": 0}

        volumes = counts["volumes"]
        chapters_txt = counts["chapters_txt"]
        audio_files = counts["audio_files"]

        # ----------------------------------------------------
        # 4. SQL update (snapshot)
        # ----------------------------------------------------
        # datetime.utcnow() is deprecated; build the same naive-UTC ISO
        # string from an aware timestamp so the stored format is unchanged.
        now = (
            datetime.now(timezone.utc)
            .replace(tzinfo=None)
            .isoformat(timespec="seconds")
        )

        update_fields = {
            "downloaded": chapters_txt,
            "audio_done": audio_files,
            "last_update": now,
        }

        sql_update_book(book_idx, update_fields)

        sql_after = sql_fetch_book(book_idx)

        # ----------------------------------------------------
        # 5. Result for inspect/debug
        # ----------------------------------------------------
        result = {
            "book_idx": book_idx,
            "filesystem": {
                "book_dir": book_dir,
                "exists": exists,
                "volumes": volumes,
                "chapters_txt": chapters_txt,
                "audio_files": audio_files,
            },
            "sql_before": sql_before,
            "sql_after": sql_after,
            "notes": [],
        }

        log(
            f"[STATUSCHECK] book_idx={book_idx} "
            f"chapters={chapters_txt} audio={audio_files}"
        )

        return result