You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/services/status_check_service.py

136 lines
4.2 KiB

# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
# Handmatige, idempotente statuscheck per boek.
#
# Bepaalt op basis van het filesystem:
# - aantal gedownloade chapters (.txt)
# - aantal gegenereerde audiofiles (.m4b)
#
# En schrijft deze gevalideerde werkelijkheid naar SQL.
#
# LET OP:
# - Geen Redis
# - Geen Celery
# - Geen status-transities
# - Geen pipeline-logica
# ============================================================
import os
from datetime import datetime
from typing import Dict, Any
from logbus.publisher import log
from scraper.logger_decorators import logcall
from db.state_sql import sql_fetch_book, sql_update_book
class StatusCheckService:
"""
Statuscheck op basis van filesystem.
Single source of truth = disk.
"""
@staticmethod
@logcall
def run(book_idx: str) -> Dict[str, Any]:
"""
Voer statuscheck uit voor één boek.
Returns een inspecteerbaar dict met:
- filesystem tellingen
- SQL before / after snapshot
"""
# ----------------------------------------------------
# 1. SQL fetch (bestaat het boek?)
# ----------------------------------------------------
sql_before = sql_fetch_book(book_idx)
if not sql_before:
raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")
# ----------------------------------------------------
# 2. Bepaal filesystem root
# ----------------------------------------------------
output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
title = sql_before.get("title")
book_dir = os.path.join(output_root, title)
if not os.path.isdir(book_dir):
log(
f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}')"
)
chapters_txt = 0
audio_files = 0
volumes = 0
else:
chapters_txt = 0
audio_files = 0
volumes = 0
# ------------------------------------------------
# 3. Scan volumes
# ------------------------------------------------
for entry in os.listdir(book_dir):
if not entry.lower().startswith("volume_"):
continue
volumes += 1
volume_path = os.path.join(book_dir, entry)
if not os.path.isdir(volume_path):
continue
# ---- TXT chapters ----
for fname in os.listdir(volume_path):
if fname.lower().endswith(".txt"):
chapters_txt += 1
# ---- Audio ----
audio_dir = os.path.join(volume_path, "Audio")
if os.path.isdir(audio_dir):
for fname in os.listdir(audio_dir):
if fname.lower().endswith(".m4b"):
audio_files += 1
# ----------------------------------------------------
# 4. SQL update (snapshot)
# ----------------------------------------------------
now = datetime.utcnow().isoformat(timespec="seconds")
update_fields = {
"downloaded": chapters_txt,
"audio_done": audio_files,
"last_update": now,
}
sql_update_book(book_idx, update_fields)
sql_after = sql_fetch_book(book_idx)
# ----------------------------------------------------
# 5. Resultaat voor inspect/debug
# ----------------------------------------------------
result = {
"book_idx": book_idx,
"filesystem": {
"book_dir": book_dir,
"exists": os.path.isdir(book_dir),
"volumes": volumes,
"chapters_txt": chapters_txt,
"audio_files": audio_files,
},
"sql_before": sql_before,
"sql_after": sql_after,
"notes": [],
}
log(
f"[STATUSCHECK] book_idx={book_idx} "
f"chapters={chapters_txt} audio={audio_files}"
)
return result