You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
136 lines
4.2 KiB
136 lines
4.2 KiB
# ============================================================
# File: scraper/services/status_check_service.py
# Purpose:
#   Manual, idempotent status check per book.
#
#   Determines from the filesystem:
#     - number of downloaded chapters (.txt)
#     - number of generated audio files (.m4b)
#
#   And writes this validated reality to SQL.
#
# NOTE:
#   - No Redis
#   - No Celery
#   - No status transitions
#   - No pipeline logic
# ============================================================
|
|
|
|
import os
from datetime import datetime, timezone
from typing import Dict, Any

from logbus.publisher import log
from scraper.logger_decorators import logcall

from db.state_sql import sql_fetch_book, sql_update_book
|
|
|
|
|
|
class StatusCheckService:
    """
    Filesystem-based status check for a single book.

    The files on disk are treated as the single source of truth: the
    counts derived from them are passed to ``sql_update_book`` for the
    book's SQL record.
    """

    @staticmethod
    def _scan_book_dir(book_dir: str) -> Dict[str, int]:
        """
        Count volumes, chapter files and audio files below *book_dir*.

        Only sub-directories whose name starts with ``volume_``
        (case-insensitive) are considered. Within each volume:

        - every entry ending in ``.txt`` counts as a downloaded chapter
        - every entry ending in ``.m4b`` inside the ``Audio``
          sub-directory counts as a generated audio file

        Returns a dict with the keys ``volumes``, ``chapters_txt`` and
        ``audio_files``.
        """
        volumes = 0
        chapters_txt = 0
        audio_files = 0

        for entry in os.listdir(book_dir):
            if not entry.lower().startswith("volume_"):
                continue

            volume_path = os.path.join(book_dir, entry)
            if not os.path.isdir(volume_path):
                # A plain file that merely looks like a volume must not be
                # counted as one.
                continue

            volumes += 1

            # ---- TXT chapters ----
            chapters_txt += sum(
                1
                for fname in os.listdir(volume_path)
                if fname.lower().endswith(".txt")
            )

            # ---- Audio ----
            audio_dir = os.path.join(volume_path, "Audio")
            if os.path.isdir(audio_dir):
                audio_files += sum(
                    1
                    for fname in os.listdir(audio_dir)
                    if fname.lower().endswith(".m4b")
                )

        return {
            "volumes": volumes,
            "chapters_txt": chapters_txt,
            "audio_files": audio_files,
        }

    @staticmethod
    @logcall
    def run(book_idx: str) -> Dict[str, Any]:
        """
        Run the status check for one book.

        Args:
            book_idx: Identifier passed to ``sql_fetch_book`` /
                ``sql_update_book`` to address the book record.

        Returns:
            An inspectable dict containing:
            - ``book_idx``: the identifier that was checked
            - ``filesystem``: the book directory, whether it exists, and
              the volume / chapter / audio counts found on disk
            - ``sql_before`` / ``sql_after``: the record as fetched before
              and after the update
            - ``notes``: an (currently always empty) list

        Raises:
            ValueError: If no record is found for *book_idx*, or if the
                record has no usable ``title`` to derive the book
                directory from.
        """

        # ----------------------------------------------------
        # 1. SQL fetch (does the book exist?)
        # ----------------------------------------------------
        sql_before = sql_fetch_book(book_idx)

        if not sql_before:
            raise ValueError(f"[STATUSCHECK] Book not found in SQL: {book_idx}")

        # ----------------------------------------------------
        # 2. Determine filesystem root
        # ----------------------------------------------------
        output_root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
        title = sql_before.get("title")

        if not title:
            # Without a title the path would either fail to build (None) or
            # resolve to the output root itself (empty string); neither may
            # be scanned or used to overwrite the stored counts.
            raise ValueError(
                f"[STATUSCHECK] Book has no title in SQL: {book_idx}"
            )

        book_dir = os.path.join(output_root, title)

        # Evaluate existence once so the reported "exists" flag always
        # matches the counts that were actually gathered.
        dir_exists = os.path.isdir(book_dir)

        # ----------------------------------------------------
        # 3. Scan volumes
        # ----------------------------------------------------
        if dir_exists:
            counts = StatusCheckService._scan_book_dir(book_dir)
        else:
            log(
                f"[STATUSCHECK] No output directory for book_idx={book_idx} : title='{title}'"
            )
            counts = {"volumes": 0, "chapters_txt": 0, "audio_files": 0}

        chapters_txt = counts["chapters_txt"]
        audio_files = counts["audio_files"]

        # ----------------------------------------------------
        # 4. SQL update (snapshot)
        # ----------------------------------------------------
        # Aware "now" in UTC, then made naive again so the stored string
        # keeps the same format as before (no "+00:00" suffix).
        now = (
            datetime.now(timezone.utc)
            .replace(tzinfo=None)
            .isoformat(timespec="seconds")
        )

        update_fields = {
            "downloaded": chapters_txt,
            "audio_done": audio_files,
            "last_update": now,
        }

        sql_update_book(book_idx, update_fields)

        sql_after = sql_fetch_book(book_idx)

        # ----------------------------------------------------
        # 5. Result for inspect/debug
        # ----------------------------------------------------
        result = {
            "book_idx": book_idx,
            "filesystem": {
                "book_dir": book_dir,
                "exists": dir_exists,
                "volumes": counts["volumes"],
                "chapters_txt": chapters_txt,
                "audio_files": audio_files,
            },
            "sql_before": sql_before,
            "sql_after": sql_after,
            "notes": [],
        }

        log(
            f"[STATUSCHECK] book_idx={book_idx} "
            f"chapters={chapters_txt} audio={audio_files}"
        )

        return result
|