You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/save_tasks.py

140 lines
4.3 KiB

# ============================================================
# File: scraper/tasks/save_tasks.py
# Purpose: Save parsed chapter text to disk + trigger audio.
# Updated for chapter_dict + book_meta pipeline model.
# ============================================================
# Import-time trace: emitted once when this module is first imported, so the
# process output shows that save_tasks.py was actually loaded.
print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task
import os
from scraper.utils import get_save_path
from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
)
from scraper.tasks.audio_tasks import generate_audio
def _queue_audio(book_id, volume_name, chapter_num, chapter_title, path, tag=""):
    """Queue audio generation for one chapter file, never raising on failure.

    Queueing is best-effort: any exception raised while dispatching the task
    (or while logging the success message) is logged and swallowed so that a
    failure to queue audio does not fail the save itself.

    Args:
        book_id: Identifier used for logging and passed to the audio task.
        volume_name: Last path component of the chapter's volume folder.
        chapter_num: Chapter number from the chapter dict.
        chapter_title: Chapter title (or the "Chapter N" fallback).
        path: Path of the chapter text file handed to the audio task.
        tag: Optional text inserted into the log messages, e.g. " (SKIPPED)".
    """
    try:
        generate_audio.delay(
            book_id,
            volume_name,
            chapter_num,
            chapter_title,
            path,
        )
        log_msg(
            book_id,
            f"[AUDIO] Task queued{tag} for chapter {chapter_num} in {volume_name}",
        )
    except Exception as audio_exc:
        log_msg(
            book_id,
            f"[AUDIO] ERROR queueing{tag} chapter {chapter_num}: {audio_exc}",
        )


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, parsed: dict) -> dict:
    """Write a parsed chapter to disk and queue audio generation for it.

    Expected payload (new pipeline model):

        parsed = {
            "book_id": str,
            "chapter": chapter_dict,   # reads "num", "title", "volume_path"
            "text": str,
            "length": int,
            "book_meta": dict,
            "skipped": bool,
            "path": optional str (if skipped)
        }

    Behaviour:
        * Skipped payloads are not written. The skip counter is incremented
          and audio is queued only when ``parsed["path"]`` points at an
          existing file.
        * Otherwise the volume folder is created if needed, the text is
          written as UTF-8 to the path returned by ``get_save_path``, the
          progress counters are incremented and audio is queued.

    Returns:
        A dict with "book_id", "chapter", "path" and "book_meta"; the skipped
        case additionally carries ``"skipped": True``.

    Raises:
        ValueError: if the chapter dict lacks "num" or "volume_path".
        Exception: any error raised while saving is logged and re-raised.
    """
    book_id = parsed.get("book_id", "NOBOOK")
    chapter_dict = parsed.get("chapter") or {}
    book_meta = parsed.get("book_meta") or {}

    chapter_num = chapter_dict.get("num")
    chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
    volume_path = chapter_dict.get("volume_path")

    # ------------------------------------------------------------
    # VALIDATION
    # ------------------------------------------------------------
    if chapter_num is None or volume_path is None:
        raise ValueError("Invalid parsed payload: chapter_dict missing fields.")

    # ------------------------------------------------------------
    # SKIPPED CASE
    # ------------------------------------------------------------
    if parsed.get("skipped"):
        path = parsed.get("path")
        # Separator added: the number and path used to be fused together.
        log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num} -> {path}")
        inc_chapter_download_skipped(book_id)

        volume_name = os.path.basename(volume_path.rstrip("/"))

        # Queue audio only if a valid file exists
        if path and os.path.exists(path):
            _queue_audio(
                book_id,
                volume_name,
                chapter_num,
                chapter_title,
                path,
                tag=" (SKIPPED)",
            )

        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "path": path,
            "skipped": True,
            "book_meta": book_meta,
        }

    # ------------------------------------------------------------
    # NORMAL SAVE CASE
    # ------------------------------------------------------------
    try:
        text = parsed.get("text", "")

        # Ensure volume folder exists
        os.makedirs(volume_path, exist_ok=True)

        # Build final chapter file path
        path = get_save_path(chapter_num, volume_path)

        # Write chapter text to file
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log_msg(book_id, f"[SAVE] Saved chapter {chapter_num} -> {path}")
        inc_chapter_done(book_id)
        inc_completed(book_id)

        # Volume name is the last component of the volume folder path
        volume_name = os.path.basename(volume_path.rstrip("/"))

        # Best-effort: a queueing failure is logged, not raised
        _queue_audio(book_id, volume_name, chapter_num, chapter_title, path)

        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "path": path,
            "book_meta": book_meta,
        }
    except Exception as exc:
        # Log for per-book visibility, then re-raise so the failure propagates.
        log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter_num}: {exc}")
        raise