# ============================================================
# File: scraper/tasks/save_tasks.py
# Purpose: Save parsed chapter text to disk + trigger audio.
# Updated for chapter_dict + book_meta pipeline model.
# ============================================================

# Import-time trace: printed as soon as this module starts loading, before
# any of the imports below run, so it still shows up if one of them fails.
print(">>> [IMPORT] save_tasks.py loaded")

import os

from celery import shared_task

from scraper.utils import get_save_path
from scraper.tasks.download_tasks import log_msg  # unified logger
from scraper.progress import (
    inc_completed,
    inc_chapter_done,
    inc_chapter_download_skipped,
)
from scraper.tasks.audio_tasks import generate_audio


def _queue_audio(book_id, volume_name, chapter_num, chapter_title, path, tag=""):
    """Queue the audio task for one chapter file and log the outcome.

    A failure to enqueue is logged instead of propagated, so a saved (or
    previously saved) chapter is never failed just because audio could not
    be queued.

    Args:
        book_id: Book identifier, used for logging and passed to the task.
        volume_name: Name of the volume folder the chapter lives in.
        chapter_num: Chapter number from the chapter dict.
        chapter_title: Chapter title passed through to the audio task.
        path: Path of the chapter text file handed to the audio task.
        tag: Optional marker inserted into the log lines
            (e.g. ``" (SKIPPED)"``) to tell the calling branch apart.
    """
    try:
        generate_audio.delay(
            book_id,
            volume_name,
            chapter_num,
            chapter_title,
            path,
        )
        log_msg(
            book_id,
            f"[AUDIO] Task queued{tag} for chapter {chapter_num} in {volume_name}",
        )
    except Exception as audio_exc:
        # Best effort: report the problem but let the save task succeed.
        log_msg(
            book_id,
            f"[AUDIO] ERROR queueing{tag} chapter {chapter_num}: {audio_exc}",
        )


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, parsed: dict):
    """Write one parsed chapter to disk and queue audio generation for it.

    New pipeline model:
        parsed = {
            "book_id": str,
            "chapter": chapter_dict,   # "num", "title", "volume_path" are read
            "text": str,
            "length": int,
            "book_meta": dict,
            "skipped": bool,
            "path": optional str (if skipped)
        }

    Args:
        self: Bound task instance (``bind=True``); not used in the body.
        parsed: Payload described above.

    Returns:
        dict with "book_id", "chapter", "path" and "book_meta". When the
        chapter was skipped, the dict additionally carries "skipped": True
        and "path" is whatever the payload supplied (possibly None).

    Raises:
        ValueError: If the chapter dict has no "num" or no "volume_path".
        Exception: Any error raised while creating the folder, writing the
            file or updating progress counters is logged and re-raised.
    """
    book_id = parsed.get("book_id", "NOBOOK")
    chapter_dict = parsed.get("chapter") or {}
    book_meta = parsed.get("book_meta") or {}

    chapter_num = chapter_dict.get("num")
    chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
    volume_path = chapter_dict.get("volume_path")

    # ------------------------------------------------------------
    # VALIDATION
    # ------------------------------------------------------------
    if chapter_num is None or volume_path is None:
        raise ValueError("Invalid parsed payload: chapter_dict missing fields.")

    # ------------------------------------------------------------
    # SKIPPED CASE
    # ------------------------------------------------------------
    if parsed.get("skipped"):
        path = parsed.get("path", None)
        log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num} → {path}")

        inc_chapter_download_skipped(book_id)

        volume_name = os.path.basename(volume_path.rstrip("/"))

        # Queue audio only if a valid file exists
        if path and os.path.exists(path):
            _queue_audio(
                book_id,
                volume_name,
                chapter_num,
                chapter_title,
                path,
                tag=" (SKIPPED)",
            )

        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "path": path,
            "skipped": True,
            "book_meta": book_meta,
        }

    # ------------------------------------------------------------
    # NORMAL SAVE CASE
    # ------------------------------------------------------------
    try:
        text = parsed.get("text", "")

        # Ensure volume folder exists
        os.makedirs(volume_path, exist_ok=True)

        # Build final chapter file path
        path = get_save_path(chapter_num, volume_path)

        # Write chapter text to file
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log_msg(book_id, f"[SAVE] Saved chapter {chapter_num} → {path}")
        inc_chapter_done(book_id)
        inc_completed(book_id)

        # Determine volume name
        volume_name = os.path.basename(volume_path.rstrip("/"))

        # Queue audio task (failures are logged inside the helper)
        _queue_audio(book_id, volume_name, chapter_num, chapter_title, path)

        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "path": path,
            "book_meta": book_meta,
        }

    except Exception as exc:
        # Log for the per-book log, then let Celery record the failure.
        log_msg(book_id, f"[SAVE] ERROR saving chapter {chapter_num}: {exc}")
        raise