You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/save_tasks.py

66 lines
2.3 KiB

# =========================================================
# File: scraper/tasks/save_tasks.py
# Purpose: Save parsed chapter text to disk.
#
# Abort Behavior:
# - Save MUST ALWAYS RUN once download has started.
# - Abort only prevents new chapters from starting (download skip).
# - Save is skipped ONLY when download/parse indicated "skipped".
#
# This guarantees no half-written chapters.
# =========================================================
# Import-time trace: this statement sits at module top level, so it runs once
# whenever this module is imported and prints a marker line to stdout.
print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task
from logbus.publisher import log
import os
from scraper.utils import get_save_path
def _write_text_atomically(path, text: str) -> None:
    """Write *text* to *path* as UTF-8 without exposing a partial file.

    The text is written to a sibling ``<path>.tmp`` file first and then moved
    over *path* with ``os.replace``. If the write is interrupted, *path*
    keeps whatever state it had before instead of holding truncated text.

    Raises:
        Whatever ``open``, ``write`` or ``os.replace`` raise; the temporary
        file is removed (best effort) before the error propagates.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(text)
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup so a failed write does not leave a stray
        # temp file behind; the original error is what the caller must see.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, parsed: dict, base_path: str) -> dict:
    """Write one parsed chapter to disk and report where it was stored.

    Args:
        self: Task instance supplied because the task is declared with
            ``bind=True``; not used by the body.
        parsed: Payload from the previous pipeline step. Keys read here:
            ``"skipped"``, ``"chapter"``, ``"path"``, ``"url"``, ``"text"``.
        base_path: Directory the chapter is saved under; created if missing.

    Returns:
        ``{"chapter": ..., "path": ..., "skipped": True}`` when the payload
        is flagged as skipped (nothing is written), otherwise
        ``{"chapter": ..., "path": ...}`` for the file that was written.

    Raises:
        ValueError: If the payload carries no chapter number.
        Exception: Any error raised while creating the directory, resolving
            the path or writing the file is logged and re-raised unchanged.
    """
    print(f">>> [save_tasks] save_chapter() CALLED for chapter {parsed.get('chapter')}")

    # ------------------------------------------------------------
    # SKIP CASE:
    # - Only skip when download OR parse indicated skip
    # - NOT related to abort (abort never skips parse/save)
    # ------------------------------------------------------------
    if parsed.get("skipped"):
        chapter = parsed.get("chapter")
        path = parsed.get("path")
        log(f"[SAVE] SKIP chapter {chapter} (already exists or skipped) → {path}")
        print(f">>> [save_tasks] SKIPPED {path}")
        return {"chapter": chapter, "path": path, "skipped": True}

    # Read before the try block so the error handler can always report it.
    url = parsed.get("url")

    try:
        chapter_number = parsed.get("chapter")
        text = parsed.get("text", "")

        # Only an absent/empty value counts as missing; a plain truthiness
        # test would also reject chapter 0.
        if chapter_number is None or chapter_number == "":
            raise ValueError("Missing chapter_number in parsed payload")

        # Ensure base path exists
        os.makedirs(base_path, exist_ok=True)

        # Unified filename logic
        path = get_save_path(chapter_number, base_path)

        # ------------------------------------------------------------
        # WRITE CHAPTER TEXT TO FILE
        # Written via a temp file + rename so an interrupted write never
        # leaves a truncated chapter at the final path.
        # ------------------------------------------------------------
        _write_text_atomically(path, text)

        log(f"[SAVE] Saved chapter {chapter_number} → {path}")
        print(f">>> [save_tasks] SAVED {path}")
        return {"chapter": chapter_number, "path": path}
    except Exception as exc:
        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
        print(f">>> [save_tasks] ERROR: {exc}")
        raise