- Removed old scraping→controller chain (no more async .get)
- New DownloadController pipeline structure
- Unified Redis Book State Model (book:{idx}:state); a read-side sketch follows this list
- Updated all Celery tasks for unified IDs
- Removed old scraper/db.py
- Updated templates and dashboard components
- Added debug Inspect State system with bookcard preview
- Updated JS dashboard pipeline refresh
- Updated init_service + scrape_engine
- Improved abort logic
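
The unified state model keeps all per-book counters in a single Redis hash keyed book:{idx}:state, which launch_downloads initializes in the diff below. As a read-side illustration only, here is a minimal sketch of how a dashboard endpoint could load that hash; the helper name read_book_state and the chosen subset of fields are assumptions, while the key layout and the REDIS_BROKER default match the controller code.

# Hypothetical read-side helper for the unified book state hash.
# Assumes the same key layout that launch_downloads() writes.
import os
import redis
from urllib.parse import urlparse

def read_book_state(book_idx: str) -> dict:
    broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
    parsed = urlparse(broker_url)
    r = redis.Redis(
        host=parsed.hostname,
        port=parsed.port,
        db=int(parsed.path.strip("/")),
        decode_responses=True,
    )
    # HGETALL returns {} if the book has no state yet.
    state = r.hgetall(f"book:{book_idx}:state")
    return {
        "title": state.get("title", "UnknownBook"),
        "status": state.get("status", "unknown"),
        "chapters_total": int(state.get("chapters_total", 0)),
        "chapters_download_done": int(state.get("chapters_download_done", 0)),
        "audio_done": int(state.get("audio_done", 0)),
    }
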
Branch: feature/bookstate-progress-fix
Parent: feb8ca60d7
Commit: 3a62dfae79
scraper/tasks/controller_tasks.py
@@ -1,109 +1,167 @@
 # ============================================================
 # File: scraper/tasks/controller_tasks.py
 # Purpose:
-# Start the download → parse → save pipeline for a scraped book,
-# including progress/abort tracking via book_id.
-# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
+# FULL scrape entrypoint + launching download/parse/save pipelines.
+# NO result.get() anywhere. Scraping is done inline.
 # ============================================================

 from celery_app import celery_app
 from logbus.publisher import log

-from scraper.download_controller import DownloadController
+import os
+import time
+import redis
+from urllib.parse import urlparse

 from scraper.logger_decorators import logcall
-import redis
-import os
 from scraper.abort import abort_requested
-from db.repository import set_chapters_total

+from scraper.services.scrape_engine import ScrapeEngine
+from scraper.services.site_resolver import SiteResolver
+
+from db.repository import fetch_book, set_chapters_total
+from scraper.download_controller import DownloadController


 print(">>> [IMPORT] controller_tasks.py loaded")


-@celery_app.task(bind=True, queue="controller", ignore_result=False)
+# =============================================================
+# 1) PUBLIC ENTRYPOINT — CALLED FROM /start
+# =============================================================
+@celery_app.task(
+    bind=True,
+    queue="controller",
+    ignore_result=False,
+    name="scraper.tasks.controller_tasks.start_full_scrape",
+)
 @logcall
-def launch_downloads(self, book_id: str, scrape_result: dict):
+def start_full_scrape(self, book_idx: str):
     """
-    Launch the entire pipeline (download → parse → save),
-    AND initialize progress counters.
+    FULL SCRAPE ENTRYPOINT.
+    Scraping is done inline → no Celery .get() needed.
     """
+
+    log(f"[CTRL] start_full_scrape(book_idx={book_idx})")
+
+    # Abort before doing anything
+    if abort_requested(book_idx):
+        log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
+        return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
+
+    # --------------------------------------------------------
+    # 1) Load book metadata from SQLite
+    # --------------------------------------------------------
+    book = fetch_book(book_idx)
+    if not book:
+        msg = f"[CTRL] Book '{book_idx}' not found in DB"
+        log(msg)
+        raise ValueError(msg)
+
+    url = book.get("book_url")
+    if not url:
+        msg = f"[CTRL] No book_url stored for {book_idx}"
+        log(msg)
+        raise ValueError(msg)
+
+    # --------------------------------------------------------
+    # 2) INLINE SCRAPE (fast, no Celery wait)
+    # --------------------------------------------------------
+    site = SiteResolver.resolve(url)
+
+    try:
+        scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
+        log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
+    except Exception as e:
+        log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
+        raise
+
+    # --------------------------------------------------------
+    # 3) Continue → dispatch pipelines
+    # --------------------------------------------------------
+    return launch_downloads(book_idx, scrape_result)

-    Chapter-level progress is updated INSIDE the download/parse/save tasks.
-    This task MUST NOT call .get() on async subtasks (Celery restriction).

+# =============================================================
+# 2) PIPELINE DISPATCH (NOT a Celery task)
+# =============================================================
+@logcall
+def launch_downloads(book_idx: str, scrape_result: dict):
+    """
+    Launches the entire processing pipeline:
+      - initialize Redis UI state
+      - initialize SQLite totals
+      - dispatch per-chapter pipelines via DownloadController
+    """
+
     title = scrape_result.get("title", "UnknownBook")
     chapters = scrape_result.get("chapters", []) or []
     total = len(chapters)

     # ------------------------------------------------------------
-    # INIT BOOK STATE MODEL (required for Active Books dashboard)
+    # INIT REDIS STATE
     # ------------------------------------------------------------

     broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
     parsed = urlparse(broker_url)

-    state = redis.Redis(
+    r = redis.Redis(
         host=parsed.hostname,
         port=parsed.port,
         db=int(parsed.path.strip("/")),
         decode_responses=True,
     )

-    # Book metadata
-    state.set(f"book:{book_id}:title", title)
-    state.set(f"book:{book_id}:status", "starting")
-
-    # Download counters
-    state.set(f"book:{book_id}:download:total", total)
-    state.set(f"book:{book_id}:download:done", 0)
-
-    # Audio counters (start at zero)
-    state.set(f"book:{book_id}:audio:done", 0)
+    base = f"book:{book_idx}:state"
+
+    r.hset(base, "title", title)
+    r.hset(base, "status", "starting")
+    r.hset(base, "chapters_total", total)
+    r.hset(base, "chapters_download_done", 0)
+    r.hset(base, "chapters_download_skipped", 0)
+    r.hset(base, "chapters_parsed_done", 0)
+    r.hset(base, "audio_done", 0)
+    r.hset(base, "audio_skipped", 0)
+    r.hset(base, "last_update", int(time.time()))

     # ------------------------------------------------------------
-    # INIT PROGRESS
+    # INIT SQLITE SNAPSHOT
     # ------------------------------------------------------------
-    set_chapters_total(book_id, total)
+    try:
+        set_chapters_total(book_idx, total)
+    except Exception as e:
+        log(f"[CTRL] ERROR updating SQLite totals: {e}")
+        raise

-    log(f"[CTRL] Progress initialized for {book_id}: total={total}")
+    log(f"[CTRL] Initialized totals for {book_idx}: {total}")

     # ------------------------------------------------------------
-    # BUILD CONTROLLER
+    # ABORT CHECK BEFORE LAUNCHING JOBS
     # ------------------------------------------------------------
-    ctl = DownloadController(book_id, scrape_result)
+    if abort_requested(book_idx):
+        log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
+        r.hset(base, "status", "aborted")
+        return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}

     # ------------------------------------------------------------
-    # START PIPELINES (ASYNC)
-    # Returns a celery group AsyncResult. We DO NOT iterate or get().
-    # Progress & failures are handled by the worker subtasks.
+    # BUILD + DISPATCH PER-CHAPTER PIPELINES
     # ------------------------------------------------------------
-    try:
-        group_result = ctl.start()
-
-        log(
-            f"[CTRL] Pipelines dispatched for '{title}' "
-            f"(book_id={book_id}, group_id={group_result.id})"
-        )
-
-        # Abort flag set BEFORE tasks start?
-        if abort_requested(book_id):
-            log(f"[CTRL] ABORT requested before tasks start")
-            return {"book_id": book_id, "aborted": True}
+    controller = DownloadController(book_idx, scrape_result)

-    except Exception as exc:
-        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
+    try:
+        group_result = controller.start()
+        gid = getattr(group_result, "id", None)
+        log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
+    except Exception as e:
+        log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}")
         raise

-    # ------------------------------------------------------------
-    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
-    # (Download/parse/save tasks update progress themselves)
-    # ------------------------------------------------------------
-    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
+    # Update UI state to "downloading"
+    r.hset(base, "status", "downloading")
+    r.hset(base, "last_update", int(time.time()))

     return {
-        "book_id": book_id,
+        "book_idx": book_idx,
         "total": total,
         "started": True,
-        "group_id": group_result.id,
+        "group_id": gid,
     }
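
The entrypoint task is registered under the explicit name scraper.tasks.controller_tasks.start_full_scrape and, per the section-1 comment, is meant to be triggered from /start. A sketch of what that caller could look like, assuming a Flask app and a book_idx query parameter (both are assumptions; the actual web layer is not part of this diff):

# Hypothetical /start route. The controller task does the scraping itself,
# so the web layer only enqueues the Celery task and returns immediately.
from flask import Flask, jsonify, request

from scraper.tasks.controller_tasks import start_full_scrape

app = Flask(__name__)

@app.post("/start")
def start():
    book_idx = request.args.get("book_idx", "")
    if not book_idx:
        return jsonify({"error": "book_idx is required"}), 400
    # .delay() enqueues on the "controller" queue; no .get() anywhere.
    async_result = start_full_scrape.delay(book_idx)
    return jsonify({"book_idx": book_idx, "task_id": async_result.id}), 202
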