- Removed old scraping→controller chain (no more async .get)
- New DownloadController pipeline structure
- Unified Redis Book State Model (book:{idx}:state); see the read-side sketch after this list
- Updated all Celery tasks for unified IDs
- Removed old scraper/db.py
- Updated templates and dashboard components
- Added debug Inspect State system with bookcard preview
- Updated JS dashboard pipeline refresh
- Updated init_service + scrape_engine
- Improved abort logic
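
The unified state model replaces the old flat `book:{id}:*` string keys with a single Redis hash per book. A minimal read-side sketch of how a dashboard or Inspect State endpoint can consume it (`read_book_state` is a hypothetical helper, not part of this commit; it assumes the same `REDIS_BROKER` setting the controller uses and the hash fields written in the diff below):

```python
import os

import redis


def read_book_state(book_idx: str) -> dict:
    """Fetch the unified per-book state hash in one round trip."""
    r = redis.Redis.from_url(
        os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
        decode_responses=True,
    )
    # Fields written by launch_downloads(): title, status, chapters_total,
    # chapters_download_done, chapters_download_skipped, chapters_parsed_done,
    # audio_done, audio_skipped, last_update.
    return r.hgetall(f"book:{book_idx}:state")
```

One HGETALL per book replaces the handful of per-key GETs the old model needed, which is what keeps the dashboard's pipeline refresh cheap.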
Branch: feature/bookstate-progress-fix
Parent: feb8ca60d7
Commit: 3a62dfae79
```diff
--- a/scraper/tasks/controller_tasks.py
+++ b/scraper/tasks/controller_tasks.py
@@ -1,109 +1,167 @@
 # ============================================================
 # File: scraper/tasks/controller_tasks.py
 # Purpose:
-# Start the download → parse → save pipeline for a scraped book,
-# including progress/abort tracking via book_id.
-# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
+# FULL scrape entrypoint + launching download/parse/save pipelines.
+# NO result.get() anywhere. Scraping is done inline.
 # ============================================================
 
 from celery_app import celery_app
 from logbus.publisher import log
 
-from scraper.download_controller import DownloadController
+import os
+import time
+import redis
 from urllib.parse import urlparse
 
 from scraper.logger_decorators import logcall
-import redis
-import os
 from scraper.abort import abort_requested
-from db.repository import set_chapters_total
+
+from scraper.services.scrape_engine import ScrapeEngine
+from scraper.services.site_resolver import SiteResolver
+
+from db.repository import fetch_book, set_chapters_total
+from scraper.download_controller import DownloadController
 
 
 print(">>> [IMPORT] controller_tasks.py loaded")
 
 
-@celery_app.task(bind=True, queue="controller", ignore_result=False)
+# =============================================================
+# 1) PUBLIC ENTRYPOINT — CALLED FROM /start
+# =============================================================
+@celery_app.task(
+    bind=True,
+    queue="controller",
+    ignore_result=False,
+    name="scraper.tasks.controller_tasks.start_full_scrape",
+)
 @logcall
-def launch_downloads(self, book_id: str, scrape_result: dict):
+def start_full_scrape(self, book_idx: str):
     """
-    Launch the entire pipeline (download → parse → save),
-    AND initialize progress counters.
+    FULL SCRAPE ENTRYPOINT.
+    Scraping is done inline → no Celery .get() needed.
+    """
+
+    log(f"[CTRL] start_full_scrape(book_idx={book_idx})")
+
+    # Abort before doing anything
+    if abort_requested(book_idx):
+        log(f"[CTRL] PRE-ABORT flag detected for {book_idx}")
+        return {"book_idx": book_idx, "aborted": True, "reason": "pre-abort"}
+
+    # --------------------------------------------------------
+    # 1) Load book metadata from SQLite
+    # --------------------------------------------------------
+    book = fetch_book(book_idx)
+    if not book:
+        msg = f"[CTRL] Book '{book_idx}' not found in DB"
+        log(msg)
+        raise ValueError(msg)
+
+    url = book.get("book_url")
+    if not url:
+        msg = f"[CTRL] No book_url stored for {book_idx}"
+        log(msg)
+        raise ValueError(msg)
+
+    # --------------------------------------------------------
+    # 2) INLINE SCRAPE (fast, no Celery wait)
+    # --------------------------------------------------------
+    site = SiteResolver.resolve(url)
 
-    Chapter-level progress is updated INSIDE the download/parse/save tasks.
-    This task MUST NOT call .get() on async subtasks (Celery restriction).
+    try:
+        scrape_result = ScrapeEngine.fetch_metadata_and_chapters(site, url)
+        log(f"[CTRL] Scrape OK for {book_idx}: {scrape_result.get('title')}")
+    except Exception as e:
+        log(f"[CTRL] ERROR during scrape of {book_idx}: {e}")
+        raise
+
+    # --------------------------------------------------------
+    # 3) Continue → dispatch pipelines
+    # --------------------------------------------------------
+    return launch_downloads(book_idx, scrape_result)
+
+
+# =============================================================
+# 2) PIPELINE DISPATCH (NOT a Celery task)
+# =============================================================
+@logcall
+def launch_downloads(book_idx: str, scrape_result: dict):
+    """
+    Launches the entire processing pipeline:
+    - initialize Redis UI state
+    - initialize SQLite totals
+    - dispatch per-chapter pipelines via DownloadController
     """
 
     title = scrape_result.get("title", "UnknownBook")
     chapters = scrape_result.get("chapters", []) or []
     total = len(chapters)
 
     # ------------------------------------------------------------
-    # INIT BOOK STATE MODEL (required for Active Books dashboard)
+    # INIT REDIS STATE
     # ------------------------------------------------------------
 
     broker_url = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
     parsed = urlparse(broker_url)
 
-    state = redis.Redis(
+    r = redis.Redis(
         host=parsed.hostname,
         port=parsed.port,
         db=int(parsed.path.strip("/")),
         decode_responses=True,
     )
 
-    # Book metadata
-    state.set(f"book:{book_id}:title", title)
-    state.set(f"book:{book_id}:status", "starting")
+    base = f"book:{book_idx}:state"
 
-    # Download counters
-    state.set(f"book:{book_id}:download:total", total)
-    state.set(f"book:{book_id}:download:done", 0)
-
-    # Audio counters (start at zero)
-    state.set(f"book:{book_id}:audio:done", 0)
+    r.hset(base, "title", title)
+    r.hset(base, "status", "starting")
+    r.hset(base, "chapters_total", total)
+    r.hset(base, "chapters_download_done", 0)
+    r.hset(base, "chapters_download_skipped", 0)
+    r.hset(base, "chapters_parsed_done", 0)
+    r.hset(base, "audio_done", 0)
+    r.hset(base, "audio_skipped", 0)
+    r.hset(base, "last_update", int(time.time()))
 
     # ------------------------------------------------------------
-    # INIT PROGRESS
+    # INIT SQLITE SNAPSHOT
     # ------------------------------------------------------------
-    set_chapters_total(book_id, total)
+    try:
+        set_chapters_total(book_idx, total)
+    except Exception as e:
+        log(f"[CTRL] ERROR updating SQLite totals: {e}")
+        raise
 
-    log(f"[CTRL] Progress initialized for {book_id}: total={total}")
+    log(f"[CTRL] Initialized totals for {book_idx}: {total}")
 
     # ------------------------------------------------------------
-    # BUILD CONTROLLER
+    # ABORT CHECK BEFORE LAUNCHING JOBS
     # ------------------------------------------------------------
-    ctl = DownloadController(book_id, scrape_result)
+    if abort_requested(book_idx):
+        log(f"[CTRL] ABORT flag detected — stopping BEFORE dispatch for {book_idx}")
+        r.hset(base, "status", "aborted")
+        return {"book_idx": book_idx, "aborted": True, "reason": "abort-before-start"}
 
     # ------------------------------------------------------------
-    # START PIPELINES (ASYNC)
-    # Returns a celery group AsyncResult. We DO NOT iterate or get().
-    # Progress & failures are handled by the worker subtasks.
+    # BUILD + DISPATCH PER-CHAPTER PIPELINES
     # ------------------------------------------------------------
-    try:
-        group_result = ctl.start()
+    controller = DownloadController(book_idx, scrape_result)
 
-        log(
-            f"[CTRL] Pipelines dispatched for '{title}' "
-            f"(book_id={book_id}, group_id={group_result.id})"
-        )
-
-        # Abort flag set BEFORE tasks start?
-        if abort_requested(book_id):
-            log(f"[CTRL] ABORT requested before tasks start")
-            return {"book_id": book_id, "aborted": True}
-
-    except Exception as exc:
-        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
+    try:
+        group_result = controller.start()
+        gid = getattr(group_result, "id", None)
+        log(f"[CTRL] Pipelines dispatched for {book_idx} (group_id={gid})")
+    except Exception as e:
+        log(f"[CTRL] ERROR dispatching pipelines for {book_idx}: {e}")
         raise
 
-    # ------------------------------------------------------------
-    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
-    # (Download/parse/save tasks update progress themselves)
-    # ------------------------------------------------------------
-    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
+    # Update UI state to "downloading"
+    r.hset(base, "status", "downloading")
+    r.hset(base, "last_update", int(time.time()))
 
     return {
-        "book_id": book_id,
+        "book_idx": book_idx,
         "total": total,
         "started": True,
-        "group_id": group_result.id,
+        "group_id": gid,
     }
```
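
`abort_requested()` is imported from `scraper/abort.py`, which is not part of this diff; the controller only relies on a boolean check keyed by `book_idx`. A plausible Redis-flag sketch of that contract (an assumption about the module, not its actual contents):

```python
import os

import redis

# Assumed: same broker the controller connects to.
_r = redis.Redis.from_url(
    os.getenv("REDIS_BROKER", "redis://redis:6379/0"),
    decode_responses=True,
)


def request_abort(book_idx: str) -> None:
    # Hypothetical setter, e.g. called by the dashboard's abort button.
    _r.set(f"book:{book_idx}:abort", 1)


def abort_requested(book_idx: str) -> bool:
    # Checked by the controller before the inline scrape and again before dispatch.
    return _r.get(f"book:{book_idx}:abort") is not None
```

Checking the flag at task start and again right before dispatch is what keeps a late abort from queueing hundreds of per-chapter pipelines; on the web side, /start presumably only has to queue `start_full_scrape.delay(book_idx)` and return.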