Cover download + controller async fix + robust cover parsing

feat/cover-download
peter.fong 2 weeks ago
parent ed341c727a
commit 9a774c4955

@@ -21,9 +21,25 @@ from scraper.ui_log import get_ui_logs
 from celery.result import AsyncResult
+# ⬇⬇⬇ ADDED for cover serving
+from flask import send_from_directory
+import os
 
 app = Flask(__name__)
 
+# =====================================================
+# STATIC FILE SERVING FOR OUTPUT  ← ADDED
+# =====================================================
+OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
+
+@app.route("/output/<path:filename>")
+def serve_output(filename):
+    """Serve output files such as cover.jpg and volumes."""
+    return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
+
 # =====================================================
 # HOME PAGE
 # =====================================================
@@ -54,6 +70,8 @@ def start_scraping():
         "result.html",
         message="Scraping gestart.",
         scraping_task_id=async_result.id,
+        # for result.html cover rendering
+        book_title=None,
     )
@@ -103,8 +121,6 @@ def celery_result(task_id):
 # RUN FLASK
 # =====================================================
 if __name__ == "__main__":
-    import os
     debug = os.getenv("FLASK_DEBUG", "0") == "1"
     host = os.getenv("HOST", "0.0.0.0")
     port = int(os.getenv("PORT", "5000"))

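Side note on the new route: send_from_directory resolves filename inside OUTPUT_ROOT and rejects path traversal on its own. A minimal smoke test, assuming the app runs locally on port 5000 and that an output/MyBook/cover.jpg exists (both the port and the book folder name are illustrative):

import requests

# Hypothetical book folder; substitute a real title under output/.
url = "http://localhost:5000/output/MyBook/cover.jpg"

resp = requests.get(url)
print(resp.status_code)                   # 200 when the file exists
print(resp.headers.get("Content-Type"))   # typically image/jpeg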
@@ -3,6 +3,7 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+import re
 
 from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
@@ -53,7 +54,7 @@ class BookScraper:
             "title": self.book_title,
             "author": self.book_author,
             "description": self.book_description,
-            "cover_url": self.cover_url,
+            "cover_url": self.cover_url,  # ← used by DownloadController
             "book_url": self.url,
             "chapters": [
                 {"num": ch.number, "title": ch.title, "url": ch.url}
@@ -106,12 +107,62 @@ class BookScraper:
     # ------------------------------------------------------------
     def _parse_cover(self, soup):
-        img = soup.find("img", src=lambda v: v and "files/article/image" in v)
-        if not img:
-            log_debug("[BookScraper] No cover found")
+        """
+        Extract correct cover based on book_id path logic.
+        1. primary:  match "/files/article/image/{vol}/{book_id}/"
+        2. fallback: endswith "/{book_id}s.jpg"
+        """
+        # Extract book_id from URL
+        m = re.search(r"/(\d+)\.html$", self.url)
+        if not m:
+            log_debug("[BookScraper] No book_id found in URL → cannot match cover")
             return
-        self.cover_url = urljoin(self.site.root, img.get("src"))
+        book_id = m.group(1)
+
+        # Extract vol folder from URL (bookinfo/<vol>/<id>.html)
+        m2 = re.search(r"/bookinfo/(\d+)/", self.url)
+        volume = m2.group(1) if m2 else None
+        log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
+
+        imgs = soup.find_all("img", src=True)
+        chosen = None
+
+        # --------------------------------------------------------
+        # PRIORITY 1: Path-match
+        #   /files/article/image/{vol}/{book_id}/
+        # --------------------------------------------------------
+        if volume:
+            target_path = f"/files/article/image/{volume}/{book_id}/"
+            for img in imgs:
+                src = img["src"]
+                if target_path in src:
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by PATH: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # PRIORITY 2: endswith "/{book_id}s.jpg"
+        # --------------------------------------------------------
+        if not chosen:
+            target_suffix = f"/{book_id}s.jpg"
+            for img in imgs:
+                src = img["src"]
+                if src.endswith(target_suffix):
+                    chosen = src
+                    log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
+                    break
+
+        # --------------------------------------------------------
+        # No match
+        # --------------------------------------------------------
+        if not chosen:
+            log_debug("[BookScraper] No matching cover found")
+            return
+
+        self.cover_url = urljoin(self.site.root, chosen)
         log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
     # ------------------------------------------------------------

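For context, the new _parse_cover logic hinges on pulling book_id and the volume folder out of the book URL before scanning img tags. A standalone sketch of that extraction, with a made-up piaotia-style URL (the URL shape is inferred from the regexes above):

import re

url = "https://www.piaotia.com/bookinfo/12/12345.html"  # illustrative

book_id = re.search(r"/(\d+)\.html$", url).group(1)   # -> "12345"
m = re.search(r"/bookinfo/(\d+)/", url)
volume = m.group(1) if m else None                    # -> "12"

# PRIORITY 1 path fragment, then the PRIORITY 2 suffix fallback:
print(f"/files/article/image/{volume}/{book_id}/")
print(f"/{book_id}s.jpg")

The {book_id}s.jpg suffix appears to be the site's thumbnail naming convention, which is why it only serves as the fallback when no volume-scoped path matches.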
@@ -3,12 +3,15 @@
 # Purpose:
 #   Build Celery pipelines for all chapters
 #   and pass book_id for abort/progress/log functionality.
+#   + Download and replicate cover image to all volume folders
 # =========================================================
 from celery import group
 from scraper.tasks.pipeline import build_chapter_pipeline
 from logbus.publisher import log
 import os
+import requests
+import shutil
 
 class DownloadController:
@@ -18,6 +21,7 @@ class DownloadController:
     - volume splitting
     - consistent meta propagation
     - book_id-based abort + progress tracking
+    - cover download + volume replication
     """
 
     def __init__(self, book_id: str, scrape_result: dict):
@@ -27,6 +31,7 @@ class DownloadController:
         # Core metadata
         self.title = scrape_result.get("title", "UnknownBook")
         self.chapters = scrape_result.get("chapters", []) or []
+        self.cover_url = scrape_result.get("cover_url")
 
         # Output base dir
         root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@@ -46,6 +51,62 @@ class DownloadController:
             "book_url": scrape_result.get("book_url"),
         }
 
+    # ---------------------------------------------------------
+    # Cover Download
+    # ---------------------------------------------------------
+    def download_cover(self):
+        """Download one cover image into the root of the book folder."""
+        if not self.cover_url:
+            log(f"[CTRL] No cover URL found for '{self.title}'")
+            return
+
+        cover_path = os.path.join(self.book_base, "cover.jpg")
+
+        # HEADERS that bypass 403 hotlink protection
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
+                "Gecko/20100101 Firefox/118.0"
+            ),
+            "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
+        }
+
+        try:
+            log(f"[CTRL] Downloading cover: {self.cover_url}")
+            resp = requests.get(self.cover_url, timeout=10, headers=headers)
+            resp.raise_for_status()
+            with open(cover_path, "wb") as f:
+                f.write(resp.content)
+            log(f"[CTRL] Cover saved to: {cover_path}")
+        except Exception as e:
+            log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
+
+    # ---------------------------------------------------------
+    # Cover Replication to Volumes
+    # ---------------------------------------------------------
+    def replicate_cover_to_volumes(self):
+        """Copy cover.jpg into each existing Volume_xxx directory."""
+        src = os.path.join(self.book_base, "cover.jpg")
+        if not os.path.exists(src):
+            log("[CTRL] No cover.jpg found, replication skipped")
+            return
+
+        try:
+            for entry in os.listdir(self.book_base):
+                if entry.lower().startswith("volume_"):
+                    vol_dir = os.path.join(self.book_base, entry)
+                    dst = os.path.join(vol_dir, "cover.jpg")
+                    shutil.copyfile(src, dst)
+                    log(f"[CTRL] Cover replicated into: {dst}")
+        except Exception as e:
+            log(f"[CTRL] Cover replication failed: {e}")
+
     # ---------------------------------------------------------
     # Volume isolation
     # ---------------------------------------------------------
@@ -69,6 +130,11 @@ class DownloadController:
         )
         log(f"[CTRL] Output root: {self.book_base}")
 
+        # -------------------------------------
+        # 1) Download cover before any pipelines
+        # -------------------------------------
+        self.download_cover()
+
         tasks = []
         for ch in self.chapters:
@@ -94,4 +160,9 @@ class DownloadController:
             f"(book_id={self.book_id}, group_id={async_result.id})"
         )
 
+        # -------------------------------------------------------
+        # 2) AFTER dispatch: cover replication to volume folders
+        # -------------------------------------------------------
+        self.replicate_cover_to_volumes()
+
         return async_result

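replicate_cover_to_volumes is a plain copy into every Volume_* directory that exists when it runs; since start() calls it right after dispatching the asynchronous chapter pipelines, volume folders created later by workers are not covered, which matches the docstring's "each existing Volume_xxx directory". An equivalent standalone sketch using pathlib (the output/MyBook path is illustrative; the controller's version also accepts a lowercase volume_ prefix):

import shutil
from pathlib import Path

book_base = Path("output/MyBook")   # illustrative book folder
src = book_base / "cover.jpg"

if src.exists():
    for vol_dir in book_base.glob("Volume_*"):
        if vol_dir.is_dir():
            shutil.copyfile(src, vol_dir / "cover.jpg")
            print(f"copied cover into {vol_dir}")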
@@ -3,7 +3,7 @@
 # Purpose:
 #   Start the download → parse → save pipeline for a scraped book,
 #   including progress/abort tracking via book_id.
-#   ONLY THE CONTROLLER UPDATES PROGRESS.
+#   ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
 # ============================================================
 
 from celery_app import celery_app
@@ -12,9 +12,6 @@ from logbus.publisher import log
 from scraper.download_controller import DownloadController
 from scraper.progress import (
     set_total,
-    inc_completed,
-    inc_skipped,
-    inc_failed,
 )
 from scraper.abort import abort_requested
@@ -25,11 +22,10 @@ print(">>> [IMPORT] controller_tasks.py loaded")
 def launch_downloads(self, book_id: str, scrape_result: dict):
     """
     Launch the entire pipeline (download → parse → save),
-    AND maintain progress counters.
+    AND initialize progress counters.
 
-    EXPECTS:
-        book_id: ID generated in scraping.start_scrape_book
-        scrape_result: dict with title, author, url, chapters[]
+    Chapter-level progress is updated INSIDE the download/parse/save tasks.
+    This task MUST NOT call .get() on async subtasks (Celery restriction).
     """
 
     title = scrape_result.get("title", "UnknownBook")
@@ -50,46 +46,36 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
     ctl = DownloadController(book_id, scrape_result)
 
     # ------------------------------------------------------------
-    # RUN PIPELINE IN SYNC LOOP
-    # (DownloadController.start() returns per-chapter generator)
+    # START PIPELINES (ASYNC)
+    # Returns a celery group AsyncResult. We DO NOT iterate or get().
+    # Progress & failures are handled by the worker subtasks.
     # ------------------------------------------------------------
     try:
-        for result in ctl.start():  # new generator mode
-            ch = result.get("chapter")
-
-            if result.get("skipped"):
-                inc_skipped(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] SKIPPED chapter {ch}")
-                continue
-
-            if result.get("failed"):
-                inc_failed(book_id)
-                inc_completed(book_id)
-                log(f"[CTRL] FAILED chapter {ch}")
-                continue
-
-            # Normal success
-            inc_completed(book_id)
-            log(f"[CTRL] DONE chapter {ch}")
-
-            # Abort requested mid-run?
-            if abort_requested(book_id):
-                log(f"[CTRL] ABORT after chapter {ch}")
-                break
+        group_result = ctl.start()
+        log(
+            f"[CTRL] Pipelines dispatched for '{title}' "
+            f"(book_id={book_id}, group_id={group_result.id})"
+        )
+
+        # Abort flag set BEFORE tasks start?
+        if abort_requested(book_id):
+            log(f"[CTRL] ABORT requested before tasks start")
+            return {"book_id": book_id, "aborted": True}
     except Exception as exc:
-        log(f"[CTRL] ERROR while processing pipeline: {exc}")
-        inc_failed(book_id)
+        log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
         raise
 
     # ------------------------------------------------------------
-    # FINISHED
+    # CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
+    # (Download/parse/save tasks update progress themselves)
     # ------------------------------------------------------------
-    log(f"[CTRL] Pipeline finished for book_id={book_id}")
+    log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
 
     return {
         "book_id": book_id,
         "total": total,
-        "completed": int(total),  # For safety
+        "started": True,
+        "group_id": group_result.id,
     }

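The controller rewrite follows from a Celery rule: calling .get() on a subtask result from inside a running task raises a RuntimeError ("Never call result.get() within a task!") unless synchronous subtasks are explicitly allowed. A minimal sketch of the dispatch-only pattern used here; the task names are illustrative, not the project's own:

from celery import group, shared_task

@shared_task
def process_chapter(num):
    # Worker-side task: download/parse/save would happen here,
    # and the worker bumps its own progress counters.
    return num

@shared_task
def launch(chapter_nums):
    # Fire-and-forget: apply_async returns a GroupResult immediately.
    result = group(process_chapter.s(n) for n in chapter_nums).apply_async()
    # Do NOT call result.get() here; just hand back the group id.
    return {"group_id": result.id}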
@@ -68,6 +68,24 @@
 <div class="box">{{ message }}</div>
 {% endif %}
 
+<!-- COVER DISPLAY (added) -->
+{% if book_title %}
+<div class="box">
+  <strong>Cover:</strong><br />
+  <img
+    src="/output/{{ book_title }}/cover.jpg"
+    alt="Cover"
+    style="
+      margin-top: 10px;
+      max-width: 250px;
+      border: 1px solid #ccc;
+      border-radius: 4px;
+    "
+    onerror="this.style.display='none'"
+  />
+</div>
+{% endif %}
+
 <div id="statusBox" class="box hidden">
   <div id="statusLine">Status: bezig…</div>
   <div id="progressText"></div>
@@ -122,7 +140,6 @@
       }
 
       function pollProgress() {
-        // FIX → UI keeps rendering, polling only stops repeating
         if (!bookId) return;
 
         fetch(`/progress/${bookId}`)
@@ -137,7 +154,6 @@
               p.skipped || 0
             } | Failed: ${p.failed || 0}`;
 
-            // FAILED LIST
             const failedBox = document.getElementById("failedBox");
             const failedList = document.getElementById("failedList");
@@ -151,7 +167,6 @@
               });
             }
 
-            // STATUS
             if (p.abort) {
               document.getElementById("statusLine").innerText = "ABORTED";
               polling = false;
@@ -162,7 +177,6 @@
               document.getElementById("statusLine").innerText = "Bezig…";
             }
 
-            // STOP repetitive polling, but keep rendering
             if (polling) setTimeout(pollProgress, 1000);
           })
           .catch(() => {

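One interaction worth noting: start_scraping above passes book_title=None, so the {% if book_title %} guard keeps the cover box hidden on the start page; it only renders once a view supplies a real title. A hypothetical sketch of such a view (route name and parameters are illustrative):

from flask import Flask, render_template

app = Flask(__name__)

@app.route("/book/<title>")
def show_book(title):
    # A truthy book_title makes result.html render the cover box,
    # pointing at /output/<title>/cover.jpg served by serve_output().
    return render_template("result.html", message=None, book_title=title)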