Cover download + controller async fix + robust cover parsing

feat/cover-download
peter.fong 2 weeks ago
parent ed341c727a
commit 9a774c4955

@ -21,9 +21,25 @@ from scraper.ui_log import get_ui_logs
from celery.result import AsyncResult
# ⬇⬇⬇ ADDED for cover serving
from flask import send_from_directory
import os
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING FOR OUTPUT ← ADDED
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@app.route("/output/<path:filename>")
def serve_output(filename):
"""Serve output files such as cover.jpg and volumes."""
return send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
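A quick way to sanity-check the new route is Flask's test client; the book folder name below is hypothetical and only exists once a scrape has produced it:
# Illustrative check of the /output/<path:filename> route (folder name is made up).
with app.test_client() as client:
    resp = client.get("/output/MyBook/cover.jpg")
    print(resp.status_code)  # 200 if output/MyBook/cover.jpg exists, 404 otherwise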
# =====================================================
# HOME PAGE
# =====================================================
@ -54,6 +70,8 @@ def start_scraping():
"result.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
# for result.html cover rendering
book_title=None,
)
@ -103,8 +121,6 @@ def celery_result(task_id):
# RUN FLASK
# =====================================================
if __name__ == "__main__":
import os
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "5000"))

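With these environment variables, a local run might look like FLASK_DEBUG=1 HOST=127.0.0.1 PORT=8080 python app.py (the entrypoint filename is an assumption; it is not shown in this diff).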
@ -3,6 +3,7 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
@ -53,7 +54,7 @@ class BookScraper:
"title": self.book_title,
"author": self.book_author,
"description": self.book_description,
"cover_url": self.cover_url,
"cover_url": self.cover_url, # ← used by DownloadController
"book_url": self.url,
"chapters": [
{"num": ch.number, "title": ch.title, "url": ch.url}
@ -106,12 +107,62 @@ class BookScraper:
# ------------------------------------------------------------
def _parse_cover(self, soup):
img = soup.find("img", src=lambda v: v and "files/article/image" in v)
if not img:
log_debug("[BookScraper] No cover found")
"""
Extract correct cover based on book_id path logic.
1. primary: match "/files/article/image/{vol}/{book_id}/"
2. fallback: endswith "/{book_id}s.jpg"
"""
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", self.url)
if not m:
log_debug("[BookScraper] No book_id found in URL → cannot match cover")
return
self.cover_url = urljoin(self.site.root, img.get("src"))
book_id = m.group(1)
# Extract vol folder from URL (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", self.url)
volume = m2.group(1) if m2 else None
log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
imgs = soup.find_all("img", src=True)
chosen = None
# --------------------------------------------------------
# PRIORITY 1: Path-match
# /files/article/image/{vol}/{book_id}/
# --------------------------------------------------------
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
log_debug(f"[BookScraper] Cover matched by PATH: {src}")
break
# --------------------------------------------------------
# PRIORITY 2: endswith "/{book_id}s.jpg"
# --------------------------------------------------------
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
break
# --------------------------------------------------------
# No match
# --------------------------------------------------------
if not chosen:
log_debug("[BookScraper] No matching cover found")
return
self.cover_url = urljoin(self.site.root, chosen)
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------

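To make the cover matching concrete, here is how the two regexes and both priorities resolve against a piaotia-style URL (the URL and image path are invented for illustration):

import re
from urllib.parse import urljoin

url = "https://www.piaotia.com/bookinfo/14/14767.html"      # hypothetical book URL
book_id = re.search(r"/(\d+)\.html$", url).group(1)          # "14767"
volume = re.search(r"/bookinfo/(\d+)/", url).group(1)        # "14"
target_path = f"/files/article/image/{volume}/{book_id}/"    # PRIORITY 1: substring match
target_suffix = f"/{book_id}s.jpg"                           # PRIORITY 2: suffix match
src = "/files/article/image/14/14767/14767s.jpg"             # sample <img src> value
assert target_path in src and src.endswith(target_suffix)
print(urljoin("https://www.piaotia.com/", src))
# -> https://www.piaotia.com/files/article/image/14/14767/14767s.jpg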
@ -3,12 +3,15 @@
# Purpose:
# Build Celery pipelines for all chapters
# and pass book_id for abort/progress/log functionality.
# + Download the cover image and replicate it to all volume folders
# =========================================================
from celery import group
from scraper.tasks.pipeline import build_chapter_pipeline
from logbus.publisher import log
import os
import requests
import shutil
class DownloadController:
@ -18,6 +21,7 @@ class DownloadController:
- volume splitting
- consistent meta propagation
- book_id-based abort + progress tracking
- cover download + volume replication
"""
def __init__(self, book_id: str, scrape_result: dict):
@ -27,6 +31,7 @@ class DownloadController:
# Core metadata
self.title = scrape_result.get("title", "UnknownBook")
self.chapters = scrape_result.get("chapters", []) or []
self.cover_url = scrape_result.get("cover_url")
# Output base dir
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@ -46,6 +51,62 @@ class DownloadController:
"book_url": scrape_result.get("book_url"),
}
# ---------------------------------------------------------
# Cover Download
# ---------------------------------------------------------
def download_cover(self):
"""Download one cover image into the root of the book folder."""
if not self.cover_url:
log(f"[CTRL] No cover URL found for '{self.title}'")
return
cover_path = os.path.join(self.book_base, "cover.jpg")
# HEADERS that bypass 403 hotlink protection
headers = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
),
"Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
}
try:
log(f"[CTRL] Downloading cover: {self.cover_url}")
resp = requests.get(self.cover_url, timeout=10, headers=headers)
resp.raise_for_status()
with open(cover_path, "wb") as f:
f.write(resp.content)
log(f"[CTRL] Cover saved to: {cover_path}")
except Exception as e:
log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
# ---------------------------------------------------------
# Cover Replication to Volumes
# ---------------------------------------------------------
def replicate_cover_to_volumes(self):
"""Copy cover.jpg into each existing Volume_xxx directory."""
src = os.path.join(self.book_base, "cover.jpg")
if not os.path.exists(src):
log("[CTRL] No cover.jpg found, replication skipped")
return
try:
for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"):
vol_dir = os.path.join(self.book_base, entry)
dst = os.path.join(vol_dir, "cover.jpg")
shutil.copyfile(src, dst)
log(f"[CTRL] Cover replicated into: {dst}")
except Exception as e:
log(f"[CTRL] Cover replication failed: {e}")
# ---------------------------------------------------------
# Volume isolation
# ---------------------------------------------------------
@ -69,6 +130,11 @@ class DownloadController:
)
log(f"[CTRL] Output root: {self.book_base}")
# -------------------------------------
# 1) Download cover before any pipelines
# -------------------------------------
self.download_cover()
tasks = []
for ch in self.chapters:
@ -94,4 +160,9 @@ class DownloadController:
f"(book_id={self.book_id}, group_id={async_result.id})"
)
# -------------------------------------------------------
# 2) AFTER dispatch: cover replication to volume folders
# -------------------------------------------------------
self.replicate_cover_to_volumes()
return async_result

@ -3,7 +3,7 @@
# Purpose:
# Start the download → parse → save pipeline for a scraped book,
# including progress/abort tracking via book_id.
# ONLY THE CONTROLLER UPDATES PROGRESS.
# ONLY THE CONTROLLER UPDATES PROGRESS (initial total).
# ============================================================
from celery_app import celery_app
@ -12,9 +12,6 @@ from logbus.publisher import log
from scraper.download_controller import DownloadController
from scraper.progress import (
set_total,
inc_completed,
inc_skipped,
inc_failed,
)
from scraper.abort import abort_requested
@ -25,11 +22,10 @@ print(">>> [IMPORT] controller_tasks.py loaded")
def launch_downloads(self, book_id: str, scrape_result: dict):
"""
Launch the entire pipeline (download → parse → save),
AND maintain progress counters.
AND initialize progress counters.
EXPECTS:
book_id: ID generated in scraping.start_scrape_book
scrape_result: dict with title, author, url, chapters[]
Chapter-level progress is updated INSIDE the download/parse/save tasks.
This task MUST NOT call .get() on async subtasks (Celery restriction).
"""
title = scrape_result.get("title", "UnknownBook")
@ -50,46 +46,36 @@ def launch_downloads(self, book_id: str, scrape_result: dict):
ctl = DownloadController(book_id, scrape_result)
# ------------------------------------------------------------
# RUN PIPELINE IN SYNC LOOP
# (DownloadController.start() returns per-chapter generator)
# START PIPELINES (ASYNC)
# Returns a celery group AsyncResult. We DO NOT iterate or get().
# Progress & failures are handled by the worker subtasks.
# ------------------------------------------------------------
try:
for result in ctl.start(): # new generator mode
ch = result.get("chapter")
if result.get("skipped"):
inc_skipped(book_id)
inc_completed(book_id)
log(f"[CTRL] SKIPPED chapter {ch}")
continue
if result.get("failed"):
inc_failed(book_id)
inc_completed(book_id)
log(f"[CTRL] FAILED chapter {ch}")
continue
# Normal success
inc_completed(book_id)
log(f"[CTRL] DONE chapter {ch}")
# Abort requested mid-run?
if abort_requested(book_id):
log(f"[CTRL] ABORT after chapter {ch}")
break
group_result = ctl.start()
log(
f"[CTRL] Pipelines dispatched for '{title}' "
f"(book_id={book_id}, group_id={group_result.id})"
)
# Abort flag set BEFORE tasks start?
if abort_requested(book_id):
log(f"[CTRL] ABORT requested before tasks start")
return {"book_id": book_id, "aborted": True}
except Exception as exc:
log(f"[CTRL] ERROR while processing pipeline: {exc}")
inc_failed(book_id)
log(f"[CTRL] ERROR while dispatching pipelines: {exc}")
raise
# ------------------------------------------------------------
# FINISHED
# CONTROLLER DOES NOT WAIT FOR SUBTASK RESULTS
# (Download/parse/save tasks update progress themselves)
# ------------------------------------------------------------
log(f"[CTRL] Pipeline finished for book_id={book_id}")
log(f"[CTRL] Controller finished dispatch for book_id={book_id}")
return {
"book_id": book_id,
"total": total,
"completed": int(total), # For safety
"started": True,
"group_id": group_result.id,
}

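For background on why the synchronous loop had to go: Celery raises an error when a task calls .get() on its own subtasks, so the controller can only dispatch and return. A generic sketch, with names that are not from this repo:

from celery import Celery, group

celery_app = Celery("sketch", broker="memory://", backend="cache+memory://")

@celery_app.task
def child(i):
    return i

@celery_app.task(bind=True)
def parent(self):
    res = group(child.s(i) for i in range(3)).apply_async()
    # Calling res.get() here raises RuntimeError("Never call result.get() within a task!")
    # unless disable_sync_subtasks=False is passed, which risks deadlocking the worker pool.
    return {"group_id": res.id}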
@ -68,6 +68,24 @@
<div class="box">{{ message }}</div>
{% endif %}
<!-- COVER DISPLAY (added) -->
{% if book_title %}
<div class="box">
<strong>Cover:</strong><br />
<img
src="/output/{{ book_title }}/cover.jpg"
alt="Cover"
style="
margin-top: 10px;
max-width: 250px;
border: 1px solid #ccc;
border-radius: 4px;
"
onerror="this.style.display='none'"
/>
</div>
{% endif %}
<div id="statusBox" class="box hidden">
<div id="statusLine">Status: bezig…</div>
<div id="progressText"></div>
@ -122,7 +140,6 @@
}
function pollProgress() {
// FIX → UI keeps rendering; only the polling repeat stops
if (!bookId) return;
fetch(`/progress/${bookId}`)
@ -137,7 +154,6 @@
p.skipped || 0
} | Failed: ${p.failed || 0}`;
// FAILED LIST
const failedBox = document.getElementById("failedBox");
const failedList = document.getElementById("failedList");
@ -151,7 +167,6 @@
});
}
// STATUS
if (p.abort) {
document.getElementById("statusLine").innerText = "ABORTED";
polling = false;
@ -162,7 +177,6 @@
document.getElementById("statusLine").innerText = "Bezig…";
}
// STOP repeated polling, but keep rendering
if (polling) setTimeout(pollProgress, 1000);
})
.catch(() => {

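For reference, the polling script assumes /progress/<book_id> returns JSON with at least the fields below (values are invented; the real schema lives in scraper.progress and the Flask progress route, neither of which appears in this diff):

{
  "total": 120,
  "completed": 87,
  "skipped": 2,
  "failed": 1,
  "abort": false
}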