|
|
|
|
@ -3,12 +3,15 @@
|
|
|
|
|
# Purpose:
|
|
|
|
|
# Build Celery pipelines for all chapters
|
|
|
|
|
# and pass book_id for abort/progress/log functionality.
|
|
|
|
|
# + Download and replicate cover image to all volume folders
|
|
|
|
|
# =========================================================
|
|
|
|
|
|
|
|
|
|
from celery import group
|
|
|
|
|
from scraper.tasks.pipeline import build_chapter_pipeline
|
|
|
|
|
from logbus.publisher import log
|
|
|
|
|
import os
|
|
|
|
|
import requests
|
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DownloadController:
|
|
|
|
|
@ -18,6 +21,7 @@ class DownloadController:
|
|
|
|
|
- volume splitting
|
|
|
|
|
- consistent meta propagation
|
|
|
|
|
- book_id-based abort + progress tracking
|
|
|
|
|
- cover download + volume replication
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, book_id: str, scrape_result: dict):
|
|
|
|
|
@ -27,6 +31,7 @@ class DownloadController:
|
|
|
|
|
# Core metadata
|
|
|
|
|
self.title = scrape_result.get("title", "UnknownBook")
|
|
|
|
|
self.chapters = scrape_result.get("chapters", []) or []
|
|
|
|
|
self.cover_url = scrape_result.get("cover_url")
|
|
|
|
|
|
|
|
|
|
# Output base dir
|
|
|
|
|
root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
|
|
|
|
|
@ -46,6 +51,62 @@ class DownloadController:
|
|
|
|
|
"book_url": scrape_result.get("book_url"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
# Cover Download
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
def download_cover(self):
    """Download the cover image into the root of the book folder.

    The image is fetched from ``self.cover_url`` and stored as
    ``cover.jpg`` directly under ``self.book_base``.

    This is a best-effort step: when no cover URL is known the method
    logs and returns, and any failure while downloading or writing is
    logged rather than raised.

    Returns:
        None
    """
    if not self.cover_url:
        log(f"[CTRL] No cover URL found for '{self.title}'")
        return

    cover_path = os.path.join(self.book_base, "cover.jpg")
    # The payload is written to a sibling temp file and renamed into place,
    # so a failed write never leaves a truncated cover.jpg behind.
    tmp_path = cover_path + ".part"

    # Browser-like headers, intended to get past 403 hotlink protection.
    # The Referer is the book page when known, otherwise the site root.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
            "Gecko/20100101 Firefox/118.0"
        ),
        "Referer": self.scrape_result.get("book_url") or "https://www.piaotia.com/",
    }

    try:
        log(f"[CTRL] Downloading cover: {self.cover_url}")

        resp = requests.get(self.cover_url, timeout=10, headers=headers)
        resp.raise_for_status()

        # This method runs before any chapter pipeline, so the book folder
        # may not have been created yet.
        os.makedirs(self.book_base, exist_ok=True)

        with open(tmp_path, "wb") as f:
            f.write(resp.content)
        os.replace(tmp_path, cover_path)

        log(f"[CTRL] Cover saved to: {cover_path}")

    except Exception as e:
        # Deliberately broad: a missing cover must not abort the download run.
        log(f"[CTRL] Cover download failed: {e} (url={self.cover_url})")
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (temp file was never created) or it
            # cannot be removed; either way there is nothing more to do.
            pass
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
# Cover Replication to Volumes
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
def replicate_cover_to_volumes(self):
    """Copy ``cover.jpg`` into each existing ``Volume_*`` directory.

    The source is ``cover.jpg`` directly under ``self.book_base``. Every
    sub-directory of ``self.book_base`` whose name starts with
    ``volume_`` (case-insensitive) receives its own copy.

    This is a best-effort step: if the source cover is missing the method
    logs and returns, and a failure for one volume is logged without
    preventing the remaining volumes from being processed.

    Returns:
        None
    """
    src = os.path.join(self.book_base, "cover.jpg")
    if not os.path.exists(src):
        log("[CTRL] No cover.jpg found, replication skipped")
        return

    try:
        entries = os.listdir(self.book_base)
    except Exception as e:
        log(f"[CTRL] Cover replication failed: {e}")
        return

    for entry in entries:
        if not entry.lower().startswith("volume_"):
            continue

        vol_dir = os.path.join(self.book_base, entry)
        # Only real directories are volume folders; a stray file whose name
        # happens to start with "volume_" must not be treated as one.
        if not os.path.isdir(vol_dir):
            continue

        dst = os.path.join(vol_dir, "cover.jpg")

        # Errors are handled per volume so that one failing copy does not
        # stop the cover from reaching the other volumes.
        try:
            shutil.copyfile(src, dst)
            log(f"[CTRL] Cover replicated into: {dst}")
        except Exception as e:
            log(f"[CTRL] Cover replication failed: {e}")
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
# Volume isolation
|
|
|
|
|
# ---------------------------------------------------------
|
|
|
|
|
@ -69,6 +130,11 @@ class DownloadController:
|
|
|
|
|
)
|
|
|
|
|
log(f"[CTRL] Output root: {self.book_base}")
|
|
|
|
|
|
|
|
|
|
# -------------------------------------
|
|
|
|
|
# 1) Download cover before any pipelines
|
|
|
|
|
# -------------------------------------
|
|
|
|
|
self.download_cover()
|
|
|
|
|
|
|
|
|
|
tasks = []
|
|
|
|
|
|
|
|
|
|
for ch in self.chapters:
|
|
|
|
|
@ -94,4 +160,9 @@ class DownloadController:
|
|
|
|
|
f"(book_id={self.book_id}, group_id={async_result.id})"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# -------------------------------------------------------
|
|
|
|
|
# 2) AFTER dispatch: cover replication to volume folders
|
|
|
|
|
# -------------------------------------------------------
|
|
|
|
|
self.replicate_cover_to_volumes()
|
|
|
|
|
|
|
|
|
|
return async_result
|
|
|
|
|
|