init works.

feature/bookstate-progress-fix
peter.fong 1 week ago
parent f7f08fa45c
commit 292c9246a1

.gitignore

@@ -12,3 +12,5 @@
.env
**/.env
log.txt
**/static/covers/

@@ -1,41 +1,36 @@
# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
from dotenv import load_dotenv
load_dotenv()
import os
import redis
from flask import Flask, render_template, request, jsonify, send_from_directory
print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from db.db import init_db
from celery.result import AsyncResult
init_db() # ensure DB schema exists before Flask starts
from flask import Flask, render_template, request, jsonify
from scraper.logger import log_debug
# Abort + Progress (per book_id)
from scraper.abort import set_abort
from scraper.progress import get_progress
# UI LOGS (GLOBAL — no book_id)
from scraper.ui_log import get_ui_logs, reset_ui_logs
from celery.result import AsyncResult
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.state import state as r
# Cover serving
from flask import send_from_directory
import os
from scraper.services.init_service import InitService
import redis
from db.repository import get_registered_books
# INIT DB
init_db()
# Flask
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING FOR OUTPUT
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@@ -46,26 +41,96 @@ def serve_output(filename):
# =====================================================
# HOME PAGE
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
def index():
return render_template("index.html")
@app.route("/dashboard", methods=["GET"])
def dashboard():
logs_list = get_ui_logs() or []
return render_template(
"dashboard/dashboard.html",
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
logs=logs_list,
)
@app.route("/book/<book_id>")
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
return render_template(
"dashboard/book_detail.html",
book_id=book_id,
title=title,
logs=get_ui_logs(),
)
# =====================================================
# START SCRAPING (async via Celery)
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT)
# =====================================================
# CORRECT PATH — services/ is root-level
@app.route("/init", methods=["POST"])
def init_book():
"""
INIT-flow:
- user enters URL
- lightweight metadata fetch
- insert into SQLite as 'registered'
- return dashboard HTML (NOT JSON)
"""
url = request.form.get("url", "").strip()
if not url:
return render_template(
"dashboard/dashboard.html",
error="Geen URL opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
try:
result = InitService.execute(url)
msg = f"Boek geregistreerd: {result.get('title')}"
return render_template(
"dashboard/dashboard.html",
message=msg,
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
logs=get_ui_logs(),
)
except Exception as e:
log_debug(f"[INIT] ERROR: {e}")
return render_template(
"dashboard/dashboard.html",
error=f"INIT mislukt: {e}",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
@app.route("/start", methods=["POST"])
def start_scraping():
url = request.form.get("url", "").strip()
if not url:
# ★ FIX: the dashboard must always be passed books + logs
return render_template(
"dashboard/dashboard.html",
error="Geen URL opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
@@ -78,27 +143,15 @@ def start_scraping():
queue="scraping",
)
# ★ FIX: render the dashboard immediately with current data
return render_template(
"dashboard/dashboard.html",
scraping_task_id=async_result.id,
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
# =====================================================
# CLEAR UI LOGS
# =====================================================
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
# =====================================================
# ABORT (per book_id)
# =====================================================
@app.route("/abort/<book_id>", methods=["POST"])
def abort_download(book_id):
log_debug(f"[WEB] Abort requested for book: {book_id}")
@@ -107,87 +160,10 @@ def abort_download(book_id):
# =====================================================
# PROGRESS (per book_id)
# =====================================================
@app.route("/progress/<book_id>", methods=["GET"])
def progress(book_id):
return jsonify(get_progress(book_id))
# =====================================================
# CELERY RESULT → return book_id
# =====================================================
@app.route("/celery-result/<task_id>", methods=["GET"])
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
return jsonify({"ready": False})
# =====================================================
# API: book status new model
# =====================================================
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
au_total = dl_total
return {
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
# =====================================================
# REDIS BACKEND — BOOK STATE MODEL
# SECTION 3 — API ROUTES (JSON)
# =====================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def list_active_booksold():
"""Return list of active books from Redis Book State Model."""
keys = r.keys("book:*:state")
books = []
for key in keys:
book_id = key.split(":")[1]
print(book_id)
books.append(getStatus(book_id))
return books
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
return books
# =====================================================
# API: list all active books
# =====================================================
@app.route("/api/books")
def api_books():
return jsonify(list_active_books())
@@ -195,45 +171,50 @@ def api_books():
@app.route("/api/book/<book_id>/status")
def api_book_status(book_id):
return jsonify(getStatus(book_id))
# =====================================================
# API: book logs
# =====================================================
@app.route("/api/book/<book_id>/logs")
def api_book_logs(book_id):
logs = r.lrange(f"logs:{book_id}", 0, -1) or []
return jsonify(logs)
# =====================================================
# VIEW: DASHBOARD
# =====================================================
@app.route("/dashboard")
def dashboard():
logs_list = get_ui_logs() or []
# ★ FIX: the dashboard must always receive books + logs
return render_template(
"dashboard/dashboard.html",
books=list_active_books(),
logs=logs_list, # the dashboard gets a LIST, not a dict
)
@app.route("/progress/<book_id>")
def progress(book_id):
return jsonify(get_progress(book_id))
@app.route("/celery-result/<task_id>")
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
return jsonify({"ready": False})
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
@app.route("/logs", methods=["GET"])
def logs():
try:
last_index = int(request.args.get("last_index", -1))
except (TypeError, ValueError):
last_index = -1
new_lines, total = get_ui_logs_delta(last_index)
return jsonify({"lines": new_lines, "total": total})
# =====================================================
# VIEW: BOOK DETAIL PAGE
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/book/<book_id>")
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
return render_template(
"dashboard/book_detail.html",
book_id=book_id,
title=title,
logs=get_ui_logs(),
)
@app.route("/debug/redis-keys")
@@ -254,37 +235,65 @@ def debug_redis_keys():
return jsonify(results)
# ============================================================
# Rolling log endpoint (no new file)
# ============================================================
from flask import jsonify, request
# =====================================================
# ROLLING LOG ENDPOINT — DELTA POLLING VIA ui_log
# =====================================================
from scraper.ui_log import get_ui_logs_delta
@app.route("/logs", methods=["GET"])
def logs():
"""
Delta log delivery for WebGUI.
Browser sends ?last_index=N, we return only new lines.
"""
try:
last_index = int(request.args.get("last_index", -1))
except:
last_index = -1
new_lines, total = get_ui_logs_delta(last_index)
return jsonify({"lines": new_lines, "total": total})
# =====================================================
# DB DEBUG: LIST ALL BOOKS FROM SQLITE
# =====================================================
from db.repository import fetch_all_books
@app.route("/api/db/books")
def api_db_books():
"""
Return ALL books stored in SQLite including INIT-only entries.
Useful to verify that /init wrote correct metadata.
"""
try:
books = fetch_all_books()
return jsonify({"status": "ok", "books": books})
except Exception as e:
return jsonify({"status": "error", "message": str(e)}), 500
# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
return {
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": dl_total,
}
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
return books
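# -----------------------------------------------------------------
# Hedged sketch of the Redis writes that getStatus() and
# list_active_books() above expect. The key pattern book:<id>:state
# and the field names are taken from getStatus(); the connection URL,
# the example values and this helper name are illustrative only.
# Relies on the os/redis imports already at the top of this file.
# -----------------------------------------------------------------
def _example_seed_book_state(book_id="12345"):
    rr = redis.Redis.from_url(
        os.getenv("REDIS_BROKER", "redis://redis:6379/0"), decode_responses=True
    )
    rr.hset(
        f"book:{book_id}:state",
        mapping={
            "title": "Example Book",
            "status": "downloading",
            "chapters_total": 100,
            "chapters_download_done": 0,
            "chapters_download_skipped": 0,
            "audio_done": 0,
        },
    )
    # A worker would bump the counter as each chapter finishes:
    rr.hincrby(f"book:{book_id}:state", "chapters_download_done", 1)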
# =====================================================
# RUN FLASK
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")

@@ -3,12 +3,10 @@
# Purpose:
# Raw SQLite engine for BookScraper.
# Provides ONLY low-level DB primitives.
# - Connection management (WAL mode)
# - init_db() schema creation
# - Connection management (existing DELETE journal mode)
# - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write
# - raw fetch helpers (private)
#
# All business logic belongs in repository.py.
# ============================================================
import os
@@ -48,10 +46,14 @@ def enable_wal_mode(conn):
# ------------------------------------------------------------
# Schema creation
# Schema creation + SAFE schema upgrades
# ------------------------------------------------------------
def init_db():
conn = get_db()
# --------------------------------------------------------
# BASE SCHEMA (unchanged)
# --------------------------------------------------------
conn.execute(
"""
CREATE TABLE IF NOT EXISTS books (
@@ -76,14 +78,21 @@ def init_db():
)
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE: add description column if missing
# --------------------------------------------------------
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
if "description" not in colnames:
conn.execute("ALTER TABLE books ADD COLUMN description TEXT;")
conn.commit()
# ------------------------------------------------------------
# WRITE OPERATIONS
# ------------------------------------------------------------
def upsert_book(book_id, **fields):
"""
Raw upsert primitive. Repository layer should call this.
"""
conn = get_db()
keys = ["book_id"] + list(fields.keys())
@@ -115,5 +124,6 @@ def _raw_get_book(book_id):
def _raw_get_all_books():
conn = get_db()
# unchanged
cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;")
return [dict(row) for row in cur.fetchall()]

@@ -4,18 +4,20 @@
# High-level BookScraper database interface.
# This is the ONLY module Celery tasks and Flask should use.
#
# Uses low-level primitives from db.db, but exposes
# domain-level operations:
# - fetch_book / fetch_all_books
# - create_or_update_book
# - set_status
# - incrementing counters
# New additions for INIT-flow:
# - register_book()
# - update_book_after_full_scrape()
# - get_registered_books()
# - get_active_books()
#
# Existing functions remain unchanged for backward compatibility.
# ============================================================
from db.db import (
upsert_book,
_raw_get_book,
_raw_get_all_books,
get_db,
)
@@ -32,8 +34,84 @@ def fetch_all_books():
return _raw_get_all_books()
# ============================================================
# NEW — INIT-FLOW SUPPORT
# ============================================================
def register_book(book_id, title, author=None, description=None, cover_url=None):
"""
Create a new book entry with initial metadata.
Called when user enters a URL and presses INIT.
"""
fields = {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"status": "registered",
}
upsert_book(book_id, **fields)
def update_book_after_full_scrape(
book_id,
title=None,
author=None,
description=None,
cover_url=None,
chapters_total=None,
):
"""
Called after a FULL scrape when chapters are known.
Moves the book into 'active' state.
"""
fields = {}
if title is not None:
fields["title"] = title
if author is not None:
fields["author"] = author
if description is not None:
fields["description"] = description
if cover_url is not None:
fields["cover_url"] = cover_url
if chapters_total is not None:
fields["chapters_total"] = chapters_total
fields["status"] = "active"
upsert_book(book_id, **fields)
def get_registered_books():
"""
Return books registered but not yet scraped.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books WHERE status='registered'
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
def get_active_books():
"""
Return books currently in progress.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books
WHERE status IN ('active', 'downloading')
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
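# -----------------------------------------------------------------
# Hedged sketch of the intended 'registered' -> 'active' lifecycle
# using the helpers above. The book_id and metadata values are
# illustrative; only the function names and status values come from
# this module.
# -----------------------------------------------------------------
def _example_init_to_active_flow():
    register_book(
        book_id="12345",
        title="Example Book",
        author="Example Author",
        description="Short synopsis.",
        cover_url=None,
    )
    assert any(b["book_id"] == "12345" for b in get_registered_books())
    # After the full scrape has produced the chapter list:
    update_book_after_full_scrape(book_id="12345", chapters_total=100)
    assert any(b["book_id"] == "12345" for b in get_active_books())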
# ------------------------------------------------------------
# BOOK CREATION / METADATA
# BOOK CREATION / METADATA (existing)
# ------------------------------------------------------------
def create_or_update_book(
book_id,
@@ -64,14 +142,14 @@ def create_or_update_book(
# ------------------------------------------------------------
# STATUS MANAGEMENT
# STATUS MANAGEMENT (existing)
# ------------------------------------------------------------
def set_status(book_id, status):
upsert_book(book_id, status=status)
# ------------------------------------------------------------
# INCREMENTING COUNTERS (atomic)
# INCREMENTING COUNTERS (existing — backward compat only)
# ------------------------------------------------------------
def inc_downloaded(book_id, amount=1):
book = _raw_get_book(book_id)

@@ -1,202 +1,21 @@
# scraper/book_scraper.py
# ============================================================
# File: scraper/book_scraper.py
# Purpose:
# Backwards-compatible wrapper giving same API as before.
# Uses the new engine under the hood.
# ============================================================
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
from scraper.models.book_state import Chapter
from scraper.engine.parser import extract_metadata_full
class BookScraper:
"""
Minimal scraper: only metadata + chapter list.
The DownloadController handles Celery pipelines for:
- download
- parse
- save
"""
def __init__(self, site, url):
self.site = site
def __init__(self, site_scraper, url):
self.site = site_scraper
self.url = url
self.book_title = ""
self.book_author = ""
self.book_description = ""
self.cover_url = ""
self.chapter_base = None
self.chapters = []
# Load custom replacements
extra = load_replacements("replacements.txt")
self.site.replacements.update(extra)
# ------------------------------------------------------------
def execute(self):
"""Main entry point. Returns metadata + chapter URLs."""
soup = self._fetch(self.url)
self._parse_title(soup)
self._parse_author(soup)
self._parse_description(soup)
self._parse_cover(soup)
chapter_page = self.get_chapter_page(soup)
self.parse_chapter_links(chapter_page)
log_debug(f"[BookScraper] Completed metadata parse")
return {
"title": self.book_title,
"author": self.book_author,
"description": self.book_description,
"cover_url": self.cover_url, # ← used by DownloadController
"book_url": self.url,
"chapters": [
{"num": ch.number, "title": ch.title, "url": ch.url}
for ch in self.chapters
],
}
# ------------------------------------------------------------
def _fetch(self, url):
log_debug(f"[BookScraper] Fetch: {url}")
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.encoding = self.site.encoding
return BeautifulSoup(resp.text, "lxml")
# ------------------------------------------------------------
def _parse_title(self, soup):
h1 = soup.find("h1")
self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
log_debug(f"[BookScraper] Title = {self.book_title}")
def _parse_author(self, soup):
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
self.book_author = raw.split("")[1] if "" in raw else "UnknownAuthor"
log_debug(f"[BookScraper] Author = {self.book_author}")
def _parse_description(self, soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
self.book_description = ""
log_debug("[BookScraper] Description not found")
return
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
self.book_description = clean_text("\n".join(parts), self.site.replacements)
log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
# ------------------------------------------------------------
def _parse_cover(self, soup):
"""
Extract correct cover based on book_id path logic.
1. primary: match "/files/article/image/{vol}/{book_id}/"
2. fallback: endswith "/{book_id}s.jpg"
Backwards compatible full scrape:
returns {title, author, description, cover_url, chapters, book_url}
"""
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", self.url)
if not m:
log_debug("[BookScraper] No book_id found in URL → cannot match cover")
return
book_id = m.group(1)
# Extract vol folder from URL (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", self.url)
volume = m2.group(1) if m2 else None
log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
imgs = soup.find_all("img", src=True)
chosen = None
# --------------------------------------------------------
# PRIORITY 1: Path-match
# /files/article/image/{vol}/{book_id}/
# --------------------------------------------------------
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
log_debug(f"[BookScraper] Cover matched by PATH: {src}")
break
# --------------------------------------------------------
# PRIORITY 2: endswith "/{book_id}s.jpg"
# --------------------------------------------------------
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
break
# --------------------------------------------------------
# No match
# --------------------------------------------------------
if not chosen:
log_debug("[BookScraper] No matching cover found")
return
self.cover_url = urljoin(self.site.root, chosen)
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------
def get_chapter_page(self, soup):
"""Return BeautifulSoup of the main chapter list page."""
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
chapter_url = urljoin(self.site.root, href)
# base for chapter links
parts = chapter_url.rsplit("/", 1)
self.chapter_base = parts[0] + "/"
return self._fetch(chapter_url)
# ------------------------------------------------------------
def parse_chapter_links(self, soup):
cont = soup.select_one(self.site.chapter_list_selector)
items = cont.select("ul li a[href]")
self.chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(self.chapter_base, href)
self.chapters.append(Chapter(idx, title, full))
idx += 1
log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")
return extract_metadata_full(self.url, self.site)

@@ -0,0 +1,27 @@
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
# Low-level HTML fetch utility shared by all site scrapers.
# Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
)
}
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
"""
Fetch HTML with a consistent user-agent and encoding.
Returns BeautifulSoup(lxml).
"""
resp = requests.get(url, headers=HEADERS, timeout=timeout)
resp.encoding = encoding
return BeautifulSoup(resp.text, "lxml")

@@ -0,0 +1,65 @@
# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
"""
Extract ONLY lightweight metadata:
- title
- author
- description
- cover_url
- chapters_total = 0
"""
soup = fetch_html(url, site_scraper.encoding)
title = site_scraper.parse_title(soup)
author = site_scraper.parse_author(soup)
description = site_scraper.parse_description(soup)
cover_url = site_scraper.parse_cover(soup, url)
return {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"book_url": url,
}
def extract_metadata_full(url: str, site_scraper):
"""
Full scrape (metadata + chapter list).
Used by the scraping Celery pipeline.
"""
soup = fetch_html(url, site_scraper.encoding)
# metadata
meta = extract_metadata_only(url, site_scraper)
# chapter list
chapter_page_url = site_scraper.extract_chapter_page_url(soup)
chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
chapters = site_scraper.parse_chapter_list(chapter_page_soup)
meta["chapters"] = chapters
return meta
def build_book_id(title: str) -> str:
"""
Canonical book_id generator.
SCRAPE currently uses the title as the ID; preserve that behavior.
"""
return title
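# -----------------------------------------------------------------
# Hedged usage sketch tying the engine to a site scraper. The URL is
# a placeholder that merely matches the ptwxz.com pattern handled by
# get_scraper_for_url() in scraper/sites/__init__.py; the helper name
# is illustrative.
# -----------------------------------------------------------------
def _example_engine_usage(url="https://www.ptwxz.com/bookinfo/10/10123.html"):
    from scraper.sites import get_scraper_for_url
    site = get_scraper_for_url(url)           # -> PiaotianScraper for ptwxz/piaotian URLs
    meta = extract_metadata_only(url, site)   # INIT flow: no chapter list
    full = extract_metadata_full(url, site)   # scraping pipeline: includes "chapters"
    return build_book_id(meta["title"]), len(full["chapters"])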

@@ -0,0 +1,44 @@
# ============================================================
# File: scraper/services/cover_service.py
# ============================================================
import os
import requests
from logbus.publisher import log
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> str | None:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.
"""
if not cover_url:
log(f"[COVER] No cover URL for book={book_id}")
return None
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst_path = os.path.join(static_dir, f"{book_id}.jpg")
try:
log(f"[COVER] Downloading: {cover_url}")
resp = requests.get(
cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
)
resp.raise_for_status()
with open(dst_path, "wb") as f:
f.write(resp.content)
log(f"[COVER] Stored: {dst_path}")
return dst_path
except Exception as e:
log(f"[COVER] FAILED ({cover_url}) → {e}")
return None

@@ -0,0 +1,74 @@
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_id
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
class InitService:
@staticmethod
def derive_book_id(url: str) -> str:
"""
PTWXZ URLs end with /{id}.html.
If there is no match, fall back to a sanitized URL.
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
return url.replace("/", "_")
@staticmethod
def execute(url: str) -> dict:
"""
Main INIT-flow entry point.
Returns complete metadata + registration info.
"""
# 1) Determine which BookSite applies
site = SiteResolver.resolve(url)
# 2) Metadata only (no chapters)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
author = meta.get("author")
description = meta.get("description")
cover_url = meta.get("cover_url")
# 3) Determine book_id
book_id = InitService.derive_book_id(url)
# 4) SQLite registration
register_book(
book_id=book_id,
title=title,
author=author,
description=description,
cover_url=cover_url,
)
# 5) Download UI cover
CoverService.download_main_cover(cover_url, book_id)
# 6) Structured output for UI
return {
"book_id": book_id,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"status": "registered",
}

@@ -0,0 +1,33 @@
# ============================================================
# File: scraper/services/scrape_engine.py
# Purpose:
# Provide unified scraping methods for INIT-flow.
# Reuses BookScraper internally with ZERO duplication.
# ============================================================
from scraper.book_scraper import BookScraper
class ScrapeEngine:
"""
Adapter layer around BookScraper.
Lets the INIT flow work with ONLY metadata; the chapter list is stripped from the result.
"""
@staticmethod
def fetch_metadata_only(site, url: str) -> dict:
"""
Execute BookScraper but return ONLY metadata.
Chapters are intentionally removed.
"""
scraper = BookScraper(site, url)
result = scraper.execute() # returns full metadata + chapters
# Strip the chapter list — the INIT flow does not need it
return {
"title": result.get("title"),
"author": result.get("author"),
"description": result.get("description"),
"cover_url": result.get("cover_url"),
"book_url": url,
}

@@ -0,0 +1,20 @@
# ============================================================
# File: scraper/services/site_resolver.py
# Purpose:
# Determine which BookSite implementation applies for a given URL.
# This keeps INIT-flow and SCRAPE-flow site-agnostic.
# ============================================================
from scraper.sites import BookSite # current PTWXZ implementation
class SiteResolver:
"""
Resolves the correct BookSite class based on URL.
Currently only PTWXZ/Piaotian is supported.
"""
@staticmethod
def resolve(url: str):
# Later: add more domain rules for other sources
return BookSite()

@@ -0,0 +1,28 @@
# ============================================================
# File: scraper/sites/__init__.py
# Purpose:
# Site autodetection based on URL.
# ============================================================
from scraper.sites.piaotian import PiaotianScraper
def get_scraper_for_url(url: str):
"""
Return the correct scraper instance for a given URL.
Later: add more site implementations.
"""
if "ptwxz" in url or "piaotian" in url:
return PiaotianScraper()
raise ValueError(f"No scraper available for URL: {url}")
# ============================================================
# Backwards-compatibility export for legacy BookScraper
# ============================================================
# Old code expects:
# from scraper.sites import BookSite
# We map that to our new PiaotianScraper implementation.
BookSite = PiaotianScraper

@@ -0,0 +1,51 @@
# ============================================================
# File: scraper/sites/base.py
# Purpose:
# Abstract interface that every site-specific scraper must implement.
# ============================================================
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
class SiteScraper(ABC):
"""
Defines the interface for site-specific scrapers.
Each concrete scraper (Piaotian, Biquge, etc.) must implement these.
"""
@property
@abstractmethod
def root(self) -> str: ...
@property
@abstractmethod
def encoding(self) -> str: ...
@property
@abstractmethod
def chapter_list_selector(self) -> str: ...
# --------------------------
# Metadata extraction
# --------------------------
@abstractmethod
def parse_title(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_author(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: ...
# --------------------------
# Chapter extraction
# --------------------------
@abstractmethod
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_chapter_list(self, soup: BeautifulSoup) -> list: ...

@@ -0,0 +1,120 @@
# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
class PiaotianScraper(SiteScraper):
root = "https://www.ptwxz.com"
encoding = "GB18030"
chapter_list_selector = "div.centent"
# ------------------------------------------------------------
# METADATA PARSING
# ------------------------------------------------------------
def parse_title(self, soup: BeautifulSoup) -> str:
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownBook"
def parse_author(self, soup: BeautifulSoup) -> str:
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
return raw.split("")[1] if "" in raw else "UnknownAuthor"
def parse_description(self, soup: BeautifulSoup) -> str:
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
# stop when next <span> reappears
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSING
# (exactly your BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:
return None
book_id = m.group(1)
# Extract vol (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", url)
volume = m2.group(1) if m2 else None
imgs = soup.find_all("img", src=True)
chosen = None
# Priority 1: match "/files/article/image/{vol}/{book_id}/"
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
break
# Priority 2: endswith "/{book_id}s.jpg"
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
break
if not chosen:
return None
return urljoin(self.root, chosen)
# ------------------------------------------------------------
# CHAPTER EXTRACTION
# ------------------------------------------------------------
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
return urljoin(self.root, href)
def parse_chapter_list(self, soup: BeautifulSoup) -> list:
cont = soup.select_one(self.chapter_list_selector)
items = cont.select("ul li a[href]") if cont else []
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full_url = urljoin(self.root, href)
chapters.append({"num": idx, "title": title, "url": full_url})
idx += 1
return chapters

@@ -21,7 +21,7 @@ redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# ------------------------------------------------------------
# REGULAR REDIS CLIENT (slots, file checks, state)
# REGULAR REDIS CLIENT (slots, file checks, state)
# ------------------------------------------------------------
redis_client = Redis(
host=parsed.hostname,

@@ -1,34 +1,53 @@
<!DOCTYPE html>
<html lang="nl">
<head>
<meta charset="UTF-8">
<head>
<meta charset="UTF-8" />
<title>BookScraper</title>
<style>
body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
h1 { margin-bottom: 20px; }
body {
font-family: Arial, sans-serif;
padding: 40px;
max-width: 600px;
margin: auto;
}
h1 {
margin-bottom: 20px;
}
input[type="text"] {
width: 100%; padding: 12px; font-size: 16px;
border: 1px solid #ccc; border-radius: 6px;
width: 100%;
padding: 12px;
font-size: 16px;
border: 1px solid #ccc;
border-radius: 6px;
}
button {
margin-top: 20px;
padding: 12px 20px;
background: #007bff; color: white;
border: none; border-radius: 6px;
font-size: 16px; cursor: pointer;
background: #007bff;
color: white;
border: none;
border-radius: 6px;
font-size: 16px;
cursor: pointer;
}
button:hover {
background: #0056b3;
}
button:hover { background: #0056b3; }
</style>
</head>
<body>
</head>
<body>
<h1>BookScraper WebGUI</h1>
<h1>BookScraper WebGUI</h1>
<form action="/start" method="POST">
<label for="url">Geef een boek-URL op:</label><br><br>
<input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
<form action="/init" method="POST">
<label for="url">Geef een boek-URL op:</label><br /><br />
<input
type="text"
id="url"
name="url"
placeholder="https://example.com/book/12345"
required
/>
<button type="submit">Start Scraping</button>
</form>
</body>
</form>
</body>
</html>
