diff --git a/.gitignore b/.gitignore index 373902a..ded78eb 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ # Negeer alle .env bestanden .env **/.env -log.txt \ No newline at end of file +log.txt + +**/static/covers/ \ No newline at end of file diff --git a/bookscraper/app.py b/bookscraper/app.py index 35de802..0367408 100644 --- a/bookscraper/app.py +++ b/bookscraper/app.py @@ -1,41 +1,36 @@ # ============================================ # File: bookscraper/app.py (ASYNC SCRAPING) # ============================================ - from dotenv import load_dotenv load_dotenv() +import os +import redis +from flask import Flask, render_template, request, jsonify, send_from_directory + print(">>> [WEB] Importing celery_app …") from celery_app import celery_app from db.db import init_db +from celery.result import AsyncResult -init_db() # ensure DB schema exists before Flask starts - -from flask import Flask, render_template, request, jsonify from scraper.logger import log_debug - -# Abort + Progress (per book_id) from scraper.abort import set_abort from scraper.progress import get_progress - -# UI LOGS (GLOBAL — no book_id) -from scraper.ui_log import get_ui_logs, reset_ui_logs - -from celery.result import AsyncResult +from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta from scraper.state import state as r -# Cover serving -from flask import send_from_directory -import os +from scraper.services.init_service import InitService -import redis +from db.repository import get_registered_books + +# INIT DB +init_db() -# Flask app = Flask(__name__) # ===================================================== -# STATIC FILE SERVING FOR OUTPUT +# STATIC FILE SERVING # ===================================================== OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output") @@ -46,26 +41,96 @@ def serve_output(filename): # ===================================================== -# HOME PAGE +# SECTION 1 — NAVIGATION / HTML PAGES # ===================================================== + + @app.route("/", methods=["GET"]) def index(): return render_template("index.html") +@app.route("/dashboard", methods=["GET"]) +def dashboard(): + logs_list = get_ui_logs() or [] + return render_template( + "dashboard/dashboard.html", + books=list_active_books(), # Redis + registered=get_registered_books(), # SQLite INIT results + logs=logs_list, + ) + + +@app.route("/book/") +def book_detail(book_id): + title = r.get(f"book:{book_id}:title") or book_id + return render_template( + "dashboard/book_detail.html", + book_id=book_id, + title=title, + logs=get_ui_logs(), + ) + + # ===================================================== -# START SCRAPING (async via Celery) +# SECTION 2 — ACTION ROUTES (INIT, START, ABORT) # ===================================================== + +# CORRECT PATH — services/ is root-level + + +@app.route("/init", methods=["POST"]) +def init_book(): + """ + INIT-flow: + - user enters URL + - lightweight metadata fetch + - insert into SQLite as 'registered' + - return dashboard HTML (NOT JSON) + """ + url = request.form.get("url", "").strip() + + if not url: + return render_template( + "dashboard/dashboard.html", + error="Geen URL opgegeven.", + books=list_active_books(), + registered=get_registered_books(), + logs=get_ui_logs(), + ) + + try: + result = InitService.execute(url) + msg = f"Boek geregistreerd: {result.get('title')}" + + return render_template( + "dashboard/dashboard.html", + message=msg, + books=list_active_books(), # Redis + 
registered=get_registered_books(), # SQLite INIT results + logs=get_ui_logs(), + ) + + except Exception as e: + log_debug(f"[INIT] ERROR: {e}") + return render_template( + "dashboard/dashboard.html", + error=f"INIT mislukt: {e}", + books=list_active_books(), + registered=get_registered_books(), + logs=get_ui_logs(), + ) + + @app.route("/start", methods=["POST"]) def start_scraping(): url = request.form.get("url", "").strip() - if not url: - # ★ FIX: dashboard moet altijd books + logs meekrijgen return render_template( "dashboard/dashboard.html", error="Geen URL opgegeven.", books=list_active_books(), + registered=get_registered_books(), logs=get_ui_logs(), ) @@ -78,27 +143,15 @@ def start_scraping(): queue="scraping", ) - # ★ FIX: direct dashboard tonen met actuele data return render_template( "dashboard/dashboard.html", scraping_task_id=async_result.id, books=list_active_books(), + registered=get_registered_books(), logs=get_ui_logs(), ) -# ===================================================== -# CLEAR UI LOGS -# ===================================================== -@app.route("/clear-logs", methods=["POST"]) -def clear_logs(): - reset_ui_logs() - return jsonify({"status": "ok", "message": "UI logs cleared"}) - - -# ===================================================== -# ABORT (per book_id) -# ===================================================== @app.route("/abort/", methods=["POST"]) def abort_download(book_id): log_debug(f"[WEB] Abort requested for book: {book_id}") @@ -107,87 +160,10 @@ def abort_download(book_id): # ===================================================== -# PROGRESS (per book_id) -# ===================================================== -@app.route("/progress/", methods=["GET"]) -def progress(book_id): - return jsonify(get_progress(book_id)) - - -# ===================================================== -# CELERY RESULT → return book_id -# ===================================================== -@app.route("/celery-result/", methods=["GET"]) -def celery_result(task_id): - result = AsyncResult(task_id, app=celery_app) - - if result.successful(): - return jsonify({"ready": True, "result": result.get()}) - if result.failed(): - return jsonify({"ready": True, "error": "failed"}) - return jsonify({"ready": False}) - - -# ===================================================== -# API: book status new model -# ===================================================== -def getStatus(book_id): - - state = r.hgetall(f"book:{book_id}:state") - status = state.get("status") or "unknown" - dl_done = int(state.get("chapters_download_done", 0)) - dl_skipped = int(state.get("chapters_download_skipped", 0)) - dl_total = int(state.get("chapters_total", 0)) - au_done = int(state.get("audio_done") or 0) - title = state.get("title") or book_id - - au_total = dl_total - - return { - "book_id": book_id, - "title": title, - "status": status, - "download_done": dl_done, - "download_skipped": dl_skipped, - "download_total": dl_total, - "audio_done": au_done, - "audio_total": au_total, - } - - -# ===================================================== -# REDIS BACKEND — BOOK STATE MODEL +# SECTION 3 — API ROUTES (JSON) # ===================================================== -REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") -r = redis.Redis.from_url(REDIS_URL, decode_responses=True) - - -def list_active_booksold(): - """Return list of active books from Redis Book State Model.""" - keys = r.keys("book:*:state") - books = [] - - for key in keys: - book_id = key.split(":")[1] - print(book_id) - 
books.append(getStatus(book_id)) - - return books - - -def list_active_books(): - books = [] - for key in r.scan_iter(match="book:*:state", count=1000): - first = key.find(":") - second = key.find(":", first + 1) - book_id = key[first + 1 : second] - books.append(getStatus(book_id)) - return books -# ===================================================== -# API: list all active books -# ===================================================== @app.route("/api/books") def api_books(): return jsonify(list_active_books()) @@ -195,45 +171,50 @@ def api_books(): @app.route("/api/book//status") def api_book_status(book_id): - return jsonify(getStatus(book_id)) -# ===================================================== -# API: book logs -# ===================================================== @app.route("/api/book//logs") def api_book_logs(book_id): logs = r.lrange(f"logs:{book_id}", 0, -1) or [] return jsonify(logs) -# ===================================================== -# VIEW: DASHBOARD -# ===================================================== -@app.route("/dashboard") -def dashboard(): - logs_list = get_ui_logs() or [] - # ★ FIX: dashboard moet altijd books + logs krijgen - return render_template( - "dashboard/dashboard.html", - books=list_active_books(), - logs=logs_list, # dashboard krijgt LIST, geen dict - ) +@app.route("/progress/") +def progress(book_id): + return jsonify(get_progress(book_id)) + + +@app.route("/celery-result/") +def celery_result(task_id): + result = AsyncResult(task_id, app=celery_app) + if result.successful(): + return jsonify({"ready": True, "result": result.get()}) + if result.failed(): + return jsonify({"ready": True, "error": "failed"}) + return jsonify({"ready": False}) + + +@app.route("/clear-logs", methods=["POST"]) +def clear_logs(): + reset_ui_logs() + return jsonify({"status": "ok", "message": "UI logs cleared"}) + + +@app.route("/logs", methods=["GET"]) +def logs(): + try: + last_index = int(request.args.get("last_index", -1)) + except: + last_index = -1 + + new_lines, total = get_ui_logs_delta(last_index) + return jsonify({"lines": new_lines, "total": total}) # ===================================================== -# VIEW: BOOK DETAIL PAGE +# SECTION 4 — DEBUG ROUTES # ===================================================== -@app.route("/book/") -def book_detail(book_id): - title = r.get(f"book:{book_id}:title") or book_id - return render_template( - "dashboard/book_detail.html", - book_id=book_id, - title=title, - logs=get_ui_logs(), - ) @app.route("/debug/redis-keys") @@ -254,37 +235,65 @@ def debug_redis_keys(): return jsonify(results) -# ============================================================ -# Rolling log endpoint (no new file) -# ============================================================ - -from flask import jsonify, request - # ===================================================== -# ROLLING LOG ENDPOINT — DELTA POLLING VIA ui_log +# DB DEBUG: LIST ALL BOOKS FROM SQLITE # ===================================================== -from scraper.ui_log import get_ui_logs_delta +from db.repository import fetch_all_books -@app.route("/logs", methods=["GET"]) -def logs(): +@app.route("/api/db/books") +def api_db_books(): """ - Delta log delivery for WebGUI. - Browser sends ?last_index=N, we return only new lines. + Return ALL books stored in SQLite — including INIT-only entries. + Useful to verify that /init wrote correct metadata. 
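+    Illustrative response shape (field values here are hypothetical, not taken
+    from a real database):
+        {"status": "ok",
+         "books": [{"book_id": "12345", "title": "...", "status": "registered"}]}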
""" try: - last_index = int(request.args.get("last_index", -1)) - except: - last_index = -1 + books = fetch_all_books() + return jsonify({"status": "ok", "books": books}) + except Exception as e: + return jsonify({"status": "error", "message": str(e)}), 500 - new_lines, total = get_ui_logs_delta(last_index) - return jsonify({"lines": new_lines, "total": total}) +# ===================================================== +# SECTION 5 — INTERNAL HELPERS +# ===================================================== + + +def getStatus(book_id): + state = r.hgetall(f"book:{book_id}:state") + status = state.get("status") or "unknown" + dl_done = int(state.get("chapters_download_done", 0)) + dl_skipped = int(state.get("chapters_download_skipped", 0)) + dl_total = int(state.get("chapters_total", 0)) + au_done = int(state.get("audio_done") or 0) + title = state.get("title") or book_id + + return { + "book_id": book_id, + "title": title, + "status": status, + "download_done": dl_done, + "download_skipped": dl_skipped, + "download_total": dl_total, + "audio_done": au_done, + "audio_total": dl_total, + } + + +def list_active_books(): + books = [] + for key in r.scan_iter(match="book:*:state", count=1000): + first = key.find(":") + second = key.find(":", first + 1) + book_id = key[first + 1 : second] + books.append(getStatus(book_id)) + return books # ===================================================== -# RUN FLASK +# SECTION 6 — FLASK RUNNER # ===================================================== + if __name__ == "__main__": debug = os.getenv("FLASK_DEBUG", "0") == "1" host = os.getenv("HOST", "0.0.0.0") diff --git a/bookscraper/app/routes/__init__.py b/bookscraper/app/routes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/app/routes/init_book.py b/bookscraper/app/routes/init_book.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/db/db.py b/bookscraper/db/db.py index 5c989d9..4706d0e 100644 --- a/bookscraper/db/db.py +++ b/bookscraper/db/db.py @@ -3,12 +3,10 @@ # Purpose: # Raw SQLite engine for BookScraper. # Provides ONLY low-level DB primitives. -# - Connection management (WAL mode) -# - init_db() schema creation +# - Connection management (existing DELETE journal mode) +# - init_db() schema creation + safe schema upgrade # - upsert_book() atomic write # - raw fetch helpers (private) -# -# All business logic belongs in repository.py. 
# ============================================================ import os @@ -48,10 +46,14 @@ def enable_wal_mode(conn): # ------------------------------------------------------------ -# Schema creation +# Schema creation + SAFE schema upgrades # ------------------------------------------------------------ def init_db(): conn = get_db() + + # -------------------------------------------------------- + # BASE SCHEMA (unchanged) + # -------------------------------------------------------- conn.execute( """ CREATE TABLE IF NOT EXISTS books ( @@ -76,14 +78,21 @@ def init_db(): ) conn.commit() + # -------------------------------------------------------- + # SCHEMA UPGRADE: add description column if missing + # -------------------------------------------------------- + cols = conn.execute("PRAGMA table_info(books);").fetchall() + colnames = [c[1] for c in cols] + + if "description" not in colnames: + conn.execute("ALTER TABLE books ADD COLUMN description TEXT;") + conn.commit() + # ------------------------------------------------------------ # WRITE OPERATIONS # ------------------------------------------------------------ def upsert_book(book_id, **fields): - """ - Raw upsert primitive. Repository layer should call this. - """ conn = get_db() keys = ["book_id"] + list(fields.keys()) @@ -115,5 +124,6 @@ def _raw_get_book(book_id): def _raw_get_all_books(): conn = get_db() + # unchanged cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;") return [dict(row) for row in cur.fetchall()] diff --git a/bookscraper/db/light_fetch.py b/bookscraper/db/light_fetch.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/db/repository.py b/bookscraper/db/repository.py index 047511e..b34d871 100644 --- a/bookscraper/db/repository.py +++ b/bookscraper/db/repository.py @@ -4,18 +4,20 @@ # High-level BookScraper database interface. # This is the ONLY module Celery tasks and Flask should use. # -# Uses low-level primitives from db.db, but exposes -# domain-level operations: -# - fetch_book / fetch_all_books -# - create_or_update_book -# - set_status -# - incrementing counters +# New additions for INIT-flow: +# - register_book() +# - update_book_after_full_scrape() +# - get_registered_books() +# - get_active_books() +# +# Existing functions remain unchanged for backward compatibility. # ============================================================ from db.db import ( upsert_book, _raw_get_book, _raw_get_all_books, + get_db, ) @@ -32,8 +34,84 @@ def fetch_all_books(): return _raw_get_all_books() +# ============================================================ +# NEW — INIT-FLOW SUPPORT +# ============================================================ + + +def register_book(book_id, title, author=None, description=None, cover_url=None): + """ + Create a new book entry with initial metadata. + Called when user enters a URL and presses INIT. + """ + fields = { + "title": title, + "author": author, + "description": description, + "cover_url": cover_url, + "chapters_total": 0, + "status": "registered", + } + upsert_book(book_id, **fields) + + +def update_book_after_full_scrape( + book_id, + title=None, + author=None, + description=None, + cover_url=None, + chapters_total=None, +): + """ + Called after a FULL scrape when chapters are known. + Moves the book into 'active' state. 
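+    Illustrative call (argument values are hypothetical):
+        update_book_after_full_scrape("12345", title="Example Book",
+                                      chapters_total=42)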
+ """ + fields = {} + + if title is not None: + fields["title"] = title + if author is not None: + fields["author"] = author + if description is not None: + fields["description"] = description + if cover_url is not None: + fields["cover_url"] = cover_url + if chapters_total is not None: + fields["chapters_total"] = chapters_total + + fields["status"] = "active" + + upsert_book(book_id, **fields) + + +def get_registered_books(): + """ + Return books registered but not yet scraped. + """ + conn = get_db() + cur = conn.execute( + """SELECT * FROM books WHERE status='registered' + ORDER BY created_at DESC""" + ) + return [dict(row) for row in cur.fetchall()] + + +def get_active_books(): + """ + Return books currently in progress. + """ + conn = get_db() + cur = conn.execute( + """SELECT * FROM books + WHERE status IN ('active', 'downloading') + ORDER BY created_at DESC""" + ) + return [dict(row) for row in cur.fetchall()] + + # ------------------------------------------------------------ -# BOOK CREATION / METADATA +# BOOK CREATION / METADATA (existing) # ------------------------------------------------------------ def create_or_update_book( book_id, @@ -64,14 +142,14 @@ def create_or_update_book( # ------------------------------------------------------------ -# STATUS MANAGEMENT +# STATUS MANAGEMENT (existing) # ------------------------------------------------------------ def set_status(book_id, status): upsert_book(book_id, status=status) # ------------------------------------------------------------ -# INCREMENTING COUNTERS (atomic) +# INCREMENTING COUNTERS (existing — backward compat only) # ------------------------------------------------------------ def inc_downloaded(book_id, amount=1): book = _raw_get_book(book_id) diff --git a/bookscraper/db/schema.py b/bookscraper/db/schema.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py index 922d0c7..52c5bd7 100644 --- a/bookscraper/scraper/book_scraper.py +++ b/bookscraper/scraper/book_scraper.py @@ -1,202 +1,21 @@ -# scraper/book_scraper.py +# ============================================================ +# File: scraper/book_scraper.py +# Purpose: +# Backwards-compatible wrapper giving same API as before. +# Uses the new engine under the hood. +# ============================================================ -import requests -from bs4 import BeautifulSoup -from urllib.parse import urljoin -import re - -from scraper.logger import log_debug -from scraper.utils import clean_text, load_replacements -from scraper.models.book_state import Chapter +from scraper.engine.parser import extract_metadata_full class BookScraper: - """ - Minimal scraper: only metadata + chapter list. - The DownloadController handles Celery pipelines for: - - download - - parse - - save - """ - - def __init__(self, site, url): - self.site = site + def __init__(self, site_scraper, url): + self.site = site_scraper self.url = url - self.book_title = "" - self.book_author = "" - self.book_description = "" - self.cover_url = "" - self.chapter_base = None - - self.chapters = [] - - # Load custom replacements - extra = load_replacements("replacements.txt") - self.site.replacements.update(extra) - - # ------------------------------------------------------------ def execute(self): - """Main entry point. 
Returns metadata + chapter URLs.""" - soup = self._fetch(self.url) - - self._parse_title(soup) - self._parse_author(soup) - self._parse_description(soup) - self._parse_cover(soup) - - chapter_page = self.get_chapter_page(soup) - self.parse_chapter_links(chapter_page) - - log_debug(f"[BookScraper] Completed metadata parse") - - return { - "title": self.book_title, - "author": self.book_author, - "description": self.book_description, - "cover_url": self.cover_url, # ← used by DownloadController - "book_url": self.url, - "chapters": [ - {"num": ch.number, "title": ch.title, "url": ch.url} - for ch in self.chapters - ], - } - - # ------------------------------------------------------------ - def _fetch(self, url): - log_debug(f"[BookScraper] Fetch: {url}") - resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) - resp.encoding = self.site.encoding - return BeautifulSoup(resp.text, "lxml") - - # ------------------------------------------------------------ - def _parse_title(self, soup): - h1 = soup.find("h1") - self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle" - log_debug(f"[BookScraper] Title = {self.book_title}") - - def _parse_author(self, soup): - td = soup.find("td", string=lambda t: t and "作" in t) - raw = td.get_text(strip=True) if td else "" - self.book_author = raw.split(":")[1] if ":" in raw else "UnknownAuthor" - log_debug(f"[BookScraper] Author = {self.book_author}") - - def _parse_description(self, soup): - span = soup.find("span", string=lambda t: t and "内容简介" in t) - if not span: - self.book_description = "" - log_debug("[BookScraper] Description not found") - return - - parts = [] - for sib in span.next_siblings: - if getattr(sib, "name", None) == "span": - break - - text = ( - sib.get_text(strip=True) - if hasattr(sib, "get_text") - else str(sib).strip() - ) - - if text: - parts.append(text) - - self.book_description = clean_text("\n".join(parts), self.site.replacements) - log_debug(f"[BookScraper] Description length = {len(self.book_description)}") - - # ------------------------------------------------------------ - def _parse_cover(self, soup): """ - Extract correct cover based on book_id path logic. - 1. primary: match "/files/article/image/{vol}/{book_id}/" - 2. 
fallback: endswith "/{book_id}s.jpg" + Backwards compatible full scrape: + returns {title, author, description, cover_url, chapters, book_url} """ - # Extract book_id from URL - m = re.search(r"/(\d+)\.html$", self.url) - if not m: - log_debug("[BookScraper] No book_id found in URL → cannot match cover") - return - - book_id = m.group(1) - - # Extract vol folder from URL (bookinfo//.html) - m2 = re.search(r"/bookinfo/(\d+)/", self.url) - volume = m2.group(1) if m2 else None - - log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}") - - imgs = soup.find_all("img", src=True) - - chosen = None - - # -------------------------------------------------------- - # PRIORITY 1: Path-match - # /files/article/image/{vol}/{book_id}/ - # -------------------------------------------------------- - if volume: - target_path = f"/files/article/image/{volume}/{book_id}/" - for img in imgs: - src = img["src"] - if target_path in src: - chosen = src - log_debug(f"[BookScraper] Cover matched by PATH: {src}") - break - - # -------------------------------------------------------- - # PRIORITY 2: endswith "/{book_id}s.jpg" - # -------------------------------------------------------- - if not chosen: - target_suffix = f"/{book_id}s.jpg" - for img in imgs: - src = img["src"] - if src.endswith(target_suffix): - chosen = src - log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}") - break - - # -------------------------------------------------------- - # No match - # -------------------------------------------------------- - if not chosen: - log_debug("[BookScraper] No matching cover found") - return - - self.cover_url = urljoin(self.site.root, chosen) - log_debug(f"[BookScraper] Cover URL = {self.cover_url}") - - # ------------------------------------------------------------ - def get_chapter_page(self, soup): - """Return BeautifulSoup of the main chapter list page.""" - node = soup.select_one( - "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table" - ) - href = node.select_one("a").get("href") - chapter_url = urljoin(self.site.root, href) - - # base for chapter links - parts = chapter_url.rsplit("/", 1) - self.chapter_base = parts[0] + "/" - - return self._fetch(chapter_url) - - # ------------------------------------------------------------ - def parse_chapter_links(self, soup): - cont = soup.select_one(self.site.chapter_list_selector) - items = cont.select("ul li a[href]") - - self.chapters = [] - idx = 1 - - for a in items: - href = a.get("href") - if not href.endswith(".html"): - continue - - title = a.get_text(strip=True) - full = urljoin(self.chapter_base, href) - - self.chapters.append(Chapter(idx, title, full)) - idx += 1 - - log_debug(f"[BookScraper] Found {len(self.chapters)} chapters") + return extract_metadata_full(self.url, self.site) diff --git a/bookscraper/scraper/db.py b/bookscraper/scraper/db.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/scraper/engine/__init__.py b/bookscraper/scraper/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/scraper/engine/fetcher.py b/bookscraper/scraper/engine/fetcher.py new file mode 100644 index 0000000..9ca1321 --- /dev/null +++ b/bookscraper/scraper/engine/fetcher.py @@ -0,0 +1,27 @@ +# ============================================================ +# File: scraper/engine/fetcher.py +# Purpose: +# Low-level HTML fetch utility shared by all site scrapers. +# Replaces scattered _fetch() logic inside BookScraper. 
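+# Illustrative usage (URL and encoding are example values, matching the
+# Piaotian defaults introduced elsewhere in this change):
+#     soup = fetch_html("https://www.ptwxz.com/bookinfo/12/12345.html",
+#                       encoding="GB18030")
+#     title = soup.find("h1").get_text(strip=True)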
+# ============================================================ + +import requests +from bs4 import BeautifulSoup + + +HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) " + "Gecko/20100101 Firefox/118.0" + ) +} + + +def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup: + """ + Fetch HTML with a consistent user-agent and encoding. + Returns BeautifulSoup(lxml). + """ + resp = requests.get(url, headers=HEADERS, timeout=timeout) + resp.encoding = encoding + return BeautifulSoup(resp.text, "lxml") diff --git a/bookscraper/scraper/engine/parser.py b/bookscraper/scraper/engine/parser.py new file mode 100644 index 0000000..4408f96 --- /dev/null +++ b/bookscraper/scraper/engine/parser.py @@ -0,0 +1,65 @@ +# ============================================================ +# File: scraper/engine/parser.py +# Purpose: +# High-level scraping API coordinating metadata extraction +# and chapter extraction using pluggable SiteScraper classes. +# +# This is the new central engine: +# - extract_metadata_only() used by INIT flow +# - extract_metadata_full() used by full scraping pipeline +# ============================================================ + +from scraper.engine.fetcher import fetch_html + + +def extract_metadata_only(url: str, site_scraper): + """ + Extract ONLY lightweight metadata: + - title + - author + - description + - cover_url + - chapters_total = 0 + """ + soup = fetch_html(url, site_scraper.encoding) + + title = site_scraper.parse_title(soup) + author = site_scraper.parse_author(soup) + description = site_scraper.parse_description(soup) + cover_url = site_scraper.parse_cover(soup, url) + + return { + "title": title, + "author": author, + "description": description, + "cover_url": cover_url, + "chapters_total": 0, + "book_url": url, + } + + +def extract_metadata_full(url: str, site_scraper): + """ + Full scraping (metadata + chapterlist). + Used by the scraping Celery pipeline. + """ + soup = fetch_html(url, site_scraper.encoding) + + # metadata + meta = extract_metadata_only(url, site_scraper) + + # chapter list + chapter_page_url = site_scraper.extract_chapter_page_url(soup) + chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding) + chapters = site_scraper.parse_chapter_list(chapter_page_soup) + + meta["chapters"] = chapters + return meta + + +def build_book_id(title: str) -> str: + """ + Canonical book_id generator. + SCRAPE currently uses title as ID → preserve that behavior. + """ + return title diff --git a/bookscraper/scraper/services/__init__.py b/bookscraper/scraper/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/scraper/services/cover_service.py b/bookscraper/scraper/services/cover_service.py new file mode 100644 index 0000000..8392947 --- /dev/null +++ b/bookscraper/scraper/services/cover_service.py @@ -0,0 +1,44 @@ +# ============================================================ +# File: scraper/services/cover_service.py +# ============================================================ + +import os +import requests +from logbus.publisher import log + + +class CoverService: + + @staticmethod + def download_main_cover(cover_url: str, book_id: str) -> str | None: + """ + Downloads cover image into: static/covers/.jpg. + Returns local path or None. 
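+        Illustrative call (URL and book_id are hypothetical):
+            download_main_cover(
+                "https://www.ptwxz.com/files/article/image/12/12345/12345s.jpg",
+                "12345",
+            )  # -> "static/covers/12345.jpg"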
+ """ + + if not cover_url: + log(f"[COVER] No cover URL for book={book_id}") + return None + + static_dir = os.path.join("static", "covers") + os.makedirs(static_dir, exist_ok=True) + + dst_path = os.path.join(static_dir, f"{book_id}.jpg") + + try: + log(f"[COVER] Downloading: {cover_url}") + + resp = requests.get( + cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"} + ) + resp.raise_for_status() + + with open(dst_path, "wb") as f: + f.write(resp.content) + + log(f"[COVER] Stored: {dst_path}") + return dst_path + + except Exception as e: + log(f"[COVER] FAILED ({cover_url}) → {e}") + return None diff --git a/bookscraper/scraper/services/init_service.py b/bookscraper/scraper/services/init_service.py new file mode 100644 index 0000000..91bf5bf --- /dev/null +++ b/bookscraper/scraper/services/init_service.py @@ -0,0 +1,74 @@ +# ============================================================ +# File: scraper/services/init_service.py +# Purpose: +# Orchestrate INIT-flow: +# - resolve site +# - fetch minimal metadata +# - derive book_id +# - register in SQLite +# - store main cover +# ============================================================ + +import re +from scraper.services.site_resolver import SiteResolver +from scraper.services.scrape_engine import ScrapeEngine +from scraper.services.cover_service import CoverService + +from db.repository import register_book + + +class InitService: + + @staticmethod + def derive_book_id(url: str) -> str: + """ + PTWXZ URL format ends with /{id}.html. + If no match → fallback to sanitized URL. + """ + m = re.search(r"/(\d+)\.html$", url) + if m: + return m.group(1) + return url.replace("/", "_") + + @staticmethod + def execute(url: str) -> dict: + """ + Main INIT-flow entry point. + Returns complete metadata + registration info. + """ + + # 1) Determine which BookSite applies + site = SiteResolver.resolve(url) + + # 2) Metadata only (no chapters) + meta = ScrapeEngine.fetch_metadata_only(site, url) + + title = meta.get("title") or "Unknown" + author = meta.get("author") + description = meta.get("description") + cover_url = meta.get("cover_url") + + # 3) Determine book_id + book_id = InitService.derive_book_id(url) + + # 4) SQLite registration + register_book( + book_id=book_id, + title=title, + author=author, + description=description, + cover_url=cover_url, + ) + + # 5) Download UI cover + CoverService.download_main_cover(cover_url, book_id) + + # 6) Structured output for UI + return { + "book_id": book_id, + "title": title, + "author": author, + "description": description, + "cover_url": cover_url, + "status": "registered", + } diff --git a/bookscraper/scraper/services/scrape_engine.py b/bookscraper/scraper/services/scrape_engine.py new file mode 100644 index 0000000..35df5ac --- /dev/null +++ b/bookscraper/scraper/services/scrape_engine.py @@ -0,0 +1,33 @@ +# ============================================================ +# File: scraper/services/scrape_engine.py +# Purpose: +# Provide unified scraping methods for INIT-flow. +# Reuses BookScraper internally with ZERO duplication. +# ============================================================ + +from scraper.book_scraper import BookScraper + + +class ScrapeEngine: + """ + Adapter layer around BookScraper. + Allows INIT-flow to fetch ONLY metadata (no chapters). + """ + + @staticmethod + def fetch_metadata_only(site, url: str) -> dict: + """ + Execute BookScraper but return ONLY metadata. + Chapters are intentionally removed. 
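+        Note: BookScraper.execute() delegates to extract_metadata_full(), so
+        the chapter-list page is still fetched and then discarded here. A
+        lighter alternative (sketch only, not part of this change) would be:
+            from scraper.engine.parser import extract_metadata_only
+            return extract_metadata_only(url, site)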
+ """ + scraper = BookScraper(site, url) + result = scraper.execute() # returns full metadata + chapters + + # Strip chapterlist — INIT-flow should not fetch them + return { + "title": result.get("title"), + "author": result.get("author"), + "description": result.get("description"), + "cover_url": result.get("cover_url"), + "book_url": url, + } diff --git a/bookscraper/scraper/services/site_resolver.py b/bookscraper/scraper/services/site_resolver.py new file mode 100644 index 0000000..6544f26 --- /dev/null +++ b/bookscraper/scraper/services/site_resolver.py @@ -0,0 +1,20 @@ +# ============================================================ +# File: scraper/services/site_resolver.py +# Purpose: +# Determine which BookSite implementation applies for a given URL. +# This keeps INIT-flow and SCRAPE-flow site-agnostic. +# ============================================================ + +from scraper.sites import BookSite # current PTWXZ implementation + + +class SiteResolver: + """ + Resolves the correct BookSite class based on URL. + Currently only PTWXZ/Piaotian is supported. + """ + + @staticmethod + def resolve(url: str): + # Later: add more domain rules for other sources + return BookSite() diff --git a/bookscraper/scraper/sites/__init__.py b/bookscraper/scraper/sites/__init__.py new file mode 100644 index 0000000..bc924f3 --- /dev/null +++ b/bookscraper/scraper/sites/__init__.py @@ -0,0 +1,28 @@ +# ============================================================ +# File: scraper/sites/__init__.py +# Purpose: +# Site autodetection based on URL. +# ============================================================ + +from scraper.sites.piaotian import PiaotianScraper + + +def get_scraper_for_url(url: str): + """ + Return the correct scraper instance for a given URL. + Later: add more site implementations. + """ + if "ptwxz" in url or "piaotian" in url: + return PiaotianScraper() + + raise ValueError(f"No scraper available for URL: {url}") + + +# ============================================================ +# Backwards-compatibility export for legacy BookScraper +# ============================================================ +# Old code expects: +# from scraper.sites import BookSite +# We map that to our new PiaotianScraper implementation. + +BookSite = PiaotianScraper diff --git a/bookscraper/scraper/sites/base.py b/bookscraper/scraper/sites/base.py new file mode 100644 index 0000000..b75f414 --- /dev/null +++ b/bookscraper/scraper/sites/base.py @@ -0,0 +1,51 @@ +# ============================================================ +# File: scraper/sites/base.py +# Purpose: +# Abstract interface that every site-specific scraper must implement. +# ============================================================ + +from abc import ABC, abstractmethod +from bs4 import BeautifulSoup + + +class SiteScraper(ABC): + """ + Defines the interface for site-specific scrapers. + Each concrete scraper (Piaotian, Biquge, etc.) must implement these. + """ + + @property + @abstractmethod + def root(self) -> str: ... + + @property + @abstractmethod + def encoding(self) -> str: ... + + @property + @abstractmethod + def chapter_list_selector(self) -> str: ... + + # -------------------------- + # Metadata extraction + # -------------------------- + @abstractmethod + def parse_title(self, soup: BeautifulSoup) -> str: ... + + @abstractmethod + def parse_author(self, soup: BeautifulSoup) -> str: ... + + @abstractmethod + def parse_description(self, soup: BeautifulSoup) -> str: ... 
+ + @abstractmethod + def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: ... + + # -------------------------- + # Chapter extraction + # -------------------------- + @abstractmethod + def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ... + + @abstractmethod + def parse_chapter_list(self, soup: BeautifulSoup) -> list: ... diff --git a/bookscraper/scraper/sites/piaotian.py b/bookscraper/scraper/sites/piaotian.py new file mode 100644 index 0000000..95e430e --- /dev/null +++ b/bookscraper/scraper/sites/piaotian.py @@ -0,0 +1,120 @@ +# ============================================================ +# File: scraper/sites/piaotian.py +# Purpose: +# Concrete SiteScraper implementation for ptwxz.com (Piaotian). +# Moves all parsing logic out of BookScraper. +# ============================================================ + +from scraper.sites.base import SiteScraper +from bs4 import BeautifulSoup +from urllib.parse import urljoin +import re + + +class PiaotianScraper(SiteScraper): + root = "https://www.ptwxz.com" + encoding = "GB18030" + chapter_list_selector = "div.centent" + + # ------------------------------------------------------------ + # METADATA PARSING + # ------------------------------------------------------------ + def parse_title(self, soup: BeautifulSoup) -> str: + h1 = soup.find("h1") + return h1.get_text(strip=True) if h1 else "UnknownBook" + + def parse_author(self, soup: BeautifulSoup) -> str: + td = soup.find("td", string=lambda t: t and "作" in t) + raw = td.get_text(strip=True) if td else "" + return raw.split(":")[1] if ":" in raw else "UnknownAuthor" + + def parse_description(self, soup: BeautifulSoup) -> str: + span = soup.find("span", string=lambda t: t and "内容简介" in t) + if not span: + return "" + + parts = [] + for sib in span.next_siblings: + # stop when next reappears + if getattr(sib, "name", None) == "span": + break + + text = ( + sib.get_text(strip=True) + if hasattr(sib, "get_text") + else str(sib).strip() + ) + if text: + parts.append(text) + + return "\n".join(parts) + + # ------------------------------------------------------------ + # COVER PARSING + # (exactly your BookScraper._parse_cover logic) + # ------------------------------------------------------------ + def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: + # Extract book_id from URL + m = re.search(r"/(\d+)\.html$", url) + if not m: + return None + + book_id = m.group(1) + + # Extract vol (bookinfo//.html) + m2 = re.search(r"/bookinfo/(\d+)/", url) + volume = m2.group(1) if m2 else None + + imgs = soup.find_all("img", src=True) + chosen = None + + # Priority 1: match "/files/article/image/{vol}/{book_id}/" + if volume: + target_path = f"/files/article/image/{volume}/{book_id}/" + for img in imgs: + src = img["src"] + if target_path in src: + chosen = src + break + + # Priority 2: endswith "/{book_id}s.jpg" + if not chosen: + target_suffix = f"/{book_id}s.jpg" + for img in imgs: + src = img["src"] + if src.endswith(target_suffix): + chosen = src + break + + if not chosen: + return None + + return urljoin(self.root, chosen) + + # ------------------------------------------------------------ + # CHAPTER EXTRACTION + # ------------------------------------------------------------ + def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: + node = soup.select_one( + "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table" + ) + href = node.select_one("a").get("href") + return urljoin(self.root, href) + + def parse_chapter_list(self, soup: 
BeautifulSoup) -> list: + cont = soup.select_one(self.chapter_list_selector) + items = cont.select("ul li a[href]") if cont else [] + + chapters = [] + idx = 1 + + for a in items: + href = a.get("href") + if not href.endswith(".html"): + continue + title = a.get_text(strip=True) + full_url = urljoin(self.root, href) + chapters.append({"num": idx, "title": title, "url": full_url}) + idx += 1 + + return chapters diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py index d80d2f1..18fcf55 100644 --- a/bookscraper/scraper/tasks/audio_tasks.py +++ b/bookscraper/scraper/tasks/audio_tasks.py @@ -21,7 +21,7 @@ redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND") parsed = urlparse(redis_url) # ------------------------------------------------------------ -# REGULIER REDIS CLIENT (slots, file checks, state) +# REGULIER REDIS CLIENT (slots, file checks, dstate) # ------------------------------------------------------------ redis_client = Redis( host=parsed.hostname, diff --git a/bookscraper/static/js/init_book.js b/bookscraper/static/js/init_book.js new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/templates/components/init_book_form.html b/bookscraper/templates/components/init_book_form.html new file mode 100644 index 0000000..e69de29 diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html index a8a4b76..e9fde05 100644 --- a/bookscraper/templates/index.html +++ b/bookscraper/templates/index.html @@ -1,34 +1,53 @@ - - + + BookScraper - - + + +

[index.html template hunk unrecoverable: the HTML markup was stripped during extraction; the surviving fragments show only the page title "BookScraper" and the "BookScraper WebGUI" header.]
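For orientation, a minimal sketch of how the pieces added above fit together in the INIT flow. The URL is illustrative, and the sketch assumes it runs where the SQLite file and static/covers/ are writable:

    # Minimal INIT-flow sketch (illustrative URL; not part of the diff above).
    from db.db import init_db
    from db.repository import get_registered_books
    from scraper.services.init_service import InitService

    init_db()  # creates the books table and applies the description-column upgrade

    result = InitService.execute("https://www.ptwxz.com/bookinfo/12/12345.html")
    print(result["book_id"], result["status"])  # -> 12345 registered

    # The dashboard's "registered" panel is populated from this query:
    for book in get_registered_books():
        print(book["title"], book["status"])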