init works.

feature/bookstate-progress-fix
peter.fong 1 week ago
parent f7f08fa45c
commit 292c9246a1

.gitignore

@@ -12,3 +12,5 @@
.env
**/.env
log.txt
**/static/covers/

@@ -1,41 +1,36 @@
# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
from dotenv import load_dotenv
load_dotenv()
import os
import redis
from flask import Flask, render_template, request, jsonify, send_from_directory
print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from db.db import init_db
from celery.result import AsyncResult
init_db() # ensure DB schema exists before Flask starts
from flask import Flask, render_template, request, jsonify
from scraper.logger import log_debug
# Abort + Progress (per book_id)
from scraper.abort import set_abort
from scraper.progress import get_progress
# UI LOGS (GLOBAL — no book_id)
from scraper.ui_log import get_ui_logs, reset_ui_logs
from celery.result import AsyncResult
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.state import state as r
# Cover serving
from flask import send_from_directory
import os
from scraper.services.init_service import InitService
import redis
from db.repository import get_registered_books
# INIT DB
init_db()
# Flask
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING FOR OUTPUT
# STATIC FILE SERVING
# =====================================================
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
@@ -46,26 +41,96 @@ def serve_output(filename):
# =====================================================
# HOME PAGE
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
def index():
return render_template("index.html")
@app.route("/dashboard", methods=["GET"])
def dashboard():
logs_list = get_ui_logs() or []
return render_template(
"dashboard/dashboard.html",
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
logs=logs_list,
)
@app.route("/book/<book_id>")
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
return render_template(
"dashboard/book_detail.html",
book_id=book_id,
title=title,
logs=get_ui_logs(),
)
# =====================================================
# START SCRAPING (async via Celery)
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT)
# =====================================================
# CORRECT PATH — services/ is root-level
@app.route("/init", methods=["POST"])
def init_book():
"""
INIT-flow:
- user enters URL
- lightweight metadata fetch
- insert into SQLite as 'registered'
- return dashboard HTML (NOT JSON)
"""
url = request.form.get("url", "").strip()
if not url:
return render_template(
"dashboard/dashboard.html",
error="Geen URL opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
try:
result = InitService.execute(url)
msg = f"Boek geregistreerd: {result.get('title')}"
return render_template(
"dashboard/dashboard.html",
message=msg,
books=list_active_books(), # Redis
registered=get_registered_books(), # SQLite INIT results
logs=get_ui_logs(),
)
except Exception as e:
log_debug(f"[INIT] ERROR: {e}")
return render_template(
"dashboard/dashboard.html",
error=f"INIT mislukt: {e}",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
@app.route("/start", methods=["POST"])
def start_scraping():
url = request.form.get("url", "").strip()
if not url:
# ★ FIX: the dashboard must always be passed books + logs
return render_template(
"dashboard/dashboard.html",
error="Geen URL opgegeven.",
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
@@ -78,27 +143,15 @@ def start_scraping():
queue="scraping",
)
# ★ FIX: render the dashboard immediately with current data
return render_template(
"dashboard/dashboard.html",
scraping_task_id=async_result.id,
books=list_active_books(),
registered=get_registered_books(),
logs=get_ui_logs(),
)
# =====================================================
# CLEAR UI LOGS
# =====================================================
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
# =====================================================
# ABORT (per book_id)
# =====================================================
@app.route("/abort/<book_id>", methods=["POST"])
def abort_download(book_id):
log_debug(f"[WEB] Abort requested for book: {book_id}")
@@ -107,87 +160,10 @@ def abort_download(book_id):
# =====================================================
# PROGRESS (per book_id)
# =====================================================
@app.route("/progress/<book_id>", methods=["GET"])
def progress(book_id):
return jsonify(get_progress(book_id))
# =====================================================
# CELERY RESULT → return book_id
# =====================================================
@app.route("/celery-result/<task_id>", methods=["GET"])
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
return jsonify({"ready": False})
# =====================================================
# API: book status new model
# =====================================================
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
au_total = dl_total
return {
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
# =====================================================
# REDIS BACKEND — BOOK STATE MODEL
# SECTION 3 — API ROUTES (JSON)
# =====================================================
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def list_active_booksold():
"""Return list of active books from Redis Book State Model."""
keys = r.keys("book:*:state")
books = []
for key in keys:
book_id = key.split(":")[1]
print(book_id)
books.append(getStatus(book_id))
return books
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
return books
# =====================================================
# API: list all active books
# =====================================================
@app.route("/api/books")
def api_books():
return jsonify(list_active_books())
@@ -195,45 +171,50 @@ def api_books():
@app.route("/api/book/<book_id>/status")
def api_book_status(book_id):
return jsonify(getStatus(book_id))
# =====================================================
# API: book logs
# =====================================================
@app.route("/api/book/<book_id>/logs")
def api_book_logs(book_id):
logs = r.lrange(f"logs:{book_id}", 0, -1) or []
return jsonify(logs)
# =====================================================
# VIEW: DASHBOARD
# =====================================================
@app.route("/dashboard")
def dashboard():
logs_list = get_ui_logs() or []
# ★ FIX: the dashboard must always receive books + logs
return render_template(
"dashboard/dashboard.html",
books=list_active_books(),
logs=logs_list, # the dashboard gets a LIST, not a dict
)
@app.route("/progress/<book_id>")
def progress(book_id):
return jsonify(get_progress(book_id))
@app.route("/celery-result/<task_id>")
def celery_result(task_id):
result = AsyncResult(task_id, app=celery_app)
if result.successful():
return jsonify({"ready": True, "result": result.get()})
if result.failed():
return jsonify({"ready": True, "error": "failed"})
return jsonify({"ready": False})
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
@app.route("/logs", methods=["GET"])
def logs():
try:
last_index = int(request.args.get("last_index", -1))
except (TypeError, ValueError):
last_index = -1
new_lines, total = get_ui_logs_delta(last_index)
return jsonify({"lines": new_lines, "total": total})
# =====================================================
# VIEW: BOOK DETAIL PAGE
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/book/<book_id>")
def book_detail(book_id):
title = r.get(f"book:{book_id}:title") or book_id
return render_template(
"dashboard/book_detail.html",
book_id=book_id,
title=title,
logs=get_ui_logs(),
)
@app.route("/debug/redis-keys")
@@ -254,37 +235,65 @@ def debug_redis_keys():
return jsonify(results)
# ============================================================
# Rolling log endpoint (no new file)
# ============================================================
from flask import jsonify, request
# =====================================================
# ROLLING LOG ENDPOINT — DELTA POLLING VIA ui_log
# =====================================================
from scraper.ui_log import get_ui_logs_delta
@app.route("/logs", methods=["GET"])
def logs():
"""
Delta log delivery for WebGUI.
Browser sends ?last_index=N, we return only new lines.
"""
try:
last_index = int(request.args.get("last_index", -1))
except:
last_index = -1
new_lines, total = get_ui_logs_delta(last_index)
return jsonify({"lines": new_lines, "total": total})
# =====================================================
# DB DEBUG: LIST ALL BOOKS FROM SQLITE
# =====================================================
from db.repository import fetch_all_books
@app.route("/api/db/books")
def api_db_books():
"""
Return ALL books stored in SQLite including INIT-only entries.
Useful to verify that /init wrote correct metadata.
"""
try:
books = fetch_all_books()
return jsonify({"status": "ok", "books": books})
except Exception as e:
return jsonify({"status": "error", "message": str(e)}), 500
# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
return {
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": dl_total,
}
def list_active_books():
books = []
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
return books
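# -----------------------------------------------------------------
# Hedged sketch of the Redis writes that getStatus() and
# list_active_books() above expect. The key pattern book:<id>:state
# and the field names are taken from getStatus(); the connection URL,
# the example values and this helper name are illustrative only.
# Relies on the os/redis imports already at the top of this file.
# -----------------------------------------------------------------
def _example_seed_book_state(book_id="12345"):
    rr = redis.Redis.from_url(
        os.getenv("REDIS_BROKER", "redis://redis:6379/0"), decode_responses=True
    )
    rr.hset(
        f"book:{book_id}:state",
        mapping={
            "title": "Example Book",
            "status": "downloading",
            "chapters_total": 100,
            "chapters_download_done": 0,
            "chapters_download_skipped": 0,
            "audio_done": 0,
        },
    )
    # A worker would bump the counter as each chapter finishes:
    rr.hincrby(f"book:{book_id}:state", "chapters_download_done", 1)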
# =====================================================
# RUN FLASK
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")

@@ -3,12 +3,10 @@
# Purpose:
# Raw SQLite engine for BookScraper.
# Provides ONLY low-level DB primitives.
# - Connection management (WAL mode)
# - init_db() schema creation
# - Connection management (existing DELETE journal mode)
# - init_db() schema creation + safe schema upgrade
# - upsert_book() atomic write
# - raw fetch helpers (private)
#
# All business logic belongs in repository.py.
# ============================================================
import os
@@ -48,10 +46,14 @@ def enable_wal_mode(conn):
# ------------------------------------------------------------
# Schema creation
# Schema creation + SAFE schema upgrades
# ------------------------------------------------------------
def init_db():
conn = get_db()
# --------------------------------------------------------
# BASE SCHEMA (unchanged)
# --------------------------------------------------------
conn.execute(
"""
CREATE TABLE IF NOT EXISTS books (
@@ -76,14 +78,21 @@ def init_db():
)
conn.commit()
# --------------------------------------------------------
# SCHEMA UPGRADE: add description column if missing
# --------------------------------------------------------
cols = conn.execute("PRAGMA table_info(books);").fetchall()
colnames = [c[1] for c in cols]
if "description" not in colnames:
conn.execute("ALTER TABLE books ADD COLUMN description TEXT;")
conn.commit()
# ------------------------------------------------------------
# WRITE OPERATIONS
# ------------------------------------------------------------
def upsert_book(book_id, **fields):
"""
Raw upsert primitive. Repository layer should call this.
"""
conn = get_db()
keys = ["book_id"] + list(fields.keys())
@@ -115,5 +124,6 @@ def _raw_get_book(book_id):
def _raw_get_all_books():
conn = get_db()
# unchanged
cur = conn.execute("SELECT * FROM books ORDER BY created_at DESC;")
return [dict(row) for row in cur.fetchall()]

@@ -4,18 +4,20 @@
# High-level BookScraper database interface.
# This is the ONLY module Celery tasks and Flask should use.
#
# Uses low-level primitives from db.db, but exposes
# domain-level operations:
# - fetch_book / fetch_all_books
# - create_or_update_book
# - set_status
# - incrementing counters
# New additions for INIT-flow:
# - register_book()
# - update_book_after_full_scrape()
# - get_registered_books()
# - get_active_books()
#
# Existing functions remain unchanged for backward compatibility.
# ============================================================
from db.db import (
upsert_book,
_raw_get_book,
_raw_get_all_books,
get_db,
)
@@ -32,8 +34,84 @@ def fetch_all_books():
return _raw_get_all_books()
# ============================================================
# NEW — INIT-FLOW SUPPORT
# ============================================================
def register_book(book_id, title, author=None, description=None, cover_url=None):
"""
Create a new book entry with initial metadata.
Called when user enters a URL and presses INIT.
"""
fields = {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"status": "registered",
}
upsert_book(book_id, **fields)
def update_book_after_full_scrape(
book_id,
title=None,
author=None,
description=None,
cover_url=None,
chapters_total=None,
):
"""
Called after a FULL scrape when chapters are known.
Moves the book into 'active' state.
"""
fields = {}
if title is not None:
fields["title"] = title
if author is not None:
fields["author"] = author
if description is not None:
fields["description"] = description
if cover_url is not None:
fields["cover_url"] = cover_url
if chapters_total is not None:
fields["chapters_total"] = chapters_total
fields["status"] = "active"
upsert_book(book_id, **fields)
def get_registered_books():
"""
Return books registered but not yet scraped.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books WHERE status='registered'
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
def get_active_books():
"""
Return books currently in progress.
"""
conn = get_db()
cur = conn.execute(
"""SELECT * FROM books
WHERE status IN ('active', 'downloading')
ORDER BY created_at DESC"""
)
return [dict(row) for row in cur.fetchall()]
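# -----------------------------------------------------------------
# Hedged sketch of the intended 'registered' -> 'active' lifecycle
# using the helpers above. The book_id and metadata values are
# illustrative; only the function names and status values come from
# this module.
# -----------------------------------------------------------------
def _example_init_to_active_flow():
    register_book(
        book_id="12345",
        title="Example Book",
        author="Example Author",
        description="Short synopsis.",
        cover_url=None,
    )
    assert any(b["book_id"] == "12345" for b in get_registered_books())
    # After the full scrape has produced the chapter list:
    update_book_after_full_scrape(book_id="12345", chapters_total=100)
    assert any(b["book_id"] == "12345" for b in get_active_books())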
# ------------------------------------------------------------
# BOOK CREATION / METADATA
# BOOK CREATION / METADATA (existing)
# ------------------------------------------------------------
def create_or_update_book(
book_id,
@@ -64,14 +142,14 @@ def create_or_update_book(
# ------------------------------------------------------------
# STATUS MANAGEMENT
# STATUS MANAGEMENT (existing)
# ------------------------------------------------------------
def set_status(book_id, status):
upsert_book(book_id, status=status)
# ------------------------------------------------------------
# INCREMENTING COUNTERS (atomic)
# INCREMENTING COUNTERS (existing — backward compat only)
# ------------------------------------------------------------
def inc_downloaded(book_id, amount=1):
book = _raw_get_book(book_id)

@@ -1,202 +1,21 @@
# scraper/book_scraper.py
# ============================================================
# File: scraper/book_scraper.py
# Purpose:
# Backwards-compatible wrapper giving same API as before.
# Uses the new engine under the hood.
# ============================================================
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
from scraper.models.book_state import Chapter
from scraper.engine.parser import extract_metadata_full
class BookScraper:
"""
Minimal scraper: only metadata + chapter list.
The DownloadController handles Celery pipelines for:
- download
- parse
- save
"""
def __init__(self, site, url):
self.site = site
def __init__(self, site_scraper, url):
self.site = site_scraper
self.url = url
self.book_title = ""
self.book_author = ""
self.book_description = ""
self.cover_url = ""
self.chapter_base = None
self.chapters = []
# Load custom replacements
extra = load_replacements("replacements.txt")
self.site.replacements.update(extra)
# ------------------------------------------------------------
def execute(self):
"""Main entry point. Returns metadata + chapter URLs."""
soup = self._fetch(self.url)
self._parse_title(soup)
self._parse_author(soup)
self._parse_description(soup)
self._parse_cover(soup)
chapter_page = self.get_chapter_page(soup)
self.parse_chapter_links(chapter_page)
log_debug(f"[BookScraper] Completed metadata parse")
return {
"title": self.book_title,
"author": self.book_author,
"description": self.book_description,
"cover_url": self.cover_url, # ← used by DownloadController
"book_url": self.url,
"chapters": [
{"num": ch.number, "title": ch.title, "url": ch.url}
for ch in self.chapters
],
}
# ------------------------------------------------------------
def _fetch(self, url):
log_debug(f"[BookScraper] Fetch: {url}")
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.encoding = self.site.encoding
return BeautifulSoup(resp.text, "lxml")
# ------------------------------------------------------------
def _parse_title(self, soup):
h1 = soup.find("h1")
self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
log_debug(f"[BookScraper] Title = {self.book_title}")
def _parse_author(self, soup):
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
self.book_author = raw.split("")[1] if "" in raw else "UnknownAuthor"
log_debug(f"[BookScraper] Author = {self.book_author}")
def _parse_description(self, soup):
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
self.book_description = ""
log_debug("[BookScraper] Description not found")
return
parts = []
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
self.book_description = clean_text("\n".join(parts), self.site.replacements)
log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
# ------------------------------------------------------------
def _parse_cover(self, soup):
"""
Extract correct cover based on book_id path logic.
1. primary: match "/files/article/image/{vol}/{book_id}/"
2. fallback: endswith "/{book_id}s.jpg"
Backwards compatible full scrape:
returns {title, author, description, cover_url, chapters, book_url}
"""
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", self.url)
if not m:
log_debug("[BookScraper] No book_id found in URL → cannot match cover")
return
book_id = m.group(1)
# Extract vol folder from URL (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", self.url)
volume = m2.group(1) if m2 else None
log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
imgs = soup.find_all("img", src=True)
chosen = None
# --------------------------------------------------------
# PRIORITY 1: Path-match
# /files/article/image/{vol}/{book_id}/
# --------------------------------------------------------
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
log_debug(f"[BookScraper] Cover matched by PATH: {src}")
break
# --------------------------------------------------------
# PRIORITY 2: endswith "/{book_id}s.jpg"
# --------------------------------------------------------
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
break
# --------------------------------------------------------
# No match
# --------------------------------------------------------
if not chosen:
log_debug("[BookScraper] No matching cover found")
return
self.cover_url = urljoin(self.site.root, chosen)
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
# ------------------------------------------------------------
def get_chapter_page(self, soup):
"""Return BeautifulSoup of the main chapter list page."""
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
chapter_url = urljoin(self.site.root, href)
# base for chapter links
parts = chapter_url.rsplit("/", 1)
self.chapter_base = parts[0] + "/"
return self._fetch(chapter_url)
# ------------------------------------------------------------
def parse_chapter_links(self, soup):
cont = soup.select_one(self.site.chapter_list_selector)
items = cont.select("ul li a[href]")
self.chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full = urljoin(self.chapter_base, href)
self.chapters.append(Chapter(idx, title, full))
idx += 1
log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")
return extract_metadata_full(self.url, self.site)

@@ -0,0 +1,27 @@
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
# Low-level HTML fetch utility shared by all site scrapers.
# Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
import requests
from bs4 import BeautifulSoup
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
"Gecko/20100101 Firefox/118.0"
)
}
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
"""
Fetch HTML with a consistent user-agent and encoding.
Returns BeautifulSoup(lxml).
"""
resp = requests.get(url, headers=HEADERS, timeout=timeout)
resp.encoding = encoding
return BeautifulSoup(resp.text, "lxml")

@@ -0,0 +1,65 @@
# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
"""
Extract ONLY lightweight metadata:
- title
- author
- description
- cover_url
- chapters_total = 0
"""
soup = fetch_html(url, site_scraper.encoding)
title = site_scraper.parse_title(soup)
author = site_scraper.parse_author(soup)
description = site_scraper.parse_description(soup)
cover_url = site_scraper.parse_cover(soup, url)
return {
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"chapters_total": 0,
"book_url": url,
}
def extract_metadata_full(url: str, site_scraper):
"""
Full scrape (metadata + chapter list).
Used by the scraping Celery pipeline.
"""
soup = fetch_html(url, site_scraper.encoding)
# metadata
meta = extract_metadata_only(url, site_scraper)
# chapter list
chapter_page_url = site_scraper.extract_chapter_page_url(soup)
chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
chapters = site_scraper.parse_chapter_list(chapter_page_soup)
meta["chapters"] = chapters
return meta
def build_book_id(title: str) -> str:
"""
Canonical book_id generator.
SCRAPE currently uses the title as the ID; preserve that behavior.
"""
return title
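# -----------------------------------------------------------------
# Hedged usage sketch tying the engine to a site scraper. The URL is
# a placeholder that merely matches the ptwxz.com pattern handled by
# get_scraper_for_url() in scraper/sites/__init__.py; the helper name
# is illustrative.
# -----------------------------------------------------------------
def _example_engine_usage(url="https://www.ptwxz.com/bookinfo/10/10123.html"):
    from scraper.sites import get_scraper_for_url
    site = get_scraper_for_url(url)           # -> PiaotianScraper for ptwxz/piaotian URLs
    meta = extract_metadata_only(url, site)   # INIT flow: no chapter list
    full = extract_metadata_full(url, site)   # scraping pipeline: includes "chapters"
    return build_book_id(meta["title"]), len(full["chapters"])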

@@ -0,0 +1,44 @@
# ============================================================
# File: scraper/services/cover_service.py
# ============================================================
import os
import requests
from logbus.publisher import log
class CoverService:
@staticmethod
def download_main_cover(cover_url: str, book_id: str) -> str | None:
"""
Downloads cover image into: static/covers/<book_id>.jpg.
Returns local path or None.
"""
if not cover_url:
log(f"[COVER] No cover URL for book={book_id}")
return None
static_dir = os.path.join("static", "covers")
os.makedirs(static_dir, exist_ok=True)
dst_path = os.path.join(static_dir, f"{book_id}.jpg")
try:
log(f"[COVER] Downloading: {cover_url}")
resp = requests.get(
cover_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
)
resp.raise_for_status()
with open(dst_path, "wb") as f:
f.write(resp.content)
log(f"[COVER] Stored: {dst_path}")
return dst_path
except Exception as e:
log(f"[COVER] FAILED ({cover_url}) → {e}")
return None

@@ -0,0 +1,74 @@
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_id
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
class InitService:
@staticmethod
def derive_book_id(url: str) -> str:
"""
PTWXZ URLs end with /{id}.html.
If there is no match, fall back to a sanitized URL.
"""
m = re.search(r"/(\d+)\.html$", url)
if m:
return m.group(1)
return url.replace("/", "_")
@staticmethod
def execute(url: str) -> dict:
"""
Main INIT-flow entry point.
Returns complete metadata + registration info.
"""
# 1) Determine which BookSite applies
site = SiteResolver.resolve(url)
# 2) Metadata only (no chapters)
meta = ScrapeEngine.fetch_metadata_only(site, url)
title = meta.get("title") or "Unknown"
author = meta.get("author")
description = meta.get("description")
cover_url = meta.get("cover_url")
# 3) Determine book_id
book_id = InitService.derive_book_id(url)
# 4) SQLite registration
register_book(
book_id=book_id,
title=title,
author=author,
description=description,
cover_url=cover_url,
)
# 5) Download UI cover
CoverService.download_main_cover(cover_url, book_id)
# 6) Structured output for UI
return {
"book_id": book_id,
"title": title,
"author": author,
"description": description,
"cover_url": cover_url,
"status": "registered",
}

@@ -0,0 +1,33 @@
# ============================================================
# File: scraper/services/scrape_engine.py
# Purpose:
# Provide unified scraping methods for INIT-flow.
# Reuses BookScraper internally with ZERO duplication.
# ============================================================
from scraper.book_scraper import BookScraper
class ScrapeEngine:
"""
Adapter layer around BookScraper.
Lets the INIT flow work with ONLY metadata; the chapter list is stripped from the result.
"""
@staticmethod
def fetch_metadata_only(site, url: str) -> dict:
"""
Execute BookScraper but return ONLY metadata.
Chapters are intentionally removed.
"""
scraper = BookScraper(site, url)
result = scraper.execute() # returns full metadata + chapters
# Strip the chapter list — the INIT flow does not need it
return {
"title": result.get("title"),
"author": result.get("author"),
"description": result.get("description"),
"cover_url": result.get("cover_url"),
"book_url": url,
}

@@ -0,0 +1,20 @@
# ============================================================
# File: scraper/services/site_resolver.py
# Purpose:
# Determine which BookSite implementation applies for a given URL.
# This keeps INIT-flow and SCRAPE-flow site-agnostic.
# ============================================================
from scraper.sites import BookSite # current PTWXZ implementation
class SiteResolver:
"""
Resolves the correct BookSite class based on URL.
Currently only PTWXZ/Piaotian is supported.
"""
@staticmethod
def resolve(url: str):
# Later: add more domain rules for other sources
return BookSite()

@@ -0,0 +1,28 @@
# ============================================================
# File: scraper/sites/__init__.py
# Purpose:
# Site autodetection based on URL.
# ============================================================
from scraper.sites.piaotian import PiaotianScraper
def get_scraper_for_url(url: str):
"""
Return the correct scraper instance for a given URL.
Later: add more site implementations.
"""
if "ptwxz" in url or "piaotian" in url:
return PiaotianScraper()
raise ValueError(f"No scraper available for URL: {url}")
# ============================================================
# Backwards-compatibility export for legacy BookScraper
# ============================================================
# Old code expects:
# from scraper.sites import BookSite
# We map that to our new PiaotianScraper implementation.
BookSite = PiaotianScraper

@@ -0,0 +1,51 @@
# ============================================================
# File: scraper/sites/base.py
# Purpose:
# Abstract interface that every site-specific scraper must implement.
# ============================================================
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
class SiteScraper(ABC):
"""
Defines the interface for site-specific scrapers.
Each concrete scraper (Piaotian, Biquge, etc.) must implement these.
"""
@property
@abstractmethod
def root(self) -> str: ...
@property
@abstractmethod
def encoding(self) -> str: ...
@property
@abstractmethod
def chapter_list_selector(self) -> str: ...
# --------------------------
# Metadata extraction
# --------------------------
@abstractmethod
def parse_title(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_author(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_description(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None: ...
# --------------------------
# Chapter extraction
# --------------------------
@abstractmethod
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str: ...
@abstractmethod
def parse_chapter_list(self, soup: BeautifulSoup) -> list: ...

@@ -0,0 +1,120 @@
# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
from scraper.sites.base import SiteScraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
class PiaotianScraper(SiteScraper):
root = "https://www.ptwxz.com"
encoding = "GB18030"
chapter_list_selector = "div.centent"
# ------------------------------------------------------------
# METADATA PARSING
# ------------------------------------------------------------
def parse_title(self, soup: BeautifulSoup) -> str:
h1 = soup.find("h1")
return h1.get_text(strip=True) if h1 else "UnknownBook"
def parse_author(self, soup: BeautifulSoup) -> str:
td = soup.find("td", string=lambda t: t and "" in t)
raw = td.get_text(strip=True) if td else ""
return raw.split("")[1] if "" in raw else "UnknownAuthor"
def parse_description(self, soup: BeautifulSoup) -> str:
span = soup.find("span", string=lambda t: t and "内容简介" in t)
if not span:
return ""
parts = []
for sib in span.next_siblings:
# stop when next <span> reappears
if getattr(sib, "name", None) == "span":
break
text = (
sib.get_text(strip=True)
if hasattr(sib, "get_text")
else str(sib).strip()
)
if text:
parts.append(text)
return "\n".join(parts)
# ------------------------------------------------------------
# COVER PARSING
# (exactly your BookScraper._parse_cover logic)
# ------------------------------------------------------------
def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None:
# Extract book_id from URL
m = re.search(r"/(\d+)\.html$", url)
if not m:
return None
book_id = m.group(1)
# Extract vol (bookinfo/<vol>/<id>.html)
m2 = re.search(r"/bookinfo/(\d+)/", url)
volume = m2.group(1) if m2 else None
imgs = soup.find_all("img", src=True)
chosen = None
# Priority 1: match "/files/article/image/{vol}/{book_id}/"
if volume:
target_path = f"/files/article/image/{volume}/{book_id}/"
for img in imgs:
src = img["src"]
if target_path in src:
chosen = src
break
# Priority 2: endswith "/{book_id}s.jpg"
if not chosen:
target_suffix = f"/{book_id}s.jpg"
for img in imgs:
src = img["src"]
if src.endswith(target_suffix):
chosen = src
break
if not chosen:
return None
return urljoin(self.root, chosen)
# ------------------------------------------------------------
# CHAPTER EXTRACTION
# ------------------------------------------------------------
def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
node = soup.select_one(
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
)
href = node.select_one("a").get("href")
return urljoin(self.root, href)
def parse_chapter_list(self, soup: BeautifulSoup) -> list:
cont = soup.select_one(self.chapter_list_selector)
items = cont.select("ul li a[href]") if cont else []
chapters = []
idx = 1
for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
title = a.get_text(strip=True)
full_url = urljoin(self.root, href)
chapters.append({"num": idx, "title": title, "url": full_url})
idx += 1
return chapters

@@ -21,7 +21,7 @@ redis_url = os.getenv("REDIS_BACKEND_LOCAL") or os.getenv("REDIS_BACKEND")
parsed = urlparse(redis_url)
# ------------------------------------------------------------
# REGULAR REDIS CLIENT (slots, file checks, state)
# REGULAR REDIS CLIENT (slots, file checks, state)
# ------------------------------------------------------------
redis_client = Redis(
host=parsed.hostname,

@@ -1,34 +1,53 @@
<!DOCTYPE html>
<html lang="nl">
<head>
<meta charset="UTF-8">
<head>
<meta charset="UTF-8" />
<title>BookScraper</title>
<style>
body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
h1 { margin-bottom: 20px; }
body {
font-family: Arial, sans-serif;
padding: 40px;
max-width: 600px;
margin: auto;
}
h1 {
margin-bottom: 20px;
}
input[type="text"] {
width: 100%; padding: 12px; font-size: 16px;
border: 1px solid #ccc; border-radius: 6px;
width: 100%;
padding: 12px;
font-size: 16px;
border: 1px solid #ccc;
border-radius: 6px;
}
button {
margin-top: 20px;
padding: 12px 20px;
background: #007bff; color: white;
border: none; border-radius: 6px;
font-size: 16px; cursor: pointer;
background: #007bff;
color: white;
border: none;
border-radius: 6px;
font-size: 16px;
cursor: pointer;
}
button:hover {
background: #0056b3;
}
button:hover { background: #0056b3; }
</style>
</head>
<body>
</head>
<body>
<h1>BookScraper WebGUI</h1>
<h1>BookScraper WebGUI</h1>
<form action="/start" method="POST">
<label for="url">Geef een boek-URL op:</label><br><br>
<input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
<form action="/init" method="POST">
<label for="url">Geef een boek-URL op:</label><br /><br />
<input
type="text"
id="url"
name="url"
placeholder="https://example.com/book/12345"
required
/>
<button type="submit">Start Scraping</button>
</form>
</body>
</form>
</body>
</html>
