From 8e2d3cec49104a377d00b7f7ed56167e80ebe177 Mon Sep 17 00:00:00 2001
From: "peter.fong"
Date: Mon, 1 Dec 2025 11:13:18 +0000
Subject: [PATCH] Refactor: new Celery pipeline + scraper/tasks architecture

---
 bookscraper/.gitignore                        |   1 +
 bookscraper/CHATGPT_CONTEXT.md                |   6 +
 bookscraper/app.py                            |  80 +++++-----
 bookscraper/app_routes/download.py            |  18 +++
 bookscraper/celery_app.py                     |  36 ++++-
 bookscraper/logbus/publisher.py               |   2 +-
 bookscraper/scraper/download_controller.py    |  39 +++++
 .../{queue => scraper/models}/__init__.py     |   0
 .../models/book_state.py}                     |   0
 .../utils.py => scraper/tasks/__init__.py}    |   0
 bookscraper/{ => scraper}/tasks/audio.py      |   0
 bookscraper/scraper/tasks/download_tasks.py   |  33 +++++
 bookscraper/scraper/tasks/parse_tasks.py      |  57 +++++++
 bookscraper/scraper/tasks/pipeline.py         |  28 ++++
 bookscraper/scraper/tasks/save_tasks.py       |  57 +++++++
 bookscraper/scraper/tasks/scraping.py         |  35 +++++
 bookscraper/scraper/tasks/utils.py            |   0
 bookscraper/taskqueue/__init__.py             |   0
 bookscraper/{queue => taskqueue}/add_audio.py |   0
 bookscraper/{queue => taskqueue}/add_book.py  |   0
 bookscraper/{queue => taskqueue}/models.py    |   0
 bookscraper/tasks/pipeline.py                 |  10 --
 bookscraper/tasks/scraping.py                 |  17 ---
 bookscraper/templates/index.html              |  50 +++----
 bookscraper/templates/result.html             |  59 +++++++-
 bookscraper/worker/download_worker.py         |  11 ++
 bookscraper/worker/downloader.py              | 139 ++++++++++++++++++
 27 files changed, 563 insertions(+), 115 deletions(-)
 create mode 100644 bookscraper/.gitignore
 create mode 100644 bookscraper/CHATGPT_CONTEXT.md
 create mode 100644 bookscraper/app_routes/download.py
 create mode 100644 bookscraper/scraper/download_controller.py
 rename bookscraper/{queue => scraper/models}/__init__.py (100%)
 rename bookscraper/{tasks/__init__.py => scraper/models/book_state.py} (100%)
 rename bookscraper/{tasks/utils.py => scraper/tasks/__init__.py} (100%)
 rename bookscraper/{ => scraper}/tasks/audio.py (100%)
 create mode 100644 bookscraper/scraper/tasks/download_tasks.py
 create mode 100644 bookscraper/scraper/tasks/parse_tasks.py
 create mode 100644 bookscraper/scraper/tasks/pipeline.py
 create mode 100644 bookscraper/scraper/tasks/save_tasks.py
 create mode 100644 bookscraper/scraper/tasks/scraping.py
 create mode 100644 bookscraper/scraper/tasks/utils.py
 create mode 100644 bookscraper/taskqueue/__init__.py
 rename bookscraper/{queue => taskqueue}/add_audio.py (100%)
 rename bookscraper/{queue => taskqueue}/add_book.py (100%)
 rename bookscraper/{queue => taskqueue}/models.py (100%)
 delete mode 100644 bookscraper/tasks/pipeline.py
 delete mode 100644 bookscraper/tasks/scraping.py
 create mode 100644 bookscraper/worker/download_worker.py
 create mode 100644 bookscraper/worker/downloader.py

diff --git a/bookscraper/.gitignore b/bookscraper/.gitignore
new file mode 100644
index 0000000..ea1472e
--- /dev/null
+++ b/bookscraper/.gitignore
@@ -0,0 +1 @@
+output/

diff --git a/bookscraper/CHATGPT_CONTEXT.md b/bookscraper/CHATGPT_CONTEXT.md
new file mode 100644
index 0000000..399d43d
--- /dev/null
+++ b/bookscraper/CHATGPT_CONTEXT.md
@@ -0,0 +1,6 @@
+# ChatGPT Project Context – Bookscraper / Celery Branch
+
+## 1. Scraper Status (STABLE – do NOT modify without permission)
+
+The Python-based bookscraper works fully end-to-end.
+The following components are tested, stable, and may not be rewritten or cleaned up without explicit permission:
+
+### prepare_scripts()
+Generates 3 shell scripts in the output directory:
+- say.txt — TTS only (Sinji voice, timestamps)
+- makebook.txt — m4b merge + move only
+- allinone.txt — TTS + merge + move
+
+### Volume structure
+output///v1, v2, v3, …
+
+### Chapter output
+- Chapter 1 contains a header
+- All other chapters contain text only
+
+### Working functionality
+- Rate limiter
+- Chapter parsing
+- Description parsing
+- Cover download
+- Skip logic
+- prepare_scripts()
+
+## 2. Development rules for ChatGPT
+- No silent rewrites
+- No cleanup of working code
+- No restructuring without permission
+- Changes minimal and targeted
+- Diff/patch style
+- State exactly which files are touched
+- Preserve the directory structure
+
+## 3. Focus area (celery_branch)
+- Improve the Celery worker architecture
+- Queueing & retry policies
+- Stability & observability
+- Integration with the scraping tasks
+- Download functionality first; audio later
+
+## 4. Environment
+- VS Code Dev Containers
+- Docker Compose available
+- Celery + Redis in use
+- Redis via the host machine: redis://host.docker.internal:6379
+
+## 5. Download Worker (NEW – working)
+- download_worker.py fully operational
+- Tasks are registered correctly:
+  - tasks.scraping.download_chapter_task
+  - tasks.scraping.download_chapter
+  - tasks.scraping.scrape_book
+- Redis broker/backend loaded correctly from .env
+- The worker picks up tasks and executes them
+- The download pipeline works end-to-end
+
+## 6. Logbus Status (NEW)
+- The logbus now uses REDIS_BACKEND or REDIS_BROKER
+- No more crashes
+- Logging is non-blocking
+
+## 7. Still to build (download-only phase)
+- DownloadController for bulk downloads
+- Flask API endpoint for downloads
+- scrape_worker / audio_worker later
+
+## 8. Project tree (summary)
+scraper/ tasks/ worker/ logbus/ app.py docker-compose.yml
+The OUTPUT structure is preserved.
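+
+Helper to print the project tree (pruning output/ and __pycache__):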
+find . \
+  -path "./output" -prune -o \
+  -path "*/__pycache__" -prune -o \
+  -print | sed -e 's;[^/]*/; |;g;s;|;|--;'

diff --git a/bookscraper/app.py b/bookscraper/app.py
index 95ea814..f6f4377 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -1,64 +1,52 @@
-# app.py
-from flask import Flask, request, Response, render_template
-import time
-import queue
-import redis
-import os
-
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-from scraper.logger import add_listener, remove_listener, LOG_BUFFER
-
-app = Flask(__name__)
-
-r = redis.Redis.from_url("redis://redis:6379/0")
+# ============================================
+# File: bookscraper/app.py
+# Ensure project directory is on PYTHONPATH
+# ============================================
+
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Add this directory (bookscraper/) to the Python import path
+PROJECT_ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+# Load .env BEFORE the imports below pull in the Celery app
+load_dotenv()
+
+from flask import Flask, render_template, request
+
+from scraper.logger import log_debug
+from scraper.download_controller import DownloadController
+
+app = Flask(__name__)
 
-@app.route("/")
-def index():
-    return render_template("index.html")
+@app.route("/", methods=["GET"])
+def index():
+    return render_template("index.html")
 
-# ----------------------------------------------------------
-# RUN SCRAPER
-# ----------------------------------------------------------
-
-@app.route("/run", methods=["POST"])
-def run_scraper():
-    data = request.json
-    site = BookSite()
-    scraper = BookScraper(site, data["url"])
-    result = scraper.execute()
-    return {
-        "title": result["title"],
-        "buffer": LOG_BUFFER.getvalue()
-    }
+@app.route("/start", methods=["POST"])
+def start_scraping():
+    url = request.form.get("url", "").strip()
+
+    if not url:
+        return render_template("result.html", error="No URL given.")
+
+    try:
+        log_debug(f"[WEB] Start scraping: {url}")
+
+        ctl = DownloadController(url)
+        result = ctl.start()
+
+        return render_template("result.html", result=result, url=url)
+    except Exception as e:
+        log_debug(f"[WEB] ERROR: {e}")
+        return render_template("result.html", error=str(e), url=url)
 
-# ----------------------------------------------------------
-# REALTIME LOG STREAM (SSE)
-# ----------------------------------------------------------
-
-@app.route("/stream")
-def stream():
-
-    def event_stream():
-        pub = r.pubsub()
-        pub.subscribe("logs")
-
-        for msg in pub.listen():
-            if msg["type"] == "message":
-                yield f"data: {msg['data'].decode()}\n\n"
-
-    return Response(event_stream(), mimetype="text/event-stream")
-
-# ----------------------------------------------------------
 
 if __name__ == "__main__":
     debug = os.getenv("FLASK_DEBUG", "0") == "1"
-    host = os.getenv("HOST", "0.0.0.0")
-    port = int(os.getenv("PORT", "5000"))
-
-    app.run(debug=debug, host=host, port=port)
+    app.run(host="0.0.0.0", port=5000, debug=debug)

diff --git a/bookscraper/app_routes/download.py b/bookscraper/app_routes/download.py
new file mode 100644
index 0000000..88e137b
--- /dev/null
+++ b/bookscraper/app_routes/download.py
@@ -0,0 +1,18 @@
+from flask import Blueprint, request, jsonify
+from scraper.download_controller import DownloadController
+
+bp_download = Blueprint("download", __name__)
+
+
+@bp_download.post("/api/download")
+def api_download():
+    data = request.get_json() or {}
+    url = data.get("url")
+
+    if not url:
+        return jsonify({"error": "Missing URL"}), 400
+
+    ctl = DownloadController(url)
+    result = ctl.start()
+
+    return jsonify(result), 200
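+
+# Example request (assumed host/port from app.py's app.run(); note the
+# blueprint still has to be registered on the Flask app, e.g.
+# app.register_blueprint(bp_download) — this patch does not wire that up):
+#
+#   curl -X POST http://localhost:5000/api/download \
+#        -H "Content-Type: application/json" \
+#        -d '{"url": "https://example.com/book/123"}'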
diff --git a/bookscraper/celery_app.py b/bookscraper/celery_app.py
index 5cedbe9..569f0ad 100644
--- a/bookscraper/celery_app.py
+++ b/bookscraper/celery_app.py
@@ -1,13 +1,36 @@
-# celery_app.py
-from celery import Celery
+# ============================================
+# File: bookscraper/celery_app.py
+# ============================================
+
 import os
+
+from celery import Celery
+from dotenv import load_dotenv
+
+# Load environment variables (OK to do here)
+load_dotenv()
+
+print(">>> DEBUG: celery_app.py LOADED")
+print(">>> DEBUG: env REDIS_BROKER =", os.getenv("REDIS_BROKER"))
+print(">>> DEBUG: env REDIS_URL =", os.getenv("REDIS_URL"))
+
+# Read broker settings
+REDIS_BROKER = os.getenv("REDIS_BROKER")
+REDIS_BACKEND = os.getenv("REDIS_BACKEND")
+
+# Fall back ONLY if missing
+if not REDIS_BROKER:
+    REDIS_BROKER = os.getenv(
+        "REDIS_URL", "redis://host.docker.internal:6379/0"
+    )
 
-REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+if not REDIS_BACKEND:
+    REDIS_BACKEND = REDIS_BROKER  # safe fallback
 
+# Create the Celery app AFTER loading .env
 celery_app = Celery(
     "bookscraper",
-    broker=REDIS_URL,
-    backend=REDIS_URL
+    broker=REDIS_BROKER,
+    backend=REDIS_BACKEND,
 )
@@ -15,7 +38,8 @@ celery_app.conf.update(
     task_routes={
+        # NOTE: these patterns match task *names*; tasks moved under
+        # scraper/tasks/ are named "scraper.tasks.…" and rely on the
+        # queue=... option in their @shared_task decorator instead.
         "tasks.scraping.*": {"queue": "scraping"},
         "tasks.audio.*": {"queue": "audio"},
+        "tasks.*": {"queue": "default"},
     },
-    worker_prefetch_multiplier=1,  # important for concurrency=1 workers
+    worker_prefetch_multiplier=1,
     task_acks_late=True,
 )

diff --git a/bookscraper/logbus/publisher.py b/bookscraper/logbus/publisher.py
index c09d6bd..faa9f2d 100644
--- a/bookscraper/logbus/publisher.py
+++ b/bookscraper/logbus/publisher.py
@@ -2,7 +2,7 @@
 import redis
 import os
 
-REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
 
 r = redis.Redis.from_url(REDIS_URL)

diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py
new file mode 100644
index 0000000..8b54254
--- /dev/null
+++ b/bookscraper/scraper/download_controller.py
@@ -0,0 +1,39 @@
+# scraper/download_controller.py
+
+from logbus.publisher import log
+from scraper.book_scraper import BookScraper
+from scraper.sites import BookSite
+from scraper.tasks.pipeline import build_chapter_pipeline
+
+
+class DownloadController:
+
+    def __init__(self, url: str):
+        self.url = url
+        self.scraper = None   # set by _init_scraper() in start()
+        self.base_path = None
+
+    def _init_scraper(self):
+        # Assumed construction, mirroring scraper/tasks/scraping.py;
+        # the original file calls _init_scraper() without defining it.
+        return BookScraper(BookSite(), self.url)
+
+    def start(self):
+        log(f"[DL-CONTROLLER] Parsing metadata for {self.url}")
+
+        # 1) Collect book info
+        scraper = self.scraper = self._init_scraper()
+        scraper.parse_book_info()
+
+        # Determine base_path
+        self.base_path = scraper.get_base_path()
+
+        # 2) Fetch the chapter list
+        chapters = scraper.get_chapter_list()
+
+        # 3) Start a Celery pipeline per chapter
+        for ch in chapters:
+            log(f"[DL-CONTROLLER] Queue pipeline for chapter {ch.number}")
+
+            workflow = build_chapter_pipeline(
+                chapter_number=ch.number,
+                chapter_url=ch.url,
+                base_path=self.base_path
+            )
+
+            workflow.delay()  # 🔥 this starts the chain
+
+        return {"status": "queued", "chapters": len(chapters)}

diff --git a/bookscraper/queue/__init__.py b/bookscraper/scraper/models/__init__.py
similarity index 100%
rename from bookscraper/queue/__init__.py
rename to bookscraper/scraper/models/__init__.py

diff --git a/bookscraper/tasks/__init__.py b/bookscraper/scraper/models/book_state.py
similarity index 100%
rename from bookscraper/tasks/__init__.py
rename to bookscraper/scraper/models/book_state.py
diff --git a/bookscraper/tasks/utils.py b/bookscraper/scraper/tasks/__init__.py
similarity index 100%
rename from bookscraper/tasks/utils.py
rename to bookscraper/scraper/tasks/__init__.py

diff --git a/bookscraper/tasks/audio.py b/bookscraper/scraper/tasks/audio.py
similarity index 100%
rename from bookscraper/tasks/audio.py
rename to bookscraper/scraper/tasks/audio.py

diff --git a/bookscraper/scraper/tasks/download_tasks.py b/bookscraper/scraper/tasks/download_tasks.py
new file mode 100644
index 0000000..a8e5bad
--- /dev/null
+++ b/bookscraper/scraper/tasks/download_tasks.py
@@ -0,0 +1,33 @@
+# scraper/tasks/download_tasks.py
+
+from celery import shared_task
+from logbus.publisher import log
+import requests
+
+
+@shared_task(bind=True, queue="download", ignore_result=False)
+def download_chapter(self, chapter_number: int, chapter_url: str):
+    """
+    Download a chapter page and return the raw HTML for parsing.
+    Does NOT save anything; that is done by save_tasks.py.
+    """
+
+    log(f"[DL] Downloading chapter {chapter_number}: {chapter_url}")
+
+    try:
+        resp = requests.get(chapter_url, timeout=15)
+        resp.raise_for_status()
+        html = resp.text
+
+        log(f"[DL] OK {chapter_number}: {len(html)} bytes")
+
+        # This result is passed on to parse_chapter
+        return {
+            "chapter": chapter_number,
+            "url": chapter_url,
+            "html": html,
+        }
+
+    except Exception as exc:
+        log(f"[DL] ERROR downloading {chapter_url}: {exc}")
+        raise
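+
+# Retry sketch (assumed; these are Celery's declarative retry options):
+#
+#   @shared_task(bind=True, queue="download", autoretry_for=(Exception,),
+#                retry_backoff=True, max_retries=5)
+#
+# would retry transient network errors with exponential backoff instead of
+# failing the whole chain on the first error. Note also the queue="download"
+# option: a worker must subscribe to these queues, e.g.
+#   celery -A celery_app worker -l info -Q scraping,download,parse,save,default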
diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py
new file mode 100644
index 0000000..03a01a8
--- /dev/null
+++ b/bookscraper/scraper/tasks/parse_tasks.py
@@ -0,0 +1,57 @@
+# scraper/tasks/parse_tasks.py
+
+from celery import shared_task
+from logbus.publisher import log
+from scraper.utils import clean_text
+from bs4 import BeautifulSoup
+
+
+@shared_task(bind=True, queue="parse", ignore_result=False)
+def parse_chapter(self, payload: dict):
+    """
+    Parse downloaded chapter HTML into clean text.
+    `payload` is the dict produced by download_chapter:
+        {"chapter": ..., "url": ..., "html": ...}
+    (The chain passes that dict as a single argument, so the original
+    (html, chapter_url) signature would raise a TypeError.)
+    Returns a dict:
+        {"url": chapter_url, "text": "...parsed text..."}
+    """
+    html = payload["html"]
+    chapter_url = payload["url"]
+
+    try:
+        log(f"[PARSE] Start parsing: {chapter_url}")
+
+        soup = BeautifulSoup(html, "html.parser")
+
+        # Many Chinese novel sites use containers like these:
+        possible_blocks = [
+            "#content",
+            ".content",
+            "div#content",
+            "div.content",
+            "div#chaptercontent",
+            "#chapterContent",
+        ]
+
+        node = None
+        for sel in possible_blocks:
+            r = soup.select_one(sel)
+            if r:
+                node = r
+                break
+
+        if not node:
+            log(f"[PARSE] WARNING: no known content block found in {chapter_url}")
+            text = clean_text(soup.get_text())
+        else:
+            text = clean_text(node.get_text())
+
+        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")
+
+        return {
+            "url": chapter_url,
+            "text": text,
+        }
+
+    except Exception as exc:
+        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
+        raise

diff --git a/bookscraper/scraper/tasks/pipeline.py b/bookscraper/scraper/tasks/pipeline.py
new file mode 100644
index 0000000..3a3c442
--- /dev/null
+++ b/bookscraper/scraper/tasks/pipeline.py
@@ -0,0 +1,28 @@
+# scraper/tasks/pipeline.py
+
+from celery import chain
+from logbus.publisher import log
+
+from scraper.tasks.download_tasks import download_chapter
+from scraper.tasks.parse_tasks import parse_chapter
+from scraper.tasks.save_tasks import save_chapter
+
+
+def build_chapter_pipeline(chapter_number: int, chapter_url: str, base_path: str):
+    """
+    Build a Celery pipeline for a single chapter:
+        download -> parse -> save
+    """
+
+    log(f"[PIPELINE] Building chain for chapter {chapter_number}")
+
+    # Data flow along the chain:
+    #   download_chapter returns {"chapter", "url", "html"}
+    #   parse_chapter takes that dict and returns {"url", "text"}
+    #   save_chapter takes that dict plus the bound base_path kwarg
+    workflow = chain(
+        download_chapter.s(chapter_number, chapter_url),
+        parse_chapter.s(),                  # receives the previous result dict
+        save_chapter.s(base_path=base_path)
+    )
+
+    return workflow

diff --git a/bookscraper/scraper/tasks/save_tasks.py b/bookscraper/scraper/tasks/save_tasks.py
new file mode 100644
index 0000000..cce44cf
--- /dev/null
+++ b/bookscraper/scraper/tasks/save_tasks.py
@@ -0,0 +1,57 @@
+# scraper/tasks/save_tasks.py
+
+import os
+import re
+
+from celery import shared_task
+from logbus.publisher import log
+
+
+@shared_task(bind=True, queue="save", ignore_result=False)
+def save_chapter(self, result: dict, base_path: str):
+    """
+    Save parsed chapter text to disk.
+    result = {
+        "url": ...,
+        "text": ...
+    }
+    """
+    text = result.get("text", "")
+    url = result.get("url")
+
+    try:
+        # Extract the chapter number from the URL,
+        # e.g. .../12345.html -> 12345
+        chapter_number = extract_chapter_number(url)
+
+        os.makedirs(base_path, exist_ok=True)
+
+        filename = f"{chapter_number:05d}.txt"
+        path = os.path.join(base_path, filename)
+
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        log(f"[SAVE] Saved chapter {chapter_number} → {path}")
+
+        return {"chapter": chapter_number, "path": path}
+
+    except Exception as exc:
+        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
+        raise
+
+
+def extract_chapter_number(url: str) -> int:
+    """
+    Utility extractor for chapter numbers from a URL.
+    Example: https://site.com/1234.html → 1234
+    """
+    try:
+        m = re.search(r"(\d+)\.html?", url)
+        if m:
+            return int(m.group(1))
+    except Exception:
+        pass
+    return 0
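+
+# End-to-end sketch of one chapter flowing through the chain built in
+# scraper/tasks/pipeline.py (URL and path are made-up examples):
+#
+#   download_chapter(1, "https://example.com/1.html")
+#       -> {"chapter": 1, "url": ".../1.html", "html": "<html>…"}
+#   parse_chapter(payload)
+#       -> {"url": ".../1.html", "text": "…"}
+#   save_chapter(result, base_path="output/book")
+#       -> {"chapter": 1, "path": "output/book/00001.txt"}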
diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py
new file mode 100644
index 0000000..2d435f8
--- /dev/null
+++ b/bookscraper/scraper/tasks/scraping.py
@@ -0,0 +1,35 @@
+from celery import shared_task
+from scraper.book_scraper import BookScraper
+from scraper.sites import BookSite
+from logbus.publisher import log
+
+
+@shared_task(bind=True, queue="scraping")
+def scrape_book(self, url):
+    """
+    HIGH-LEVEL SCRAPER TASK
+    Calls the synchronous BookScraper for a full scrape.
+    """
+    log(f"[SCRAPER] Start scrape: {url}")
+
+    scraper = BookScraper(BookSite(), url)
+    result = scraper.execute()
+
+    log(f"[SCRAPER] Finished scrape: {url}")
+    return {"title": result["title"]}
+
+
+@shared_task(bind=True, queue="download", max_retries=5)
+def download_chapter_task(self, number, title, url, output_base):
+    """
+    Download a single chapter only.
+    worker/download_worker.py ultimately executes this.
+    """
+    # Imported lazily so this module stays importable without the worker code
+    from worker.download_worker import download_single_chapter
+
+    try:
+        return download_single_chapter(number, title, url, output_base)
+
+    except Exception as e:
+        log(f"[DOWNLOAD] Error while downloading chapter {number}: {e}")
+        raise self.retry(countdown=3)

diff --git a/bookscraper/scraper/tasks/utils.py b/bookscraper/scraper/tasks/utils.py
new file mode 100644
index 0000000..e69de29

diff --git a/bookscraper/taskqueue/__init__.py b/bookscraper/taskqueue/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/bookscraper/queue/add_audio.py b/bookscraper/taskqueue/add_audio.py
similarity index 100%
rename from bookscraper/queue/add_audio.py
rename to bookscraper/taskqueue/add_audio.py

diff --git a/bookscraper/queue/add_book.py b/bookscraper/taskqueue/add_book.py
similarity index 100%
rename from bookscraper/queue/add_book.py
rename to bookscraper/taskqueue/add_book.py

diff --git a/bookscraper/queue/models.py b/bookscraper/taskqueue/models.py
similarity index 100%
rename from bookscraper/queue/models.py
rename to bookscraper/taskqueue/models.py

diff --git a/bookscraper/tasks/pipeline.py b/bookscraper/tasks/pipeline.py
deleted file mode 100644
index 28ef012..0000000
--- a/bookscraper/tasks/pipeline.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# tasks/pipeline.py
-from celery import shared_task
-from tasks.scraping import scrape_book
-from tasks.audio import text_to_audio
-
-
-@shared_task(bind=True)
-def scrape_and_convert(self, url):
-    result = scrape_book.delay(url)
-    return result.id

diff --git a/bookscraper/tasks/scraping.py b/bookscraper/tasks/scraping.py
deleted file mode 100644
index 0b742ac..0000000
--- a/bookscraper/tasks/scraping.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# tasks/scraping.py
-from celery import shared_task
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-from logbus.publisher import log
-
-
-@shared_task(bind=True, queue="scraping")
-def scrape_book(self, url):
-    log(f"START scraping: {url}")
-
-    site = BookSite()
-    scraper = BookScraper(site, url)
-    result = scraper.execute()
-
-    log(f"FINISHED scraping: {url}")
-    return {"title": result["title"]}

diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html
index 7cb4612..a8a4b76 100644
--- a/bookscraper/templates/index.html
+++ b/bookscraper/templates/index.html
@@ -1,40 +1,34 @@
[The HTML markup of this hunk did not survive extraction. Recoverable content:
the old page (a "BookScraper" heading, a "Realtime log:" panel, and the
start/stream buttons for the SSE log) is replaced by a minimal "BookScraper
WebGUI" page with a single form that POSTs a book URL to /start.]
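+<!-- A minimal form consistent with app.py's /start route; a sketch of the
+     lost markup, not necessarily the author's exact template: -->
+<form method="post" action="/start">
+  <input type="text" name="url" placeholder="Book URL" required>
+  <button type="submit">Start</button>
+</form>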
diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html
index a1e0a56..bbf4082 100644
--- a/bookscraper/templates/result.html
+++ b/bookscraper/templates/result.html
@@ -1,18 +1,63 @@
[Most markup in this hunk did not survive extraction. Removed: the old
"Scrape Done" page ("Scrape Complete", "Book title: {{ title }}",
"Output folder: {{ basepath }}", a "Scrape another book" link). The surviving
new template logic, with labels translated from Dutch:]
+← Back
+{% if error %}
+  Error: {{ error }}
+{% endif %}
+Scrape result
+{% if result %}
+  Title: {{ result.title }}
+  Author: {{ result.author }}
+  {% if result.description %}
+    Description: {{ result.description }}
+  {% endif %}
+  Number of chapters: {{ result.chapters|length }}
+  {% if result.chapters %}
+    Chapters: …
+  {% endif %}
+{% endif %}
+{# NOTE: DownloadController.start() currently returns
+   {"status": "queued", "chapters": <int>}, so title/author/description are
+   undefined here and result.chapters|length fails on an int; the template
+   and the controller's return value still need to be reconciled. #}

diff --git a/bookscraper/worker/download_worker.py b/bookscraper/worker/download_worker.py
new file mode 100644
index 0000000..ab1578e
--- /dev/null
+++ b/bookscraper/worker/download_worker.py
@@ -0,0 +1,11 @@
+# worker/download_worker.py
+
+from worker.downloader import ChapterDownloader
+
+
+def download_single_chapter(number, title, url, output_base):
+    """
+    Called by the Celery task tasks.scraping.download_chapter_task.
+    """
+    dl = ChapterDownloader()
+    return dl.download(number, title, url, output_base)

diff --git a/bookscraper/worker/downloader.py b/bookscraper/worker/downloader.py
new file mode 100644
index 0000000..6ad8378
--- /dev/null
+++ b/bookscraper/worker/downloader.py
@@ -0,0 +1,139 @@
+# worker/downloader.py
+
+import os
+import time
+
+import requests
+from bs4 import BeautifulSoup
+
+from scraper.logger import log_debug
+from scraper.utils import clean_text
+
+
+class ChapterDownloader:
+    """
+    Worker-side chapter downloader.
+    - No metadata scraping
+    - No BookScraper dependency
+    - Just: GET → parse → text → save
+    """
+
+    def __init__(self, min_delay=1.0):
+        self.min_delay = min_delay
+        self._last_download_time = 0
+
+    # ------------------------------------------------------------
+    def throttle(self):
+        now = time.time()
+        elapsed = now - self._last_download_time
+
+        if elapsed < self.min_delay:
+            time.sleep(self.min_delay - elapsed)
+
+        self._last_download_time = time.time()
+
+    # ------------------------------------------------------------
+    def get_doc_with_retry(self, url):
+        attempt = 1
+
+        while True:
+            self.throttle()
+            log_debug(f"[DL] GET {url} (attempt {attempt})")
+
+            try:
+                resp = requests.get(
+                    url,
+                    headers={"User-Agent": "Mozilla/5.0"},
+                    timeout=10,
+                )
+            except Exception as e:
+                log_debug(f"[DL] Network error {e} → retry")
+                attempt += 1
+                time.sleep(2)
+                continue
+
+            code = resp.status_code
+
+            if code == 200:
+                resp.encoding = "utf-8"
+                return BeautifulSoup(resp.text, "lxml")
+
+            if code == 429:
+                log_debug("[DL] 429 cooldown 60s")
+                time.sleep(60)
+                attempt += 1
+                continue
+
+            if code in (403, 500):
+                log_debug(f"[DL] HTTP {code} → retry")
+                time.sleep(5)
+                attempt += 1
+                continue
+
+            log_debug(f"[DL] Unexpected HTTP {code}")
+            time.sleep(3)
+            attempt += 1
+
+    # ------------------------------------------------------------
+    def parse_chapter_text(self, soup):
+        """
+        Copy of BookScraper.parse_chapter_text,
+        BUT without dependencies on parse_title, parse_author, etc.
+        """
+        body = soup.body
+        if not body:
+            return ""
+
+        h1 = body.find("h1")
+        if not h1:
+            return ""
+
+        parts = []
+        collecting = False
+
+        for sib in h1.next_siblings:
+            # Only Tags carry a class attribute; NavigableStrings have no
+            # .get() (the original getattr(sib, "class", None) never matched).
+            classes = sib.get("class") if hasattr(sib, "get") else None
+            if classes == ["toplink"]:
+                continue
+            if classes == ["bottomlink"]:
+                break
+            if getattr(sib, "name", None) in ["script", "style"]:
+                continue
+
+            if not collecting:
+                if getattr(sib, "name", None) == "br":
+                    collecting = True
+                continue
+
+            text = sib.get_text("\n", strip=True) if hasattr(
+                sib, "get_text") else str(sib).strip()
+            if text:
+                parts.append(text)
+
+        raw = "\n".join(parts)
+        return clean_text(raw, {})
+
+    # ------------------------------------------------------------
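+    # Volume layout (mirrors BookScraper, per the docstring below):
+    # 200 chapters per volume, so chapter 1–200 → v1, 201–400 → v2, …
+    # volume = ((number - 1) // 200) + 1, e.g. ((200 - 1) // 200) + 1 = 1
+    # and ((201 - 1) // 200) + 1 = 2.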
+ """ + max_size = 200 + volume = ((number - 1) // max_size) + 1 + vdir = f"{output_base}/v{volume}" + + import os + os.makedirs(vdir, exist_ok=True) + + fname = f"{number:05d}_{title}.txt" + full = f"{vdir}/{fname}" + + with open(full, "w", encoding="utf-8") as f: + f.write(text) + + log_debug(f"[DL] Saved chapter {number}: {full}") + return full + + # ------------------------------------------------------------ + def download(self, number, title, url, output_base): + soup = self.get_doc_with_retry(url) + text = self.parse_chapter_text(soup) + return self.save_chapter(number, title, text, output_base)