Refactor: new Celery pipeline + scraper/tasks architecture

celery-integration
peter.fong 2 weeks ago
parent fe4ed78802
commit 8e2d3cec49

@ -0,0 +1 @@
output/

@ -0,0 +1,6 @@
# ChatGPT Project Context Bookscraper / Celery Branch

## 1. Scraper Status (STABLE, do NOT modify without permission)

The Python-based bookscraper works fully end-to-end. The following parts are tested and stable and may not be rewritten or cleaned up without explicit permission:

### prepare_scripts()

Generates 3 shell scripts in the output folder:

- say.txt: TTS only (Sinji voice, timestamps)
- makebook.txt: m4b merge + move only
- allinone.txt: TTS + merge + move

### Volume structure

output/<book>/<site>/v1, v2, v3, …

### Chapter output

- Chapter 1 contains a header
- All other chapters contain text only

### Working functionality

- Rate limiter
- Chapter parsing
- Description parsing
- Cover download
- Skip logic
- prepare_scripts()

## 2. Development rules for ChatGPT

- No silent rewrites
- No cleanup of working code
- No restructuring without permission
- Changes are minimal and targeted
- Diff/patch style
- State exactly which files are touched
- Preserve the directory structure

## 3. Focus area (celery_branch)

- Improve the Celery worker architecture
- Queueing & retry policies
- Stability & observability
- Integration with scraping tasks
- Download functionality first; audio later

## 4. Environment

- VS Code Dev Containers
- Docker Compose present
- Celery + Redis in use
- Redis via the host machine: redis://host.docker.internal:6379

## 5. Download Worker (NEW, working)

- download_worker.py fully operational
- Tasks are registered correctly:
  - tasks.scraping.download_chapter_task
  - tasks.scraping.download_chapter
  - tasks.scraping.scrape_book
- Redis broker/backend correctly loaded from .env
- The worker picks up tasks and executes them
- The download pipeline works end-to-end

## 6. Logbus Status (NEW)

- Logbus now uses REDIS_BACKEND or REDIS_BROKER
- No more crashes
- Logging is non-blocking

## 7. Still to build (download-only phase)

- DownloadController for bulk downloads
- Flask API endpoint for downloads
- scrape_worker / audio_worker later

## 8. Project tree (summary)

scraper/
tasks/
worker/
logbus/
app.py
docker-compose.yml

The OUTPUT structure is preserved.
find . \
-path "./output" -prune -o \
-path "*/__pycache__" -prune -o \
-print | sed -e 's;[^/]*/; |;g;s;|;|--;'

@ -1,64 +1,52 @@
# app.py
from flask import Flask, request, Response, render_template
import time
import queue
import redis
import os
# ============================================
# File: bookscraper/app.py
# Ensure project directory is on PYTHONPATH
# ============================================
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from scraper.logger import add_listener, remove_listener, LOG_BUFFER
app = Flask(__name__)
r = redis.Redis.from_url("redis://redis:6379/0")
from scraper.logger import log_debug
from scraper.download_controller import DownloadController
from flask import Flask, render_template, request
from dotenv import load_dotenv
import sys
from pathlib import Path
# Add this directory (bookscraper/) to Python import path
PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))
@app.route("/")
def index():
return render_template("index.html")
# Load .env BEFORE any Celery app is imported
load_dotenv()
# ----------------------------------------------------------
# RUN SCRAPER
# ----------------------------------------------------------
app = Flask(__name__)
@app.route("/run", methods=["POST"])
def run_scraper():
data = request.json
site = BookSite()
scraper = BookScraper(site, data["url"])
result = scraper.execute()
return {
"title": result["title"],
"buffer": LOG_BUFFER.getvalue()
}
@app.route("/", methods=["GET"])
def index():
return render_template("index.html")
# ----------------------------------------------------------
# REALTIME LOG STREAM (SSE)
# ----------------------------------------------------------
@app.route("/start", methods=["POST"])
def start_scraping():
url = request.form.get("url", "").strip()
@app.route("/stream")
def stream():
if not url:
return render_template("result.html", error="Geen URL opgegeven.")
def event_stream():
pub = r.pubsub()
pub.subscribe("logs")
try:
log_debug(f"[WEB] Start scraping: {url}")
for msg in pub.listen():
if msg["type"] == "message":
yield f"data: {msg['data'].decode()}\n\n"
ctl = DownloadController(url)
result = ctl.start()
return Response(event_stream(), mimetype="text/event-stream")
return render_template("result.html", result=result, url=url)
except Exception as e:
log_debug(f"[WEB] ERROR: {e}")
return render_template("result.html", error=str(e), url=url)
# ----------------------------------------------------------
if __name__ == "__main__":
import os
debug = os.getenv("FLASK_DEBUG", "0") == "1"
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "5000"))
app.run(debug=debug, host=host, port=port)
app.run(host="0.0.0.0", port=5000, debug=debug)
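For reference, a minimal smoke test of the new /start route. It reads a form field (request.form), not JSON, so a form-encoded POST is needed; host, port and book URL below are illustrative.

```python
# Hedged sketch: exercise the new /start route from a client.
# The route reads request.form.get("url"), so send form data, not JSON.
import requests

resp = requests.post(
    "http://localhost:5000/start",                   # host/port are illustrative
    data={"url": "https://example.com/book/12345"},  # placeholder book URL
)
print(resp.status_code, resp.text[:200])
```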

@ -0,0 +1,18 @@
from flask import Blueprint, request, jsonify
from scraper.download_controller import DownloadController
bp_download = Blueprint("download", __name__)
@bp_download.post("/api/download")
def api_download():
data = request.get_json() or {}
url = data.get("url")
if not url:
return jsonify({"error": "Missing URL"}), 400
ctl = DownloadController(url)
result = ctl.start()
return jsonify(result), 200
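The hunk adds the blueprint, but its registration is not visible in this diff. A sketch of how it could be wired into the Flask app; the import path routes.download is a placeholder, since the new file's location isn't shown here.

```python
# Hedged sketch: register the download blueprint with the Flask app.
# "routes.download" is a hypothetical module path; adjust to wherever bp_download lives.
from flask import Flask
from routes.download import bp_download

app = Flask(__name__)
app.register_blueprint(bp_download)

# Client side, once the app is running (illustrative):
#   requests.post("http://localhost:5000/api/download",
#                 json={"url": "https://example.com/book/12345"})
```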

@ -1,13 +1,36 @@
# celery_app.py
from celery import Celery
# ============================================
# File: bookscraper/celery_app.py
# ============================================
import os
from celery import Celery
from dotenv import load_dotenv
# Load environment variables (OK to do here)
load_dotenv()
print(">>> DEBUG: celery_app.py LOADED")
print(">>> DEBUG: env REDIS_BROKER =", os.getenv("REDIS_BROKER"))
print(">>> DEBUG: env REDIS_URL =", os.getenv("REDIS_URL"))
# Read broker settings
REDIS_BROKER = os.getenv("REDIS_BROKER")
REDIS_BACKEND = os.getenv("REDIS_BACKEND")
# Fallback ONLY if missing
if not REDIS_BROKER:
REDIS_BROKER = os.getenv(
"REDIS_URL", "redis://host.docker.internal:6379/0"
)
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
if not REDIS_BACKEND:
REDIS_BACKEND = REDIS_BROKER # safe fallback
# Create Celery app AFTER loading .env
celery_app = Celery(
"bookscraper",
broker=REDIS_URL,
backend=REDIS_URL
broker=REDIS_BROKER,
backend=REDIS_BACKEND,
)
celery_app.conf.update(
@ -15,7 +38,8 @@ celery_app.conf.update(
task_routes={
"tasks.scraping.*": {"queue": "scraping"},
"tasks.audio.*": {"queue": "audio"},
"tasks.*": {"queue": "default"},
},
worker_prefetch_multiplier=1, # important for concurrency=1 workers
worker_prefetch_multiplier=1,
task_acks_late=True,
)
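A worker has to consume both the routed queues (default, scraping, audio) and the per-task queues declared on the new tasks (download, parse, save). A minimal in-process sketch, assuming it is run from the project root so celery_app is importable; the actual worker entrypoints are not part of this hunk.

```python
# Hedged sketch: start one worker that listens on every queue used in this branch.
from celery_app import celery_app

if __name__ == "__main__":
    celery_app.worker_main(
        argv=[
            "worker",
            "--loglevel=INFO",
            "--concurrency=1",   # matches worker_prefetch_multiplier=1 above
            "-Q", "default,scraping,audio,download,parse,save",
        ]
    )
```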

@ -2,7 +2,7 @@
import redis
import os
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL)

@ -0,0 +1,39 @@
# scraper/download_controller.py
from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline
class DownloadController:
def __init__(self, url: str):
self.url = url
self.scraper = None # filled in by BookScraper
self.base_path = None
def start(self):
log(f"[DL-CONTROLLER] Parsing metadata for {self.url}")
# 1) Collect book info
scraper = self.scraper = self._init_scraper()
scraper.parse_book_info()
# Determine base_path
self.base_path = scraper.get_base_path()
# 2) Fetch the chapter list
chapters = scraper.get_chapter_list()
# 3) Start a Celery pipeline per chapter
for ch in chapters:
log(f"[DL-CONTROLLER] Queue pipeline for chapter {ch.number}")
workflow = build_chapter_pipeline(
chapter_number=ch.number,
chapter_url=ch.url,
base_path=self.base_path
)
workflow.delay() # this kicks off the chain
return {"status": "queued", "chapters": len(chapters)}

@ -0,0 +1,33 @@
# scraper/tasks/download_tasks.py
from celery import shared_task
from logbus.publisher import log
import requests
@shared_task(bind=True, queue="download", ignore_result=False)
def download_chapter(self, chapter_number: int, chapter_url: str):
"""
Download a chapter page and return raw HTML for parsing.
Does NOT save anything; that is done by save_tasks.py
"""
log(f"[DL] Downloading chapter {chapter_number}: {chapter_url}")
try:
resp = requests.get(chapter_url, timeout=15)
resp.raise_for_status()
html = resp.text
log(f"[DL] OK {chapter_number}: {len(html)} bytes")
# This result is passed on to the parse task
return {
"chapter": chapter_number,
"url": chapter_url,
"html": html,
}
except Exception as exc:
log(f"[DL] ERROR downloading {chapter_url}: {exc}")
raise
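download_chapter re-raises on failure without a retry policy, while the branch notes list queueing and retry policies as a focus area. One possible direction, not part of this commit, using Celery's built-in autoretry options; the task name and values are illustrative.

```python
# Hedged sketch: same download step, but with automatic retries and exponential backoff.
import requests
from celery import shared_task

@shared_task(
    bind=True,
    queue="download",
    ignore_result=False,
    autoretry_for=(requests.RequestException,),
    retry_backoff=True,                  # 1s, 2s, 4s, ... between attempts
    retry_kwargs={"max_retries": 5},
)
def download_chapter_with_retry(self, chapter_number: int, chapter_url: str):
    resp = requests.get(chapter_url, timeout=15)
    resp.raise_for_status()
    return {"chapter": chapter_number, "url": chapter_url, "html": resp.text}
```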

@ -0,0 +1,57 @@
# scraper/tasks/parse_tasks.py
from celery import shared_task
from logbus.publisher import log
from scraper.utils import clean_text
from bs4 import BeautifulSoup
@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, html: str, chapter_url: str):
"""
Parse downloaded chapter HTML into clean text.
Returns a dict:
{
"url": chapter_url,
"text": "...parsed text..."
}
"""
try:
log(f"[PARSE] Start parsing: {chapter_url}")
soup = BeautifulSoup(html, "html.parser")
# Many Chinese sites use content containers like these:
possible_blocks = [
"#content",
".content",
"div#content",
"div.content",
"div#chaptercontent",
"#chapterContent"
]
node = None
for sel in possible_blocks:
r = soup.select_one(sel)
if r:
node = r
break
if not node:
log(
f"[PARSE] WARNING: no known content block found in {chapter_url}")
text = clean_text(soup.get_text())
else:
text = clean_text(node.get_text())
log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")
return {
"url": chapter_url,
"text": text,
}
except Exception as exc:
log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
raise
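A stand-alone illustration of the selector fallback above, with hypothetical HTML, run outside Celery.

```python
# Hedged sketch: how the selector fallback picks a content node (sample HTML is made up).
from bs4 import BeautifulSoup

html = "<html><body><div id='chaptercontent'>第一章 ……</div></body></html>"
soup = BeautifulSoup(html, "html.parser")

node = None
for sel in ["#content", ".content", "div#content", "div.content",
            "div#chaptercontent", "#chapterContent"]:
    node = soup.select_one(sel)
    if node:
        break

print(node.get_text())   # -> 第一章 ……
```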

@ -0,0 +1,28 @@
# scraper/tasks/pipeline.py
from celery import chain
from logbus.publisher import log
from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter
def build_chapter_pipeline(chapter_number: int, chapter_url: str, base_path: str):
"""
Build a Celery pipeline for a single chapter:
download -> parse -> save
"""
log(f"[PIPELINE] Building chain for chapter {chapter_number}")
# Important: download_chapter returns a dict {chapter, url, html};
# parse_chapter takes that dict as its single payload argument (via s()),
# and save_chapter takes parse_chapter's result dict plus base_path.
workflow = chain(
download_chapter.s(chapter_number, chapter_url),
parse_chapter.s(), # takes previous result dict
save_chapter.s(base_path=base_path)
)
return workflow
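A hypothetical launch of one chapter pipeline; paths and URLs are illustrative, and fetching the result relies on the Redis result backend configured in celery_app.py.

```python
# Hedged sketch: queue one chain and wait for the final save_chapter result.
workflow = build_chapter_pipeline(
    chapter_number=1,
    chapter_url="https://example.com/book/1.html",
    base_path="output/MyBook/example/v1",
)
async_result = workflow.delay()        # AsyncResult of the last task in the chain
print(async_result.get(timeout=120))   # e.g. {"chapter": 1, "path": "output/.../00001.txt"}
```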

@ -0,0 +1,57 @@
# scraper/tasks/save_tasks.py
from celery import shared_task
from logbus.publisher import log
import os
@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, result: dict, base_path: str):
"""
Save parsed chapter text to disk.
result = {
"url": ...,
"text": ...
}
"""
try:
text = result.get("text", "")
url = result.get("url")
# Extract the chapter number from the URL
# e.g. .../12345.html ⇒ 12345
chapter_number = extract_chapter_number(url)
if not os.path.exists(base_path):
os.makedirs(base_path, exist_ok=True)
filename = f"{chapter_number:05d}.txt"
path = os.path.join(base_path, filename)
with open(path, "w", encoding="utf-8") as f:
f.write(text)
log(f"[SAVE] Saved chapter {chapter_number}{path}")
return {"chapter": chapter_number, "path": path}
except Exception as exc:
log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
raise
def extract_chapter_number(url: str) -> int:
"""
Utility extractor for chapter numbers from a URL.
Example: https://site.com/1234.html → 1234
"""
try:
import re
m = re.search(r'(\d+)\.html?', url)
if m:
return int(m.group(1))
except Exception:
pass
return 0
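A few illustrative checks of the URL helper's behaviour (URLs are made up).

```python
assert extract_chapter_number("https://site.com/1234.html") == 1234
assert extract_chapter_number("https://site.com/chapters/0007.htm") == 7
assert extract_chapter_number("https://site.com/no-number/") == 0   # fallback
```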

@ -0,0 +1,35 @@
from celery import shared_task
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from logbus.publisher import log
@shared_task(bind=True, queue="scraping")
def scrape_book(self, url):
"""
HIGH-LEVEL SCRAPER TASK
Calls the synchronous BookScraper for a full scrape.
"""
log(f"[SCRAPER] Start scrape: {url}")
scraper = BookScraper(BookSite(), url)
result = scraper.execute()
log(f"[SCRAPER] Finished scrape: {url}")
return {"title": result["title"]}
@shared_task(bind=True, queue="download", max_retries=5)
def download_chapter_task(self, number, title, url, output_base):
"""
Downloads just a single chapter.
download_worker.py ultimately executes this.
"""
from worker.download_worker import download_single_chapter
try:
return download_single_chapter(number, title, url, output_base)
except Exception as e:
log(f"[DOWNLOAD] Error while downloading chapter {number}: {e}")
raise self.retry(countdown=3)
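A hypothetical enqueue of the chapter task above; the queue name matches the decorator, and the argument values are illustrative.

```python
# Hedged sketch: push one chapter download onto the "download" queue.
download_chapter_task.apply_async(
    args=[12, "Chapter 12", "https://example.com/book/12.html", "output/MyBook/example"],
    queue="download",
)
```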

@ -1,10 +0,0 @@
# tasks/pipeline.py
from celery import shared_task
from tasks.scraping import scrape_book
from tasks.audio import text_to_audio
@shared_task(bind=True)
def scrape_and_convert(self, url):
result = scrape_book.delay(url)
return result.id

@ -1,17 +0,0 @@
# tasks/scraping.py
from celery import shared_task
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from logbus.publisher import log
@shared_task(bind=True, queue="scraping")
def scrape_book(self, url):
log(f"START scraping: {url}")
site = BookSite()
scraper = BookScraper(site, url)
result = scraper.execute()
log(f"FINISHED scraping: {url}")
return {"title": result["title"]}

@ -1,40 +1,34 @@
<!DOCTYPE html>
<html>
<html lang="nl">
<head>
<meta charset="UTF-8">
<title>BookScraper</title>
<style>
body { font-family: Arial; padding:20px; }
#log { background:#000; color:#0f0; padding:10px; height:400px; overflow:auto; white-space:pre-wrap; }
body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
h1 { margin-bottom: 20px; }
input[type="text"] {
width: 100%; padding: 12px; font-size: 16px;
border: 1px solid #ccc; border-radius: 6px;
}
button {
margin-top: 20px;
padding: 12px 20px;
background: #007bff; color: white;
border: none; border-radius: 6px;
font-size: 16px; cursor: pointer;
}
button:hover { background: #0056b3; }
</style>
</head>
<body>
<h1>BookScraper</h1>
<h1>BookScraper WebGUI</h1>
<input id="url" type="text" placeholder="Book URL" style="width:400px">
<button onclick="startScrape()">Start</button>
<h2>Realtime log:</h2>
<div id="log"></div>
<script>
function startScrape() {
document.getElementById("log").innerHTML = "";
const evtSource = new EventSource("/stream");
evtSource.onmessage = function(e) {
const logDiv = document.getElementById("log");
logDiv.innerText += e.data + "\n";
logDiv.scrollTop = logDiv.scrollHeight;
};
fetch("/run", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ url: document.getElementById("url").value })
});
}
</script>
<form action="/start" method="POST">
<label for="url">Geef een boek-URL op:</label><br><br>
<input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
<button type="submit">Start Scraping</button>
</form>
</body>
</html>

@ -1,18 +1,63 @@
<!DOCTYPE html>
<html>
<html lang="nl">
<head>
<title>Scrape Done</title>
<meta charset="UTF-8">
<title>Scrape Resultaat</title>
<style>
body { font-family: Arial, sans-serif; padding: 40px; max-width: 900px; margin: auto; }
h1 { margin-bottom: 10px; }
.error { padding: 15px; background: #ffdddd; border-left: 5px solid #ff4444; margin-bottom: 20px; }
.box { padding: 15px; background: #f7f7f7; border: 1px solid #ddd; margin-bottom: 20px; border-radius: 6px; }
a { color: #007bff; text-decoration: none; }
a:hover { text-decoration: underline; }
pre { background: #222; color: #eee; padding: 10px; border-radius: 6px; overflow-x: auto; }
small { color: #555; }
</style>
</head>
<body>
<h1>Scrape Complete</h1>
<a href="/">&larr; Terug</a>
<p><strong>Book title:</strong> {{ title }}</p>
{% if error %}
<div class="error">
<strong>Fout:</strong><br>{{ error }}
</div>
{% endif %}
<p><strong>Output folder:</strong></p>
<pre>{{ basepath }}</pre>
<h1>Scrape resultaat</h1>
<a href="/">Scrape another book</a>
{% if result %}
<div class="box">
<strong>Titel:</strong> {{ result.title }}<br>
<strong>Auteur:</strong> {{ result.author }}<br>
</div>
{% if result.description %}
<div class="box">
<strong>Beschrijving:</strong><br>
<p>{{ result.description }}</p>
</div>
{% endif %}
<div class="box">
<strong>Aantal chapters:</strong> {{ result.chapters|length }}
</div>
{% if result.chapters %}
<div class="box">
<strong>Chapters:</strong><br><br>
<ul>
{% for ch in result.chapters %}
<li>
<a href="{{ ch.url }}" target="_blank">
Chapter {{ ch.number }} — {{ ch.title }}
</a>
</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% endif %}
</body>
</html>

@ -0,0 +1,11 @@
# worker/download_worker.py
from worker.downloader import ChapterDownloader
def download_single_chapter(number, title, url, output_base):
"""
Called by Celery task.
"""
dl = ChapterDownloader()
return dl.download(number, title, url, output_base)

@ -0,0 +1,139 @@
# worker/downloader.py
import time
import requests
from io import BytesIO
from bs4 import BeautifulSoup
from scraper.logger import log_debug
from scraper.utils import clean_text
from urllib.parse import urljoin
class ChapterDownloader:
"""
Worker-side chapter downloader.
- No metadata scraping
- No BookScraper dependency
- Just: GET → parse text → save
"""
def __init__(self, min_delay=1.0):
self.min_delay = min_delay
self._last_download_time = 0
# ------------------------------------------------------------
def throttle(self):
now = time.time()
elapsed = now - self._last_download_time
if elapsed < self.min_delay:
time.sleep(self.min_delay - elapsed)
self._last_download_time = time.time()
# ------------------------------------------------------------
def get_doc_with_retry(self, url):
attempt = 1
while True:
self.throttle()
log_debug(f"[DL] GET {url} (attempt {attempt})")
try:
resp = requests.get(
url,
headers={"User-Agent": "Mozilla/5.0"},
timeout=10,
)
except Exception as e:
log_debug(f"[DL] Network error {e} → retry")
attempt += 1
time.sleep(2)
continue
code = resp.status_code
if code == 200:
resp.encoding = "utf-8"
return BeautifulSoup(resp.text, "lxml")
if code == 429:
log_debug("[DL] 429 cooldown 60s")
time.sleep(60)
attempt += 1
continue
if code in (403, 500):
log_debug(f"[DL] HTTP {code} → retry")
time.sleep(5)
attempt += 1
continue
log_debug(f"[DL] Unexpected HTTP {code}")
time.sleep(3)
attempt += 1
# ------------------------------------------------------------
def parse_chapter_text(self, soup):
"""
Copy of BookScraper.parse_chapter_text,
BUT without dependencies on parse_title, parse_author, etc.
"""
body = soup.body
if not body:
return ""
h1 = body.find("h1")
if not h1:
return ""
parts = []
collecting = False
for sib in h1.next_siblings:
if getattr(sib, "class", None) == ["toplink"]:
continue
if getattr(sib, "class", None) == ["bottomlink"]:
break
if getattr(sib, "name", None) in ["script", "style"]:
continue
if not collecting:
if getattr(sib, "name", None) == "br":
collecting = True
continue
text = sib.get_text("\n", strip=True) if hasattr(
sib, "get_text") else str(sib).strip()
if text:
parts.append(text)
raw = "\n".join(parts)
return clean_text(raw, {})
# ------------------------------------------------------------
def save_chapter(self, number, title, text, output_base):
"""
Save chapter using same volume logic as BookScraper.
"""
max_size = 200
volume = ((number - 1) // max_size) + 1
vdir = f"{output_base}/v{volume}"
import os
os.makedirs(vdir, exist_ok=True)
fname = f"{number:05d}_{title}.txt"
full = f"{vdir}/{fname}"
with open(full, "w", encoding="utf-8") as f:
f.write(text)
log_debug(f"[DL] Saved chapter {number}: {full}")
return full
# ------------------------------------------------------------
def download(self, number, title, url, output_base):
soup = self.get_doc_with_retry(url)
text = self.parse_chapter_text(soup)
return self.save_chapter(number, title, text, output_base)
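A stand-alone usage sketch of the worker-side downloader, without Celery; all values are illustrative.

```python
# Hedged sketch: download, parse and save one chapter directly.
from worker.downloader import ChapterDownloader

dl = ChapterDownloader(min_delay=1.0)
path = dl.download(
    number=12,
    title="chapter-12",
    url="https://example.com/book/12.html",
    output_base="output/MyBook/example",
)
print(path)   # output/MyBook/example/v1/00012_chapter-12.txt
```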