bookscraper single thread

celery-integration
peter.fong 2 weeks ago
parent 158cb63d54
commit 3ed85d08e3

Dockerfile
@@ -0,0 +1,20 @@
FROM python:3.11-slim
# Pillow dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg62-turbo-dev zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the full application (as it currently is)
COPY . .
# Expose Flask port
EXPOSE 5000
# Use the project's own app.py as the entrypoint
CMD ["python", "app.py"]

README.md
@@ -0,0 +1,125 @@
# 📚 BookScraper — Web UI + Docker + Live Log Streaming
BookScraper is a modern, fully automated scraper for Chinese webnovels
such as **Piaotian / Piaotia**.
The project combines a powerful scraping engine with a pleasant web interface.
---
# 🔍 What does this project do?
BookScraper consists of the following main components:
---
## 1. 🧠 BookScraper Engine (Python)
This is the core of the project.
The engine:
- Reads a book's basic information (title, author, cover)
- Finds all chapter links
- Downloads each chapter with:
  - a **retry system**
  - **anti-429 backoff**
    (wait: `backoff * attempt + 1` second)
  - detection of empty chapters → automatic retries
- Applies text replacements (via `replacements.txt`)
- Stores chapters in an ordered layout
- Automatically splits long books into volumes (`v1/`, `v2/`, `v3/`…)

The engine is **resilient to rate limiting** by Piaotian and similar sites
and uses a **throttle (`MAX_DOWNLOADS_PER_SEC`)** to avoid being blocked; a simplified sketch of this throttle/backoff idea follows below.
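
For illustration only, here is a minimal sketch of how such a throttle and anti-429 backoff can be combined. The function and constant names are placeholders, not the engine's actual API:

```python
import time
import requests

MAX_DOWNLOADS_PER_SEC = 1.0                  # same idea as the .env setting
MIN_DELAY = 1.0 / MAX_DOWNLOADS_PER_SEC
_last_request = 0.0


def fetch_with_backoff(url, backoff=2.0, max_attempts=5):
    """Throttled GET with a simple anti-429 backoff (illustrative sketch)."""
    global _last_request
    for attempt in range(1, max_attempts + 1):
        # throttle: never exceed MAX_DOWNLOADS_PER_SEC
        wait = MIN_DELAY - (time.time() - _last_request)
        if wait > 0:
            time.sleep(wait)
        _last_request = time.time()

        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if resp.status_code == 429:
            # wait: backoff * attempt + 1 second, as described above
            time.sleep(backoff * attempt + 1)
            continue
        return resp
    raise RuntimeError(f"giving up on {url} after {max_attempts} attempts")
```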
---
## 2. 🌐 Flask Web Interface (UI)
The web interface offers:
- An input field for the book URL
- A button: **Run Scraper**
- Live feedback via **server-sent events (SSE)**

While scraping, you see real-time updates appear, such as:

```
[DEBUG] GET chapter 1123
[DEBUG] HTTP 429 → retry sleep 4.0s
[DEBUG] Saved chapter: output/xxx/01123_章名.txt
```

This makes it feel as if the scraper is working “live”.
---
## 3. 📡 Live Logging (SSE)
The logger captures all BookScraper messages and streams them
via `/stream` to the web interface.
This makes it ideal for keeping an eye on scraping without a console; a small client sketch is shown below.
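
The stream can also be followed outside the browser. A minimal client sketch (assuming the default port 5000) could look like this:

```python
import requests

# Minimal SSE client: print every log line pushed on /stream.
with requests.get("http://localhost:5000/stream", stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[len("data: "):])
```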
---
## 4. 🔧 Configuration via `.env`
To keep the project flexible, everything is configured via `.env`:
- Throttle (`MAX_DOWNLOADS_PER_SEC`)
- Debug mode (`FLASK_DEBUG`)
- `DRY_RUN` (only the first chapters)
- Volume size
- Host & port

The `.env` file is loaded automatically by Docker Compose and by Flask; an example is sketched below.
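
An example `.env` might look like this (illustrative values matching the defaults used in the code):

```
MAX_DOWNLOADS_PER_SEC=1
FLASK_DEBUG=0
DRY_RUN=1
MAX_VOL_SIZE=200
HOST=0.0.0.0
PORT=5000
```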
---
# 📦 Project structure

```
bookscraper/
├── scraper/
│   ├── book_scraper.py     # The scraper engine
│   ├── logger.py           # SSE logger
│   ├── sites.py            # Site configuration (selectors etc.)
│   ├── utils.py            # Helpers
│   └── ...
├── templates/
│   └── index.html          # UI
├── output/                 # Book results
├── app.py                  # Flask web server + endpoints
├── replacements.txt        # Text replacements
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
└── .env
```
---
# ▶️ Starting the project manually (WITHOUT Docker)
Make sure the dependencies are installed:

```bash
pip install -r requirements.txt
```

Start the Flask server:

```bash
python app.py
```

Then open:
👉 http://localhost:5000

# 🐳 Docker Build (without Compose)
Build and run manually:

```bash
docker build -t bookscraper .
docker run -p 5000:5000 --env-file .env bookscraper
```

Or with the output directory mounted:

```bash
docker run \
  -p 5000:5000 \
  --env-file .env \
  -v $(pwd)/output:/app/output \
  bookscraper
```
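
Since the project also ships a `docker-compose.yml`, the standard Compose workflow should work as well (assuming a recent Docker with the Compose plugin); with the provided Compose file the UI is then reachable on http://localhost:5050:

```bash
docker compose up --build
```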

app.py
@@ -1,53 +1,71 @@
# app.py
from flask import Flask, request, Response, render_template
import time
import queue
import os

from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from scraper.logger import add_listener, remove_listener, LOG_BUFFER

app = Flask(__name__)


@app.route("/")
def index():
    return render_template("index.html")


# ----------------------------------------------------------
# RUN SCRAPER
# ----------------------------------------------------------
@app.route("/run", methods=["POST"])
def run_scraper():
    data = request.json
    site = BookSite()
    scraper = BookScraper(site, data["url"])
    result = scraper.execute()
    return {
        "title": result["title"],
        "buffer": LOG_BUFFER.getvalue()
    }


# ----------------------------------------------------------
# REALTIME LOG STREAM (SSE)
# ----------------------------------------------------------
@app.route("/stream")
def stream():
    def event_stream():
        q = queue.Queue()

        # push BookScraper log lines into the SSE queue
        def listener(line):
            q.put(line)

        add_listener(listener)
        try:
            while True:
                msg = q.get()  # blocks until a log line arrives
                yield f"data: {msg}\n\n"
        except GeneratorExit:
            pass
        finally:
            remove_listener(listener)

    return Response(event_stream(), mimetype="text/event-stream")


# ----------------------------------------------------------
if __name__ == "__main__":
    debug = os.getenv("FLASK_DEBUG", "0") == "1"
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "5000"))
    app.run(debug=debug, host=host, port=port)

docker-compose.yml
@@ -0,0 +1,25 @@
version: "3.9"

services:
  bookscraper:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: bookscraper
    ports:
      - "5050:5000"

    # Mount everything the way you already work locally
    volumes:
      - .:/app                                # full project directory
      - /Users/peter/Desktop/books:/app/output

    # The existing .env is loaded automatically by Docker Compose
    env_file:
      - .env

    # Make sure Flask does NOT go into debug mode (your code decides this)
    environment:
      FLASK_ENV: "production"

    restart: unless-stopped

Binary file not shown.


scraper/book_scraper.py
@@ -6,18 +6,14 @@ from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO

from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements


class Chapter:
    def __init__(self, num, title, url):
        self.number = num
        self.title = title
        self.url = url
        self.text = ""
@@ -34,88 +30,135 @@ class BookScraper:
        self.cover_url = ""
        self.chapters = []
        self.base_path = None
        self.chapter_base = None

        # ENV
        self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
        self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
        self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
        self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
        self._last_download_time = 0

        # replacements.txt
        fp = os.path.join(os.getcwd(), "replacements.txt")
        extra = load_replacements(fp)
        self.site.replacements.update(extra)

        self.start_time = None
        self.total_chapters = 0
        self.volume_dirs = {}

    # ------------------------------------------------------------
    # RATE LIMITER
    # ------------------------------------------------------------
    def throttle(self):
        now = time.time()
        elapsed = now - self._last_download_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self._last_download_time = time.time()

    # ------------------------------------------------------------
    def execute(self):
        log_debug(f"Starting scraper for {self.url}")
        self.start_time = time.time()

        soup = self.get_doc_with_retry(self.url)
        self.parse_title(soup)
        self.parse_author(soup)
        self.parse_description(soup)
        self.parse_cover(soup)
        self.prepare_output_folder()

        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)
        self.prepare_volume_folders()

        if self.DRY_RUN:
            self.download_some(self.TEST_LIMIT)
        else:
            self.download_all()

        return {"title": self.book_title}

    # ------------------------------------------------------------
    # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        attempt = 1
        while True:
            self.throttle()
            log_debug(f"GET {url} (attempt {attempt})")
            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"Network error {e} → retry in {attempt + 1}s")
                time.sleep(attempt + 1)
                attempt += 1
                continue

            code = resp.status_code
            log_debug(f"HTTP {code} for {url}")

            # 429 → hard cooldown with countdown
            if code == 429:
                cooldown = 60
                log_debug(f"429 detected — cooldown {cooldown}s")
                for i in range(cooldown, 0, -1):
                    log_debug(f"429 cooldown… {i}s remaining")
                    time.sleep(1)
                attempt += 1
                continue

            # recoverable
            if code in (403, 500):
                wait = min(5 * attempt, 30)
                log_debug(f"HTTP {code} → retry in {wait}s")
                time.sleep(wait)
                attempt += 1
                continue

            if code == 200:
                resp.encoding = self.site.encoding
                return BeautifulSoup(resp.text, "lxml")

            # unexpected
            wait = attempt + 1
            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
            time.sleep(wait)
            attempt += 1

    # ------------------------------------------------------------
    def parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"Book title = {self.book_title}")
    def parse_author(self, soup):
        td = soup.find("td", string=lambda t: t and "作" in t)
        self.book_author = (
            td.get_text(strip=True).split("：")[1]
            if td and "：" in td.get_text()
            else "UnknownAuthor"
        )
        log_debug(f"Book author = {self.book_author}")
    def parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            log_debug("No description found")
            self.book_description = ""
            return
@@ -123,113 +166,210 @@ class BookScraper:
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        self.book_description = "\n".join(parts)
        log_debug(f"Description length = {len(self.book_description)}")

    # ------------------------------------------------------------
    def parse_cover(self, soup):
        cover = soup.find(
            "img", src=lambda v: v and "files/article/image" in v)
        if not cover:
            log_debug("Cover not found")
            return

        self.cover_url = urljoin(self.site.root, cover.get("src"))
        log_debug(f"Cover URL = {self.cover_url}")

    # ------------------------------------------------------------
    def prepare_output_folder(self):
        self.base_path = Path("output") / self.book_title / self.site.name
        self.base_path.mkdir(parents=True, exist_ok=True)

        if self.cover_url:
            self.download_cover()

    def download_cover(self):
        log_debug(f"Downloading cover: {self.cover_url}")
        resp = requests.get(
            self.cover_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        if resp.status_code != 200:
            return
        if "html" in resp.headers.get("Content-Type", ""):
            return
        try:
            img = Image.open(BytesIO(resp.content))
        except Exception:
            return
        img.save(self.base_path / "cover.jpg")
        log_debug("Cover saved")

    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        url = urljoin(self.site.root, href)

        parsed = urlparse(url)
        bp = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"

        return self.get_doc_with_retry(url)

    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        cont = soup.select_one(self.site.chapter_list_selector)
        items = cont.select("ul li a[href]")

        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1

        self.total_chapters = len(self.chapters)
        log_debug(f"Found {self.total_chapters} chapters")

    # ------------------------------------------------------------
    def prepare_volume_folders(self):
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        num_vols = (self.total_chapters + max_size - 1) // max_size

        for v in range(1, num_vols + 1):
            d = self.base_path / f"v{v}"
            d.mkdir(parents=True, exist_ok=True)
            self.volume_dirs[v] = d

    # ------------------------------------------------------------
    def download_all(self):
        for ch in self.chapters:
            self.download_chapter(ch)

    def download_some(self, limit):
        for ch in self.chapters[:limit]:
            self.download_chapter(ch)

    # ------------------------------------------------------------
    def download_chapter(self, ch):
        # Determine volume + filename
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        volume = ((ch.number - 1) // max_size) + 1
        vdir = self.volume_dirs.get(volume, self.base_path)

        expected_name = f"{ch.number:05d}_{ch.title}.txt"
        fname = vdir / expected_name
        expected_full_path = str(fname.resolve())

        # STRICT SKIP CHECK
        if fname.exists() and fname.is_file():
            actual_size = fname.stat().st_size

            # correct name?
            if fname.name == expected_name:
                expected_dir = str(vdir.resolve())
                actual_dir = str(fname.parent.resolve())

                if expected_dir == actual_dir:
                    if actual_size > 300:
                        log_debug(
                            f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
                            f"  Path: {expected_full_path}\n"
                            f"  Size: {actual_size} bytes"
                        )
                        return
                    else:
                        log_debug(
                            f"Existing file too small ({actual_size} bytes), redownloading: {expected_full_path}"
                        )
                else:
                    log_debug(
                        f"Directory mismatch for chapter {ch.number}, redownloading"
                    )
            else:
                log_debug(
                    f"Filename mismatch for chapter {ch.number}, redownloading\n"
                    f"  Expected: {expected_name}\n"
                    f"  Found: {fname.name}"
                )

        # PROGRESS INFO
        percent = (ch.number / self.total_chapters) * 100
        elapsed = time.time() - self.start_time
        avg_time = elapsed / max(ch.number - 1, 1)
        remaining = self.total_chapters - ch.number
        eta_seconds = max(0, remaining * avg_time)
        eta_min = int(eta_seconds // 60)
        eta_sec = int(eta_seconds % 60)

        log_debug(
            f"Fetching chapter {ch.number}/{self.total_chapters} "
            f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
            f"{ch.title}"
        )

        # RETRY EMPTY CONTENT
        attempt = 1
        while True:
            soup = self.get_doc_with_retry(ch.url)
            text = self.parse_chapter_text(soup)

            if text.strip():
                ch.text = text
                break

            wait = min(10 + attempt, 30)
            log_debug(f"Empty chapter → retry in {wait}s")
            time.sleep(wait)
            attempt += 1

        fname.write_text(ch.text, encoding="utf-8")
        log_debug(f"Saved chapter to v{volume}: {fname}")

        chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
        log_debug(f"Throttling {chapter_delay}s before next chapter")
        time.sleep(chapter_delay)

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        body = soup.body
        if not body:
            return ""

        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False

        for sib in h1.next_siblings:
            # skip the navigation links at the top, stop at the bottom links
            if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
                continue
            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
                break
            if getattr(sib, "name", None) in ["script", "style"]:
                continue
@@ -238,32 +378,14 @@ class BookScraper:
                collecting = True
                continue

            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        raw = clean_text(raw, self.site.replacements)
        return raw.strip()

scraper/logger.py
@@ -2,26 +2,72 @@
import logging
from io import StringIO

# In-memory buffer (for the final result)
LOG_BUFFER = StringIO()

# List of callbacks (SSE clients)
LISTENERS = []


def add_listener(callback):
    """Register an SSE listener callback."""
    LISTENERS.append(callback)


def remove_listener(callback):
    """Remove an SSE listener (on disconnect)."""
    if callback in LISTENERS:
        LISTENERS.remove(callback)


def broadcast(line):
    """Send a log line to all listeners."""
    for cb in LISTENERS[:]:
        try:
            cb(line)
        except Exception:
            LISTENERS.remove(cb)


def setup_logger():
    """Create a logger that writes to the console, the buffer and the SSE broadcast."""
    logger = logging.getLogger("bookscraper")
    logger.setLevel(logging.DEBUG)
    logger.handlers = []  # prevent duplicate handlers on reload

    # formatter
    fmt = logging.Formatter("[%(levelname)s] %(message)s")

    # console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(fmt)

    # buffer handler
    bh = logging.StreamHandler(LOG_BUFFER)
    bh.setLevel(logging.DEBUG)
    bh.setFormatter(fmt)

    # SSE handler
    class SSEHandler(logging.Handler):
        def emit(self, record):
            msg = self.format(record)
            broadcast(msg)

    sh = SSEHandler()
    sh.setLevel(logging.DEBUG)
    sh.setFormatter(fmt)

    logger.addHandler(ch)
    logger.addHandler(bh)
    logger.addHandler(sh)

    return logger


# Global logger
LOGGER = setup_logger()


def log_debug(msg):
    LOGGER.debug(msg)

scraper/sites.py
@@ -3,7 +3,7 @@ class BookSite:
        self.name = "piaotian"
        self.root = "https://www.ptwxz.com"
        self.chapter_list_selector = "div.centent"
        self.encoding = "GB18030"
        self.replacements = {
            "&nbsp;&nbsp;": "\n",
            "手机用户请访问http://m.piaotian.net": "",

templates/index.html
@@ -1,22 +1,40 @@
<!DOCTYPE html>
<html>
<head>
    <title>BookScraper</title>
    <style>
        body { font-family: Arial; padding:20px; }
        #log { background:#000; color:#0f0; padding:10px; height:400px; overflow:auto; white-space:pre-wrap; }
    </style>
</head>
<body>
    <h1>BookScraper</h1>

    <input id="url" type="text" placeholder="Book URL" style="width:400px">
    <button onclick="startScrape()">Start</button>

    <h2>Realtime log:</h2>
    <div id="log"></div>

    <script>
        function startScrape() {
            document.getElementById("log").innerHTML = "";

            const evtSource = new EventSource("/stream");
            evtSource.onmessage = function(e) {
                const logDiv = document.getElementById("log");
                logDiv.innerText += e.data + "\n";
                logDiv.scrollTop = logDiv.scrollHeight;
            };

            fetch("/run", {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ url: document.getElementById("url").value })
            });
        }
    </script>
</body>
</html>
