diff --git a/bookscraper/Dockerfile b/bookscraper/Dockerfile
new file mode 100644
index 0000000..5c0c780
--- /dev/null
+++ b/bookscraper/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+# Pillow dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libjpeg62-turbo-dev zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the full app (as it currently is)
+COPY . .
+
+# Expose Flask port
+EXPOSE 5000
+
+# Use your own app.py as the entrypoint
+CMD ["python", "app.py"]
diff --git a/bookscraper/README.md b/bookscraper/README.md
new file mode 100644
index 0000000..b3f96d4
--- /dev/null
+++ b/bookscraper/README.md
@@ -0,0 +1,125 @@
+# BookScraper – Web UI + Docker + Live Log Streaming
+
+BookScraper is a modern, fully automated scraper for Chinese webnovels
+such as **Piaotian / Piaotia**.
+The project combines a powerful scraping engine with a friendly web interface.
+
+---
+
+# What does this project do?
+
+BookScraper consists of four main parts:
+
+---
+
+## 1. BookScraper Engine (Python)
+
+This is the core of the project.
+The engine:
+
+- Reads a book's basic information (title, author, cover)
+- Finds all chapter links
+- Downloads every chapter with:
+  - a **retry system**
+  - **anti-429 backoff**
+    (wait: `backoff * attempt + 1 second`)
+  - detection of empty chapters → automatic retry
+- Applies text replacements (via `replacements.txt`)
+- Saves chapters in order
+- Automatically splits long books into volumes (`v1/`, `v2/`, `v3/`…)
+
+The engine is **resilient against rate limiting** by Piaotian and similar sites
+and uses a **throttle (`MAX_DOWNLOADS_PER_SEC`)** to avoid blocks.
+
+---
+
+## 2. Flask Web Interface (UI)
+
+The web interface provides:
+
+- An input field for the book URL
+- A button: **Run Scraper**
+- Live feedback via **server-sent events (SSE)**
+
+While scraping, you see real-time updates appear, such as:
+
+```
+[DEBUG] GET chapter 1123
+[DEBUG] HTTP 429 → retry sleep 4.0s
+[DEBUG] Saved chapter: output/xxx/01123_章名.txt
+```
+
+This makes it feel as if the scraper is working "live".
+
+---
+
+## 3. Live Logging (SSE)
+
+The logger captures all BookScraper messages and streams them
+via `/stream` to the web interface.
+This makes it ideal for keeping an eye on scraping without a console.
+
+---
+
+## 4. Configuration via `.env`
+
+To keep the project flexible, everything is configured through `.env`:
+
+- Throttle (`MAX_DOWNLOADS_PER_SEC`)
+- Debug mode (`FLASK_DEBUG`)
+- `DRY_RUN` (first chapters only)
+- Volume size
+- Host & port
+
+The `.env` file is loaded automatically by Docker Compose and by Flask.
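+
+A minimal example `.env` (the variable names are exactly those the code reads
+via `os.getenv`; the values shown are the code's built-in defaults):
+
+```env
+MAX_DOWNLOADS_PER_SEC=1
+CHAPTER_DELAY=2
+DRY_RUN=1
+TEST_LIMIT=10
+MAX_VOL_SIZE=200
+FLASK_DEBUG=0
+HOST=0.0.0.0
+PORT=5000
+```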
+
+---
+
+# Project structure
+
+```
+bookscraper/
+│
+├── scraper/
+│   ├── book_scraper.py     # The scraper engine
+│   ├── logger.py           # SSE logger
+│   ├── sites.py            # Site configuration (selectors etc.)
+│   ├── utils.py            # Helpers
+│   └── ...
+│
+├── templates/
+│   └── index.html          # UI
+│
+├── output/                 # Book results
+│
+├── app.py                  # Flask web server + endpoints
+├── replacements.txt        # Text replacements
+├── Dockerfile
+├── docker-compose.yml
+├── requirements.txt
+└── .env
+```
+
+---
+
+# Running the project manually (WITHOUT Docker)
+
+Make sure the dependencies are installed:
+
+```bash
+pip install -r requirements.txt
+```
+
+Start the Flask server:
+
+```bash
+python app.py
+```
+
+Then open:
+
+http://localhost:5000
+
+---
+
+# Docker build (without compose)
+
+Build manually:
+
+```bash
+docker build -t bookscraper .
+```
+
+Run it:
+
+```bash
+docker run -p 5000:5000 --env-file .env bookscraper
+```
+
+Or run it with the output folder mounted on the host:
+
+```bash
+docker run \
+  -p 5000:5000 \
+  --env-file .env \
+  -v $(pwd)/output:/app/output \
+  bookscraper
+```
diff --git a/bookscraper/app.py b/bookscraper/app.py
index 4f6d9a6..ed983c6 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -1,53 +1,71 @@
-from flask import Flask, request, render_template_string
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-import sys
+# app.py
+from flask import Flask, request, Response, render_template
+import time
+import queue
 import os
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from scraper.book_scraper import BookScraper
+from scraper.sites import BookSite
+from scraper.logger import add_listener, remove_listener, LOG_BUFFER
 
 app = Flask(__name__)
 
-# --- GET: toon formulier ---
-@app.route("/", methods=["GET"])
+@app.route("/")
 def index():
-    return render_template_string("""
-{{debug}}
-    """, title=result["title"], debug=result["debug"])
+ return {
+ "title": result["title"],
+ "buffer": LOG_BUFFER.getvalue()
+ }
+
+# ----------------------------------------------------------
+# REALTIME LOG STREAM (SSE)
+# ----------------------------------------------------------
+
+@app.route("/stream")
+def stream():
+
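+    # Each connected client gets its own Queue; the logger's broadcast()
+    # fans every log line out to all registered listeners.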
+ def event_stream():
+ q = queue.Queue()
+
+        # push log lines from BookScraper into the SSE queue
+ def listener(line):
+ q.put(line)
+
+ add_listener(listener)
+
+ try:
+ while True:
+                msg = q.get()  # blocks until a log line arrives
+                # one "data:" prefix per line: SSE payloads may not contain raw newlines
+                for part in msg.splitlines() or [""]:
+                    yield f"data: {part}\n\n"
+ except GeneratorExit:
+ pass
+ finally:
+ remove_listener(listener)
+
+ return Response(event_stream(), mimetype="text/event-stream")
+
+
+# ----------------------------------------------------------
if __name__ == "__main__":
- app.run(debug=True)
+ debug = os.getenv("FLASK_DEBUG", "0") == "1"
+ host = os.getenv("HOST", "0.0.0.0")
+ port = int(os.getenv("PORT", "5000"))
+
+ app.run(debug=debug, host=host, port=port)
diff --git a/bookscraper/docker-compose.yml b/bookscraper/docker-compose.yml
new file mode 100644
index 0000000..ade0a70
--- /dev/null
+++ b/bookscraper/docker-compose.yml
@@ -0,0 +1,25 @@
+version: "3.9"
+
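+# Start the stack with: docker compose up -d --build
+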
+services:
+ bookscraper:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ container_name: bookscraper
+ ports:
+ - "5050:5000"
+
+    # Mount everything the same way you already run it locally
+ volumes:
+      - .:/app              # full project folder
+ - /Users/peter/Desktop/books:/app/output
+
+    # The existing .env is loaded automatically by Docker Compose
+ env_file:
+ - .env
+
+    # Make sure Flask does NOT go into debug mode (your code decides this)
+ environment:
+ FLASK_ENV: "production"
+
+ restart: unless-stopped
diff --git a/bookscraper/output/ๅๆๅฌๅค/piaotian/cover.jpg b/bookscraper/output/ๅๆๅฌๅค/piaotian/cover.jpg
deleted file mode 100644
index 733afb4..0000000
Binary files a/bookscraper/output/ๅๆๅฌๅค/piaotian/cover.jpg and /dev/null differ
diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py
index 56b4f25..83b0348 100644
--- a/bookscraper/scraper/book_scraper.py
+++ b/bookscraper/scraper/book_scraper.py
@@ -6,18 +6,14 @@ from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO
-from dotenv import load_dotenv
-from scraper.logger import setup_logger, LOG_BUFFER
+from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
-load_dotenv()
-logger = setup_logger()
-
class Chapter:
- def __init__(self, number, title, url):
- self.number = number
+ def __init__(self, num, title, url):
+ self.number = num
self.title = title
self.url = url
self.text = ""
@@ -34,88 +30,135 @@ class BookScraper:
self.cover_url = ""
self.chapters = []
- self.chapter_base = None
self.base_path = None
+ self.chapter_base = None
+
+ # ENV
+ self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
+ self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
+ self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
+ self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
+ self._last_download_time = 0
+
+ # replacements.txt
+ fp = os.path.join(os.getcwd(), "replacements.txt")
+ extra = load_replacements(fp)
+ self.site.replacements.update(extra)
+
+ self.start_time = None
+ self.total_chapters = 0
+ self.volume_dirs = {}
+
+ # ------------------------------------------------------------
+ # RATE LIMITER
+ # ------------------------------------------------------------
- # ENV settings
- self.DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
- self.TEST_CHAPTER_LIMIT = int(os.getenv("TEST_CHAPTER_LIMIT", "10"))
- self.MAX_VOL_SIZE = int(os.getenv("MAX_VOL_SIZE", "1500"))
- self.MAX_DL_PER_SEC = int(os.getenv("MAX_DL_PER_SEC", "2"))
+ def throttle(self):
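+        # enforce MAX_DOWNLOADS_PER_SEC: sleep until at least min_delay
+        # seconds have passed since the previous request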
+ now = time.time()
+ elapsed = now - self._last_download_time
- # Load text replacements
- self.replacements = load_replacements("replacements.txt")
+ if elapsed < self.min_delay:
+ time.sleep(self.min_delay - elapsed)
- # -----------------------------------------------------
+ self._last_download_time = time.time()
+
+ # ------------------------------------------------------------
def execute(self):
- LOG_BUFFER.seek(0)
- LOG_BUFFER.truncate(0)
+ log_debug(f"Starting scraper for {self.url}")
- logger.debug("Starting scraper for %s", self.url)
- soup = self.get_document(self.url)
+ self.start_time = time.time()
+ soup = self.get_doc_with_retry(self.url)
self.parse_title(soup)
self.parse_author(soup)
self.parse_description(soup)
self.parse_cover(soup)
+
self.prepare_output_folder()
chapter_page = self.get_chapter_page(soup)
self.parse_chapter_links(chapter_page)
+ self.prepare_volume_folders()
if self.DRY_RUN:
- logger.debug(
- "DRY RUN โ downloading only first %s chapters", self.TEST_CHAPTER_LIMIT)
- self.get_some_chapters(self.TEST_CHAPTER_LIMIT)
+ self.download_some(self.TEST_LIMIT)
else:
- self.get_all_chapters()
- self.split_into_volumes()
+ self.download_all()
+
+ return {"title": self.book_title}
+
+ # ------------------------------------------------------------
+ # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
+ # ------------------------------------------------------------
+ def get_doc_with_retry(self, url):
+ attempt = 1
+
+ while True:
+ self.throttle()
+ log_debug(f"GET {url} (attempt {attempt})")
+
+ try:
+ resp = requests.get(
+ url,
+ headers={"User-Agent": "Mozilla/5.0"},
+ timeout=10,
+ )
+ except Exception as e:
+ log_debug(f"Network error {e} โ retry in {attempt + 1}s")
+ time.sleep(attempt + 1)
+ attempt += 1
+ continue
- return {
- "title": self.book_title,
- "debug": LOG_BUFFER.getvalue()
- }
+ code = resp.status_code
+ log_debug(f"HTTP {code} for {url}")
+
+            # 429 → hard cooldown with countdown
+            if code == 429:
+                cooldown = 60
+                log_debug(f"429 detected → cooldown {cooldown}s")
+                for i in range(cooldown, 0, -1):
+                    log_debug(f"429 cooldown… {i}s remaining")
+                    time.sleep(1)
+                attempt += 1
+                continue
- # -----------------------------------------------------
- # NETWORK
- # -----------------------------------------------------
- def get_document(self, url):
- logger.debug("GET %s", url)
- time.sleep(1 / max(1, self.MAX_DL_PER_SEC))
+ # recoverable
+ if code in (403, 500):
+ wait = min(5 * attempt, 30)
+ log_debug(f"HTTP {code} โ retry in {wait}s")
+ time.sleep(wait)
+ attempt += 1
+ continue
- resp = requests.get(
- url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
- resp.encoding = self.site.encoding
+ if code == 200:
+ resp.encoding = self.site.encoding
+ return BeautifulSoup(resp.text, "lxml")
- logger.debug("HTTP %s for %s", resp.status_code, url)
- return BeautifulSoup(resp.text, "lxml")
+ # unexpected
+ wait = attempt + 1
+ log_debug(f"Unexpected HTTP {code} โ sleep {wait}s")
+ time.sleep(wait)
+ attempt += 1
- # -----------------------------------------------------
- # BASIC PARSERS (piaotia structure)
- # -----------------------------------------------------
+ # ------------------------------------------------------------
def parse_title(self, soup):
h1 = soup.find("h1")
- if h1:
- self.book_title = h1.get_text(strip=True)
- else:
- self.book_title = "UnknownTitle"
- logger.debug("Book title: %s", self.book_title)
+ self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
+ log_debug(f"Book title = {self.book_title}")
def parse_author(self, soup):
- td = soup.find("td", string=lambda t: t and "ไฝ" in t and "่
" in t)
- if td:
- raw = td.get_text(strip=True)
- if "๏ผ" in raw:
- self.book_author = raw.split("๏ผ", 1)[1].strip()
- else:
- self.book_author = "UnknownAuthor"
- else:
- self.book_author = "UnknownAuthor"
- logger.debug("Book author: %s", self.book_author)
+ td = soup.find("td", string=lambda t: t and "ไฝ" in t)
+ self.book_author = (
+            td.get_text(strip=True).split("：")[1]
+            if td and "：" in td.get_text()
+ else "UnknownAuthor"
+ )
+ log_debug(f"Book author = {self.book_author}")
def parse_description(self, soup):
span = soup.find("span", string=lambda t: t and "ๅ
ๅฎน็ฎไป" in t)
if not span:
+ log_debug("No description found")
self.book_description = ""
return
@@ -123,113 +166,210 @@ class BookScraper:
for sib in span.next_siblings:
if getattr(sib, "name", None) == "span":
break
- txt = sib.get_text(strip=True) if not isinstance(
- sib, str) else sib.strip()
- if txt:
- parts.append(txt)
+ text = (
+ sib.get_text(strip=True)
+ if hasattr(sib, "get_text")
+ else str(sib).strip()
+ )
+ if text:
+ parts.append(text)
self.book_description = "\n".join(parts)
- logger.debug("Description parsed (%s chars)",
- len(self.book_description))
+ log_debug(f"Description length = {len(self.book_description)}")
+ # ------------------------------------------------------------
def parse_cover(self, soup):
- selector = (
- "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table "
- "> tr:nth-of-type(4) > td:nth-of-type(1) > table > tr:nth-of-type(1) "
- "> td:nth-of-type(2) > a:nth-of-type(1) > img"
- )
- img = soup.select_one(selector)
- if img:
- self.cover_url = urljoin(self.site.root, img.get("src"))
- else:
- logger.debug("Cover not found!")
- logger.debug("Cover URL = %s", self.cover_url)
+ cover = soup.find(
+ "img", src=lambda v: v and "files/article/image" in v)
+ if not cover:
+ log_debug("Cover not found")
+ return
- # -----------------------------------------------------
+ self.cover_url = urljoin(self.site.root, cover.get("src"))
+ log_debug(f"Cover URL = {self.cover_url}")
+
+ # ------------------------------------------------------------
def prepare_output_folder(self):
- output_root = os.getenv("OUTPUT_DIR", "./output")
- self.base_path = Path(output_root) / self.book_title / self.site.name
+ self.base_path = Path("output") / self.book_title / self.site.name
self.base_path.mkdir(parents=True, exist_ok=True)
- logger.debug("Output directory: %s", self.base_path)
if self.cover_url:
- self.save_image(self.cover_url, self.base_path / "cover.jpg")
+ self.download_cover()
+
+ def download_cover(self):
+ log_debug(f"Downloading cover: {self.cover_url}")
- def save_image(self, url, path):
- logger.debug("Downloading cover: %s", url)
resp = requests.get(
- url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
- if resp.status_code == 200:
+ self.cover_url,
+ headers={"User-Agent": "Mozilla/5.0"},
+ timeout=10,
+ )
+
+ if resp.status_code != 200:
+ return
+
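+        # an "html" content type means the server sent an error page, not an image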
+ if "html" in resp.headers.get("Content-Type", ""):
+ return
+
+ try:
img = Image.open(BytesIO(resp.content))
- img.save(path)
- logger.debug("Cover saved to %s", path)
+        except Exception:  # body was not a decodable image
+ return
+
+ img.save(self.base_path / "cover.jpg")
+ log_debug("Cover saved")
- # -----------------------------------------------------
- # CHAPTER PAGE
- # -----------------------------------------------------
+ # ------------------------------------------------------------
def get_chapter_page(self, soup):
node = soup.select_one(
- "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table")
- link = node.select_one("a")
- href = link.get("href")
- chapter_url = urljoin(self.site.root, href)
-
- parsed = urlparse(chapter_url)
- base = parsed.path.rsplit("/", 1)[0] + "/"
- self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{base}"
+ "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
+ )
+ href = node.select_one("a").get("href")
+ url = urljoin(self.site.root, href)
- logger.debug("Chapter index URL = %s", chapter_url)
- logger.debug("CHAPTER_BASE = %s", self.chapter_base)
+ parsed = urlparse(url)
+ bp = parsed.path.rsplit("/", 1)[0] + "/"
+ self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"
- return self.get_document(chapter_url)
+ return self.get_doc_with_retry(url)
+ # ------------------------------------------------------------
def parse_chapter_links(self, soup):
- container = soup.select_one("div.centent")
- links = container.select("ul li a[href]")
+ cont = soup.select_one(self.site.chapter_list_selector)
+ items = cont.select("ul li a[href]")
- for i, a in enumerate(links, 1):
+ self.chapters = []
+ idx = 1
+ for a in items:
href = a.get("href")
if not href.endswith(".html"):
continue
-
- abs_url = urljoin(self.chapter_base, href)
title = a.get_text(strip=True)
- self.chapters.append(Chapter(i, title, abs_url))
+ full = urljoin(self.chapter_base, href)
+ self.chapters.append(Chapter(idx, title, full))
+ idx += 1
+
+ self.total_chapters = len(self.chapters)
+ log_debug(f"Found {self.total_chapters} chapters")
+
+ # ------------------------------------------------------------
+ def prepare_volume_folders(self):
+ max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
+ num_vols = (self.total_chapters + max_size - 1) // max_size
- logger.debug("Total chapters: %s", len(self.chapters))
+ for v in range(1, num_vols + 1):
+ d = self.base_path / f"v{v}"
+ d.mkdir(parents=True, exist_ok=True)
+ self.volume_dirs[v] = d
- # -----------------------------------------------------
- # DOWNLOAD CHAPTERS
- # -----------------------------------------------------
- def get_all_chapters(self):
+ # ------------------------------------------------------------
+ def download_all(self):
for ch in self.chapters:
- ch.text = self.fetch_chapter(ch)
- logger.debug("CH %s length = %s", ch.number, len(ch.text))
+ self.download_chapter(ch)
- def get_some_chapters(self, limit):
+ def download_some(self, limit):
for ch in self.chapters[:limit]:
- ch.text = self.fetch_chapter(ch)
- filename = self.base_path / f"{ch.number:05d}_{ch.title}.txt"
- filename.write_text(ch.text, encoding="utf-8")
- logger.debug("Saved test chapter: %s", filename)
+ self.download_chapter(ch)
+
+ # ------------------------------------------------------------
+ def download_chapter(self, ch):
+ # Determine volume + filename
+ max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
+ volume = ((ch.number - 1) // max_size) + 1
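+        # with MAX_VOL_SIZE=200: chapters 1-200 -> v1, 201-400 -> v2, ...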
+ vdir = self.volume_dirs.get(volume, self.base_path)
+
+ expected_name = f"{ch.number:05d}_{ch.title}.txt"
+ fname = vdir / expected_name
+ expected_full_path = str(fname.resolve())
+
+ # STRICT SKIP CHECK
+ if fname.exists() and fname.is_file():
+ actual_size = fname.stat().st_size
+
+ # correct name?
+ if fname.name == expected_name:
+ expected_dir = str(vdir.resolve())
+ actual_dir = str(fname.parent.resolve())
+
+ if expected_dir == actual_dir:
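+                    # heuristic: anything of 300 bytes or fewer counts as a failed/empty download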
+ if actual_size > 300:
+ log_debug(
+ f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
+ f" Path: {expected_full_path}\n"
+ f" Size: {actual_size} bytes"
+ )
+ return
+ else:
+ log_debug(
+ f"Existing file too small ({actual_size} bytes), redownloading: {expected_full_path}"
+ )
+ else:
+ log_debug(
+ f"Directory mismatch for chapter {ch.number}, redownloading"
+ )
+ else:
+ log_debug(
+ f"Filename mismatch for chapter {ch.number}, redownloading\n"
+ f" Expected: {expected_name}\n"
+ f" Found: {fname.name}"
+ )
+
+ # PROGRESS INFO
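+        # ETA = chapters remaining * average seconds per finished chapter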
+ percent = (ch.number / self.total_chapters) * 100
+ elapsed = time.time() - self.start_time
+ avg_time = elapsed / max(ch.number - 1, 1)
+ remaining = self.total_chapters - ch.number
+ eta_seconds = max(0, remaining * avg_time)
+
+ eta_min = int(eta_seconds // 60)
+ eta_sec = int(eta_seconds % 60)
+
+ log_debug(
+ f"Fetching chapter {ch.number}/{self.total_chapters} "
+ f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
+ f"{ch.title}"
+ )
+
+ # RETRY EMPTY CONTENT
+ attempt = 1
+ while True:
+ soup = self.get_doc_with_retry(ch.url)
+ text = self.parse_chapter_text(soup)
- def fetch_chapter(self, ch):
- soup = self.get_document(ch.url)
- text = self.parse_chapter_text(soup)
- return clean_text(text, self.replacements)
+ if text.strip():
+ ch.text = text
+ break
+
+ wait = min(10 + attempt, 30)
+ log_debug(f"Empty chapter โ retry in {wait}s")
+ time.sleep(wait)
+ attempt += 1
+
+ fname.write_text(ch.text, encoding="utf-8")
+ log_debug(f"Saved chapter to v{volume}: {fname}")
+ chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
+ log_debug(f"Throttling {chapter_delay}s before next chapter")
+ time.sleep(chapter_delay)
+
+ # ------------------------------------------------------------
def parse_chapter_text(self, soup):
body = soup.body
+ if not body:
+ return ""
+
h1 = body.find("h1")
+ if not h1:
+ return ""
parts = []
collecting = False
for sib in h1.next_siblings:
- if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
- break
- if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
+ if getattr(sib, "class", None) == ["toplink"]:
continue
+ if getattr(sib, "class", None) == ["bottomlink"]:
+ break
if getattr(sib, "name", None) in ["script", "style"]:
continue
@@ -238,32 +378,14 @@ class BookScraper:
collecting = True
continue
- txt = sib.strip() if isinstance(sib, str) else sib.get_text("\n", strip=True)
- if txt:
- parts.append(txt)
-
- return "\n".join(parts).strip()
-
- # -----------------------------------------------------
- # SPLIT VOLUMES
- # -----------------------------------------------------
- def split_into_volumes(self):
- logger.debug(
- "Splitting into volumes (max %s chapters per volume)", self.MAX_VOL_SIZE)
-
- chapters = len(self.chapters)
- volume = 1
- index = 0
-
- while index < chapters:
- chunk = self.chapters[index:index + self.MAX_VOL_SIZE]
- volume_dir = self.base_path / f"v{volume}"
- volume_dir.mkdir(exist_ok=True)
-
- for ch in chunk:
- filename = volume_dir / f"{ch.number:05d}_{ch.title}.txt"
- filename.write_text(ch.text, encoding="utf-8")
-
- logger.debug("Volume %s saved (%s chapters)", volume, len(chunk))
- volume += 1
- index += self.MAX_VOL_SIZE
+ text = (
+ sib.get_text("\n", strip=True)
+ if hasattr(sib, "get_text")
+ else str(sib).strip()
+ )
+ if text:
+ parts.append(text)
+
+ raw = "\n".join(parts)
+ raw = clean_text(raw, self.site.replacements)
+ return raw.strip()
diff --git a/bookscraper/scraper/logger.py b/bookscraper/scraper/logger.py
index f70d0d5..e0f28f1 100644
--- a/bookscraper/scraper/logger.py
+++ b/bookscraper/scraper/logger.py
@@ -2,26 +2,72 @@
import logging
from io import StringIO
-# In-memory buffer returned to web UI
+# In-memory buffer (for the final result)
LOG_BUFFER = StringIO()
+# List of callbacks (SSE clients)
+LISTENERS = []
+
+
+def add_listener(callback):
+ """Registreer een SSE listener callback."""
+ LISTENERS.append(callback)
+
+
+def remove_listener(callback):
+ """Verwijder SSE listener (bij disconnect)."""
+ if callback in LISTENERS:
+ LISTENERS.remove(callback)
+
+
+def broadcast(line):
+ """Stuur logregel naar alle listeners."""
+ for cb in LISTENERS[:]:
+ try:
+ cb(line)
+ except Exception:
+ LISTENERS.remove(cb)
+
def setup_logger():
+ """Creรซer logger die naar console, buffer รฉn SSE broadcast."""
logger = logging.getLogger("bookscraper")
logger.setLevel(logging.DEBUG)
- logger.handlers = [] # voorkomen dubbele handlers bij reload
+ logger.handlers = []
- # Console handler
+ # formatter
+ fmt = logging.Formatter("[%(levelname)s] %(message)s")
+
+ # console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
- ch.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+ ch.setFormatter(fmt)
+
+ # buffer handler
+ bh = logging.StreamHandler(LOG_BUFFER)
+ bh.setLevel(logging.DEBUG)
+ bh.setFormatter(fmt)
- # Buffer handler for returning to UI
- mh = logging.StreamHandler(LOG_BUFFER)
- mh.setLevel(logging.DEBUG)
- mh.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+ # SSE handler
+ class SSEHandler(logging.Handler):
+ def emit(self, record):
+ msg = self.format(record)
+ broadcast(msg)
+
+ sh = SSEHandler()
+ sh.setLevel(logging.DEBUG)
+ sh.setFormatter(fmt)
logger.addHandler(ch)
- logger.addHandler(mh)
+ logger.addHandler(bh)
+ logger.addHandler(sh)
return logger
+
+
+# Global logger
+LOGGER = setup_logger()
+
+
+def log_debug(msg):
+ LOGGER.debug(msg)
diff --git a/bookscraper/scraper/sites.py b/bookscraper/scraper/sites.py
index 89d3451..51023dc 100644
--- a/bookscraper/scraper/sites.py
+++ b/bookscraper/scraper/sites.py
@@ -3,7 +3,7 @@ class BookSite:
self.name = "piaotian"
self.root = "https://www.ptwxz.com"
self.chapter_list_selector = "div.centent"
- self.encoding = "gb2312"
+ self.encoding = "GB18030"
self.replacements = {
" ": "\n",
"ๆๆบ็จๆท่ฏท่ฎฟ้ฎhttp://m.piaotian.net": "",
diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html
index 03526d9..7cb4612 100644
--- a/bookscraper/templates/index.html
+++ b/bookscraper/templates/index.html
@@ -1,22 +1,40 @@
-    {{ error }}
-{% endif %}
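
The body of the new template did not survive in this diff; only the fragments above remain. Below is a minimal sketch of the SSE wiring such a page needs. The markup and element IDs are illustrative assumptions, not the contents of the actual index.html:

```html
<!-- hypothetical sketch, not the real template -->
<pre id="log"></pre>
<script>
  // subscribe to the Flask /stream endpoint and append each incoming log line
  const logEl = document.getElementById("log");
  const source = new EventSource("/stream");
  source.onmessage = (event) => {
    logEl.textContent += event.data + "\n";
    logEl.scrollTop = logEl.scrollHeight;  // keep the newest line visible
  };
</script>
```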
- {{ error }}
-{% endif %} + + - +