diff --git a/bookscraper/Dockerfile b/bookscraper/Dockerfile
new file mode 100644
index 0000000..5c0c780
--- /dev/null
+++ b/bookscraper/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+# Pillow dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libjpeg62-turbo-dev zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the full app (as it currently is)
+COPY . .
+
+# Expose Flask port
+EXPOSE 5000
+
+# Use your own app.py as the entrypoint
+CMD ["python", "app.py"]
diff --git a/bookscraper/README.md b/bookscraper/README.md
new file mode 100644
index 0000000..b3f96d4
--- /dev/null
+++ b/bookscraper/README.md
@@ -0,0 +1,125 @@
+# 📚 BookScraper - Web UI + Docker + Live Log Streaming
+
+BookScraper is a modern, fully automated scraper for Chinese web novels
+such as **Piaotian / Piaotia**.
+The project combines a powerful scraping engine with a pleasant web interface.
+
+---
+
+# 🔍 What does this project do?
+
+BookScraper consists of four main parts:
+
+---
+
+## 1. 🧠 BookScraper engine (Python)
+
+This is the core of the project.
+The engine:
+
+- Reads a book's basic information (title, author, cover)
+- Collects all chapter links
+- Downloads every chapter with:
+  - a **retry system**
+  - **anti-429 backoff**
+    (escalating waits of roughly `attempt + 1` seconds, plus a hard cooldown when HTTP 429 is returned)
+  - detection of empty chapters → automatic retry
+- Applies text replacements (via `replacements.txt`)
+- Stores chapters in an ordered layout
+- Automatically splits long books into volumes (`v1/`, `v2/`, `v3/`, ...)
+
+The engine is **resilient to rate limiting** by Piaotian and similar sites
+and uses a **throttle (`MAX_DOWNLOADS_PER_SEC`)** to avoid blocks.
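+
+Under the hood this protection is little more than a per-request rate limiter plus
+escalating retry waits. A minimal sketch of the idea (simplified from
+`scraper/book_scraper.py` in this PR; the real implementation also adds a hard
+60-second cooldown with a countdown when HTTP 429 is returned):
+
+```python
+import os
+import time
+
+MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
+MIN_DELAY = 1.0 / MAX_DL if MAX_DL > 0 else 1.0
+_last_download = 0.0
+
+
+def throttle():
+    """Sleep just long enough to stay under MAX_DOWNLOADS_PER_SEC."""
+    global _last_download
+    elapsed = time.time() - _last_download
+    if elapsed < MIN_DELAY:
+        time.sleep(MIN_DELAY - elapsed)
+    _last_download = time.time()
+
+
+def wait_before_retry(attempt):
+    """Escalating wait between retries (attempt 1, 2, 3, ...)."""
+    time.sleep(attempt + 1)
+```
+
+---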
+
+## 2. 🌐 Flask web interface (UI)
+
+The web interface offers:
+
+- An input field for the book URL
+- A button: **Run Scraper**
+- Live feedback via **server-sent events (SSE)**
+
+While scraping, real-time updates appear, such as:
+
+```
+[DEBUG] GET chapter 1123
+[DEBUG] HTTP 429 → retry sleep 4.0s
+[DEBUG] Saved chapter: output/xxx/01123_章名.txt
+```
+
+This makes it feel as if the scraper is working "live".
+
+---
+
+## 3. 📡 Live logging (SSE)
+
+The logger captures all BookScraper messages and streams them
+via `/stream` to the web interface.
+This makes it easy to keep an eye on scraping without a console.
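+
+Because it is plain SSE, the stream can also be followed outside the browser,
+for example (assuming the default host and port used in this repo):
+
+```bash
+curl -N http://localhost:5000/stream
+```
+
+---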

+## 4. 🔧 Configuration via `.env`
+
+To keep the project flexible, everything is configured through `.env`:
+
+- Throttle (`MAX_DOWNLOADS_PER_SEC`)
+- Debug mode (`FLASK_DEBUG`)
+- `DRY_RUN` (only the first chapters)
+- Volume size
+- Host & port
+
+The `.env` file is loaded automatically by Docker Compose and by Flask.
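+
+For reference, a minimal `.env` could look like this (illustrative values; the
+variable names are the ones read by `app.py` and `scraper/book_scraper.py`):
+
+```env
+# Scraper behaviour
+DRY_RUN=1
+TEST_LIMIT=10
+MAX_DOWNLOADS_PER_SEC=1
+MAX_VOL_SIZE=200
+CHAPTER_DELAY=2
+
+# Flask
+FLASK_DEBUG=0
+HOST=0.0.0.0
+PORT=5000
+```
+
+---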
+
+# 📦 Project structure
+
+```
+bookscraper/
+│
+├── scraper/
+│   ├── book_scraper.py    # The scraper engine
+│   ├── logger.py          # SSE logger
+│   ├── sites.py           # Site configuration (selectors etc.)
+│   ├── utils.py           # Helpers
+│   └── ...
+│
+├── templates/
+│   └── index.html         # UI
+│
+├── output/                # Book results
+│
+├── app.py                 # Flask web server + endpoints
+├── replacements.txt       # Text replacements
+├── Dockerfile
+├── docker-compose.yml
+├── requirements.txt
+└── .env
+```
+
+---
+
+# ▶️ Starting the project manually (WITHOUT Docker)
+
+Make sure the dependencies are installed:
+
+```bash
+pip install -r requirements.txt
+```
+
+Start the Flask server:
+
+```bash
+python app.py
+```
+
+Then open:
+👉 http://localhost:5000
+
+---
+
+# Docker build (without Compose)
+
+Build and run manually:
+
+```bash
+docker build -t bookscraper .
+docker run -p 5000:5000 --env-file .env bookscraper
+```
+
+Or with the output folder mounted on the host:
+
+```bash
+docker run \
+  -p 5000:5000 \
+  --env-file .env \
+  -v $(pwd)/output:/app/output \
+  bookscraper
+```
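+
+With the `docker-compose.yml` that this PR adds, the same thing can be started in
+one go (note that the compose file maps host port 5050 to container port 5000):
+
+```bash
+docker compose up --build
+```
+
+Then open 👉 http://localhost:5050.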

diff --git a/bookscraper/app.py b/bookscraper/app.py
index 4f6d9a6..ed983c6 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -1,53 +1,71 @@
-from flask import Flask, request, render_template_string
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-import sys
+# app.py
+from flask import Flask, request, Response, render_template
+import time
+import queue
 import os
 
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from scraper.book_scraper import BookScraper
+from scraper.sites import BookSite
+from scraper.logger import add_listener, remove_listener, LOG_BUFFER
 
 app = Flask(__name__)
 
-# --- GET: toon formulier ---
-@app.route("/", methods=["GET"])
+@app.route("/")
 def index():
-    return render_template_string("""
-    <html>
-    <body>
-        <h2>BookScraper</h2>
-        <form method="POST">
-            <input type="text" name="url">
-            <button type="submit">Run Scraper</button>
-        </form>
-    </body>
-    </html>
-    """)
-
-
-# --- POST: scraper uitvoeren ---
-@app.route("/", methods=["POST"])
-def run_scraper():
-    url = request.form.get("url")
+    return render_template("index.html")
+
+# ----------------------------------------------------------
+# RUN SCRAPER
+# ----------------------------------------------------------
+
+@app.route("/run", methods=["POST"])
+def run_scraper():
+    data = request.json
 
     site = BookSite()
-    scraper = BookScraper(site, url)
+    scraper = BookScraper(site, data["url"])
     result = scraper.execute()
 
-    return render_template_string("""
-    <html>
-    <body>
-        <h2>Scrape result: {{title}}</h2>
-        <h3>Debug output:</h3>
-        <pre>
-{{debug}}
-        </pre>
-        <p><a href="/">Terug</a></p>
-    </body>
-    </html>
-    """, title=result["title"], debug=result["debug"])
+    return {
+        "title": result["title"],
+        "buffer": LOG_BUFFER.getvalue()
+    }
+
+# ----------------------------------------------------------
+# REALTIME LOG STREAM (SSE)
+# ----------------------------------------------------------
+
+@app.route("/stream")
+def stream():
+
+    def event_stream():
+        q = queue.Queue()
+
+        # push log lines from BookScraper to SSE
+        def listener(line):
+            q.put(line)
+
+        add_listener(listener)
+
+        try:
+            while True:
+                msg = q.get()   # blocks until a log line arrives
+                yield f"data: {msg}\n\n"
+        except GeneratorExit:
+            pass
+        finally:
+            remove_listener(listener)
+
+    return Response(event_stream(), mimetype="text/event-stream")
+
+
+# ----------------------------------------------------------
 if __name__ == "__main__":
-    app.run(debug=True)
+    debug = os.getenv("FLASK_DEBUG", "0") == "1"
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", "5000"))
+
+    app.run(debug=debug, host=host, port=port)
diff --git a/bookscraper/docker-compose.yml b/bookscraper/docker-compose.yml
new file mode 100644
index 0000000..ade0a70
--- /dev/null
+++ b/bookscraper/docker-compose.yml
@@ -0,0 +1,25 @@
+version: "3.9"
+
+services:
+  bookscraper:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: bookscraper
+    ports:
+      - "5050:5000"
+
+    # Mount everything the way you already work locally
+    volumes:
+      - .:/app                                # full project folder
+      - /Users/peter/Desktop/books:/app/output
+
+    # The existing .env is loaded automatically by Docker Compose
+    env_file:
+      - .env
+
+    # Make sure Flask does NOT go into debug mode (your code decides this)
+    environment:
+      FLASK_ENV: "production"
+
+    restart: unless-stopped
diff --git a/bookscraper/output/合成召唤/piaotian/cover.jpg b/bookscraper/output/合成召唤/piaotian/cover.jpg
deleted file mode 100644
index 733afb4..0000000
Binary files a/bookscraper/output/合成召唤/piaotian/cover.jpg and /dev/null differ
diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py
index 56b4f25..83b0348 100644
--- a/bookscraper/scraper/book_scraper.py
+++ b/bookscraper/scraper/book_scraper.py
@@ -6,18 +6,14 @@ from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 from PIL import Image
 from io import BytesIO
-from dotenv import load_dotenv
 
-from scraper.logger import setup_logger, LOG_BUFFER
+from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
 
-load_dotenv()
-logger = setup_logger()
-
 
 class Chapter:
-    def __init__(self, number, title, url):
-        self.number = number
+    def __init__(self, num, title, url):
+        self.number = num
         self.title = title
         self.url = url
         self.text = ""
@@ -34,88 +30,135 @@ class BookScraper:
         self.cover_url = ""
         self.chapters = []
 
-        self.chapter_base = None
         self.base_path = None
+        self.chapter_base = None
+
+        # ENV
+        self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
+        self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
+        self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
+        self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
+        self._last_download_time = 0
+
+        # replacements.txt
+        fp = os.path.join(os.getcwd(), "replacements.txt")
+        extra = load_replacements(fp)
+        self.site.replacements.update(extra)
+
+        self.start_time = None
+        self.total_chapters = 0
+        self.volume_dirs = {}
+
+    # ------------------------------------------------------------
+    # RATE LIMITER
+    # ------------------------------------------------------------
 
-        # ENV settings
-        self.DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
-        self.TEST_CHAPTER_LIMIT = int(os.getenv("TEST_CHAPTER_LIMIT", "10"))
-        self.MAX_VOL_SIZE = int(os.getenv("MAX_VOL_SIZE", "1500"))
-        self.MAX_DL_PER_SEC = int(os.getenv("MAX_DL_PER_SEC", "2"))
+    def throttle(self):
+        now = time.time()
+        elapsed = now - self._last_download_time
 
-        # Load text replacements
-        self.replacements = load_replacements("replacements.txt")
+        if elapsed < self.min_delay:
+            time.sleep(self.min_delay - elapsed)
 
-    # -----------------------------------------------------
+        self._last_download_time = time.time()
+
+    # ------------------------------------------------------------
     def execute(self):
-        LOG_BUFFER.seek(0)
-        LOG_BUFFER.truncate(0)
+        log_debug(f"Starting scraper for {self.url}")
 
-        logger.debug("Starting scraper for %s", self.url)
-        soup = self.get_document(self.url)
+        self.start_time = time.time()
+        soup = self.get_doc_with_retry(self.url)
 
         self.parse_title(soup)
         self.parse_author(soup)
         self.parse_description(soup)
         self.parse_cover(soup)
+
         self.prepare_output_folder()
 
         chapter_page = self.get_chapter_page(soup)
         self.parse_chapter_links(chapter_page)
+        self.prepare_volume_folders()
 
         if self.DRY_RUN:
-            logger.debug(
-                "DRY RUN → downloading only first %s chapters", self.TEST_CHAPTER_LIMIT)
-            self.get_some_chapters(self.TEST_CHAPTER_LIMIT)
+            self.download_some(self.TEST_LIMIT)
         else:
-            self.get_all_chapters()
-            self.split_into_volumes()
+            self.download_all()
+
+        return {"title": self.book_title}
+
+    # ------------------------------------------------------------
+    # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
+    # ------------------------------------------------------------
+    def get_doc_with_retry(self, url):
+        attempt = 1
+
+        while True:
+            self.throttle()
+            log_debug(f"GET {url} (attempt {attempt})")
+
+            try:
+                resp = requests.get(
+                    url,
+                    headers={"User-Agent": "Mozilla/5.0"},
+                    timeout=10,
+                )
+            except Exception as e:
+                log_debug(f"Network error {e} → retry in {attempt + 1}s")
+                time.sleep(attempt + 1)
+                attempt += 1
+                continue
 
-        return {
-            "title": self.book_title,
-            "debug": LOG_BUFFER.getvalue()
-        }
+            code = resp.status_code
+            log_debug(f"HTTP {code} for {url}")
+
+            # 429 → hard cooldown with countdown
+            if code == 429:
+                cooldown = 60
+                log_debug(f"429 detected - cooldown {cooldown}s")
+                for i in range(cooldown, 0, -1):
+                    log_debug(f"429 cooldown… {i}s remaining")
+                    time.sleep(1)
+                attempt += 1
+                continue
 
-    # -----------------------------------------------------
-    # NETWORK
-    # -----------------------------------------------------
-    def get_document(self, url):
-        logger.debug("GET %s", url)
-        time.sleep(1 / max(1, self.MAX_DL_PER_SEC))
+            # recoverable
+            if code in (403, 500):
+                wait = min(5 * attempt, 30)
+                log_debug(f"HTTP {code} → retry in {wait}s")
+                time.sleep(wait)
+                attempt += 1
+                continue
 
-        resp = requests.get(
-            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
-        resp.encoding = self.site.encoding
+            if code == 200:
+                resp.encoding = self.site.encoding
+                return BeautifulSoup(resp.text, "lxml")
 
-        logger.debug("HTTP %s for %s", resp.status_code, url)
-        return BeautifulSoup(resp.text, "lxml")
+            # unexpected
+            wait = attempt + 1
+            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
+            time.sleep(wait)
+            attempt += 1
 
-    # -----------------------------------------------------
-    # BASIC PARSERS (piaotia structure)
-    # -----------------------------------------------------
+    # ------------------------------------------------------------
     def parse_title(self, soup):
         h1 = soup.find("h1")
-        if h1:
-            self.book_title = h1.get_text(strip=True)
-        else:
-            self.book_title = "UnknownTitle"
-        logger.debug("Book title: %s", self.book_title)
+        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
+        log_debug(f"Book title = {self.book_title}")
 
     def parse_author(self, soup):
-        td = soup.find("td", string=lambda t: t and "作" in t and "者" in t)
-        if td:
-            raw = td.get_text(strip=True)
-            if "：" in raw:
-                self.book_author = raw.split("：", 1)[1].strip()
-            else:
-                self.book_author = "UnknownAuthor"
-        else:
-            self.book_author = "UnknownAuthor"
-        logger.debug("Book author: %s", self.book_author)
+        td = soup.find("td", string=lambda t: t and "作" in t)
+        self.book_author = (
+            td.get_text(strip=True).split("：")[1]
+            if td and "：" in td.get_text()
+            else "UnknownAuthor"
+        )
+        log_debug(f"Book author = {self.book_author}")
 
     def parse_description(self, soup):
         span = soup.find("span", string=lambda t: t and "内容简介" in t)
         if not span:
+            log_debug("No description found")
            self.book_description = ""
            return
 
@@ -123,113 +166,210 @@ class BookScraper:
         for sib in span.next_siblings:
             if getattr(sib, "name", None) == "span":
                 break
-            txt = sib.get_text(strip=True) if not isinstance(
-                sib, str) else sib.strip()
-            if txt:
-                parts.append(txt)
+            text = (
+                sib.get_text(strip=True)
+                if hasattr(sib, "get_text")
+                else str(sib).strip()
+            )
+            if text:
+                parts.append(text)
 
         self.book_description = "\n".join(parts)
-        logger.debug("Description parsed (%s chars)",
-                     len(self.book_description))
+        log_debug(f"Description length = {len(self.book_description)}")
 
+    # ------------------------------------------------------------
     def parse_cover(self, soup):
-        selector = (
-            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table "
-            "> tr:nth-of-type(4) > td:nth-of-type(1) > table > tr:nth-of-type(1) "
-            "> td:nth-of-type(2) > a:nth-of-type(1) > img"
-        )
-        img = soup.select_one(selector)
-        if img:
-            self.cover_url = urljoin(self.site.root, img.get("src"))
-        else:
-            logger.debug("Cover not found!")
-        logger.debug("Cover URL = %s", self.cover_url)
+        cover = soup.find(
+            "img", src=lambda v: v and "files/article/image" in v)
+        if not cover:
+            log_debug("Cover not found")
+            return
 
-    # -----------------------------------------------------
+        self.cover_url = urljoin(self.site.root, cover.get("src"))
+        log_debug(f"Cover URL = {self.cover_url}")
+
+    # ------------------------------------------------------------
     def prepare_output_folder(self):
-        output_root = os.getenv("OUTPUT_DIR", "./output")
-        self.base_path = Path(output_root) / self.book_title / self.site.name
+        self.base_path = Path("output") / self.book_title / self.site.name
         self.base_path.mkdir(parents=True, exist_ok=True)
-        logger.debug("Output directory: %s", self.base_path)
 
         if self.cover_url:
-            self.save_image(self.cover_url, self.base_path / "cover.jpg")
+            self.download_cover()
+
+    def download_cover(self):
+        log_debug(f"Downloading cover: {self.cover_url}")
 
-    def save_image(self, url, path):
-        logger.debug("Downloading cover: %s", url)
         resp = requests.get(
-            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
-        if resp.status_code == 200:
+            self.cover_url,
+            headers={"User-Agent": "Mozilla/5.0"},
+            timeout=10,
+        )
+
+        if resp.status_code != 200:
+            return
+
+        if "html" in resp.headers.get("Content-Type", ""):
+            return
+
+        try:
             img = Image.open(BytesIO(resp.content))
-            img.save(path)
-            logger.debug("Cover saved to %s", path)
+        except Exception:
+            return
+
+        img.save(self.base_path / "cover.jpg")
+        log_debug("Cover saved")
 
-    # -----------------------------------------------------
-    # CHAPTER PAGE
-    # -----------------------------------------------------
+    # ------------------------------------------------------------
     def get_chapter_page(self, soup):
         node = soup.select_one(
-            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table")
-        link = node.select_one("a")
-        href = link.get("href")
-        chapter_url = urljoin(self.site.root, href)
-
-        parsed = urlparse(chapter_url)
-        base = parsed.path.rsplit("/", 1)[0] + "/"
-        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{base}"
+            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
+        )
+        href = node.select_one("a").get("href")
+        url = urljoin(self.site.root, href)
 
-        logger.debug("Chapter index URL = %s", chapter_url)
-        logger.debug("CHAPTER_BASE = %s", self.chapter_base)
+        parsed = urlparse(url)
+        bp = parsed.path.rsplit("/", 1)[0] + "/"
+        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"
 
-        return self.get_document(chapter_url)
+        return self.get_doc_with_retry(url)
 
+    # ------------------------------------------------------------
     def parse_chapter_links(self, soup):
-        container = soup.select_one("div.centent")
-        links = container.select("ul li a[href]")
+        cont = soup.select_one(self.site.chapter_list_selector)
+        items = cont.select("ul li a[href]")
 
-        for i, a in enumerate(links, 1):
+        self.chapters = []
+        idx = 1
+        for a in items:
             href = a.get("href")
             if not href.endswith(".html"):
                 continue
-
-            abs_url = urljoin(self.chapter_base, href)
             title = a.get_text(strip=True)
-            self.chapters.append(Chapter(i, title, abs_url))
+            full = urljoin(self.chapter_base, href)
+            self.chapters.append(Chapter(idx, title, full))
+            idx += 1
+
+        self.total_chapters = len(self.chapters)
+        log_debug(f"Found {self.total_chapters} chapters")
+
+    # ------------------------------------------------------------
+    def prepare_volume_folders(self):
+        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
+        num_vols = (self.total_chapters + max_size - 1) // max_size
 
-        logger.debug("Total chapters: %s", len(self.chapters))
+        for v in range(1, num_vols + 1):
+            d = self.base_path / f"v{v}"
+            d.mkdir(parents=True, exist_ok=True)
+            self.volume_dirs[v] = d
 
-    # -----------------------------------------------------
-    # DOWNLOAD CHAPTERS
-    # -----------------------------------------------------
-    def get_all_chapters(self):
+    # ------------------------------------------------------------
+    def download_all(self):
         for ch in self.chapters:
-            ch.text = self.fetch_chapter(ch)
-            logger.debug("CH %s length = %s", ch.number, len(ch.text))
+            self.download_chapter(ch)
 
-    def get_some_chapters(self, limit):
+    def download_some(self, limit):
         for ch in self.chapters[:limit]:
-            ch.text = self.fetch_chapter(ch)
-            filename = self.base_path / f"{ch.number:05d}_{ch.title}.txt"
-            filename.write_text(ch.text, encoding="utf-8")
-            logger.debug("Saved test chapter: %s", filename)
+            self.download_chapter(ch)
+
+    # ------------------------------------------------------------
+    def download_chapter(self, ch):
+        # Determine volume + filename
+        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
+        volume = ((ch.number - 1) // max_size) + 1
+        vdir = self.volume_dirs.get(volume, self.base_path)
+
+        expected_name = f"{ch.number:05d}_{ch.title}.txt"
+        fname = vdir / expected_name
+        expected_full_path = str(fname.resolve())
+
+        # STRICT SKIP CHECK
+        if fname.exists() and fname.is_file():
+            actual_size = fname.stat().st_size
+
+            # correct name?
+            if fname.name == expected_name:
+                expected_dir = str(vdir.resolve())
+                actual_dir = str(fname.parent.resolve())
+
+                if expected_dir == actual_dir:
+                    if actual_size > 300:
+                        log_debug(
+                            f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
+                            f"  Path: {expected_full_path}\n"
+                            f"  Size: {actual_size} bytes"
+                        )
+                        return
+                    else:
+                        log_debug(
+                            f"Existing file too small ({actual_size} bytes), redownloading: {expected_full_path}"
+                        )
+                else:
+                    log_debug(
+                        f"Directory mismatch for chapter {ch.number}, redownloading"
+                    )
+            else:
+                log_debug(
+                    f"Filename mismatch for chapter {ch.number}, redownloading\n"
+                    f"  Expected: {expected_name}\n"
+                    f"  Found:    {fname.name}"
+                )
+
+        # PROGRESS INFO
+        percent = (ch.number / self.total_chapters) * 100
+        elapsed = time.time() - self.start_time
+        avg_time = elapsed / max(ch.number - 1, 1)
+        remaining = self.total_chapters - ch.number
+        eta_seconds = max(0, remaining * avg_time)
+
+        eta_min = int(eta_seconds // 60)
+        eta_sec = int(eta_seconds % 60)
+
+        log_debug(
+            f"Fetching chapter {ch.number}/{self.total_chapters} "
+            f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
+            f"{ch.title}"
+        )
+
+        # RETRY EMPTY CONTENT
+        attempt = 1
+        while True:
+            soup = self.get_doc_with_retry(ch.url)
+            text = self.parse_chapter_text(soup)
 
-    def fetch_chapter(self, ch):
-        soup = self.get_document(ch.url)
-        text = self.parse_chapter_text(soup)
-        return clean_text(text, self.replacements)
+            if text.strip():
+                ch.text = text
+                break
+
+            wait = min(10 + attempt, 30)
+            log_debug(f"Empty chapter → retry in {wait}s")
+            time.sleep(wait)
+            attempt += 1
+
+        fname.write_text(ch.text, encoding="utf-8")
+        log_debug(f"Saved chapter to v{volume}: {fname}")
+
+        chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
+        log_debug(f"Throttling {chapter_delay}s before next chapter")
+        time.sleep(chapter_delay)
+
+    # ------------------------------------------------------------
     def parse_chapter_text(self, soup):
         body = soup.body
+        if not body:
+            return ""
+
         h1 = body.find("h1")
+        if not h1:
+            return ""
 
         parts = []
         collecting = False
 
         for sib in h1.next_siblings:
-            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
-                break
-            if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
+            if getattr(sib, "class", None) == ["toplink"]:
                 continue
+            if getattr(sib, "class", None) == ["bottomlink"]:
+                break
 
             if getattr(sib, "name", None) in ["script", "style"]:
                 continue
@@ -238,32 +378,14 @@ class BookScraper:
                 collecting = True
                 continue
 
-            txt = sib.strip() if isinstance(sib, str) else sib.get_text("\n", strip=True)
-            if txt:
-                parts.append(txt)
-
-        return "\n".join(parts).strip()
-
-    # -----------------------------------------------------
-    # SPLIT VOLUMES
-    # -----------------------------------------------------
-    def split_into_volumes(self):
-        logger.debug(
-            "Splitting into volumes (max %s chapters per volume)", self.MAX_VOL_SIZE)
-
-        chapters = len(self.chapters)
-        volume = 1
-        index = 0
-
-        while index < chapters:
-            chunk = self.chapters[index:index + self.MAX_VOL_SIZE]
-            volume_dir = self.base_path / f"v{volume}"
-            volume_dir.mkdir(exist_ok=True)
-
-            for ch in chunk:
-                filename = volume_dir / f"{ch.number:05d}_{ch.title}.txt"
-                filename.write_text(ch.text, encoding="utf-8")
-
-            logger.debug("Volume %s saved (%s chapters)", volume, len(chunk))
-            volume += 1
-            index += self.MAX_VOL_SIZE
+            text = (
+                sib.get_text("\n", strip=True)
+                if hasattr(sib, "get_text")
+                else str(sib).strip()
+            )
+            if text:
+                parts.append(text)
+
+        raw = "\n".join(parts)
+        raw = clean_text(raw, self.site.replacements)
+        return raw.strip()
diff --git a/bookscraper/scraper/logger.py b/bookscraper/scraper/logger.py
index f70d0d5..e0f28f1 100644
--- a/bookscraper/scraper/logger.py
+++ b/bookscraper/scraper/logger.py
@@ -2,26 +2,72 @@ import logging
 from io import StringIO
 
-# In-memory buffer returned to web UI
+# In-memory buffer (for the final result)
 LOG_BUFFER = StringIO()
 
+# List of callbacks (SSE clients)
+LISTENERS = []
+
+
+def add_listener(callback):
+    """Register an SSE listener callback."""
+    LISTENERS.append(callback)
+
+
+def remove_listener(callback):
+    """Remove an SSE listener (on disconnect)."""
+    if callback in LISTENERS:
+        LISTENERS.remove(callback)
+
+
+def broadcast(line):
+    """Send a log line to all listeners."""
+    for cb in LISTENERS[:]:
+        try:
+            cb(line)
+        except Exception:
+            LISTENERS.remove(cb)
+
 
 def setup_logger():
+    """Create a logger that writes to the console, the buffer and the SSE broadcast."""
     logger = logging.getLogger("bookscraper")
     logger.setLevel(logging.DEBUG)
-    logger.handlers = []  # voorkomen dubbele handlers bij reload
+    logger.handlers = []
 
-    # Console handler
+    # formatter
+    fmt = logging.Formatter("[%(levelname)s] %(message)s")
+
+    # console handler
     ch = logging.StreamHandler()
     ch.setLevel(logging.DEBUG)
-    ch.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+    ch.setFormatter(fmt)
+
+    # buffer handler
+    bh = logging.StreamHandler(LOG_BUFFER)
+    bh.setLevel(logging.DEBUG)
+    bh.setFormatter(fmt)
 
-    # Buffer handler for returning to UI
-    mh = logging.StreamHandler(LOG_BUFFER)
-    mh.setLevel(logging.DEBUG)
-    mh.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+    # SSE handler
+    class SSEHandler(logging.Handler):
+        def emit(self, record):
+            msg = self.format(record)
+            broadcast(msg)
+
+    sh = SSEHandler()
+    sh.setLevel(logging.DEBUG)
+    sh.setFormatter(fmt)
 
     logger.addHandler(ch)
-    logger.addHandler(mh)
+    logger.addHandler(bh)
+    logger.addHandler(sh)
 
     return logger
+
+
+# Global logger
+LOGGER = setup_logger()
+
+
+def log_debug(msg):
+    LOGGER.debug(msg)
diff --git a/bookscraper/scraper/sites.py b/bookscraper/scraper/sites.py
index 89d3451..51023dc 100644
--- a/bookscraper/scraper/sites.py
+++ b/bookscraper/scraper/sites.py
@@ -3,7 +3,7 @@ class BookSite:
         self.name = "piaotian"
         self.root = "https://www.ptwxz.com"
         self.chapter_list_selector = "div.centent"
-        self.encoding = "gb2312"
+        self.encoding = "GB18030"
         self.replacements = {
             "  ": "\n",
             "手机用户请访问http://m.piaotian.net": "",

diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html
index 03526d9..7cb4612 100644
--- a/bookscraper/templates/index.html
+++ b/bookscraper/templates/index.html
@@ -1,22 +1,40 @@
 <html>
 <head>
-    <title>Book Scraper</title>
+    <title>BookScraper</title>
+    <style>
+        /* styling of the page and the log panel */
+    </style>
 </head>
 <body>
-    <h2>Book Scraper</h2>
-
-    {% if error %}
-        <p>{{ error }}</p>
-    {% endif %}
-
-    <form method="POST">
-        <input type="text" name="url">
-        <button type="submit">Run Scraper</button>
-    </form>
+    <h2>BookScraper</h2>
+
+    <input type="text" id="url" placeholder="Book URL">
+    <button id="run">Run Scraper</button>
+
+    <h3>Realtime log:</h3>
+    <pre id="log"></pre>
+
+    <script>
+        // stream log lines from /stream (SSE) into the log panel
+        const log = document.getElementById("log");
+        const source = new EventSource("/stream");
+        source.onmessage = (e) => {
+            log.textContent += e.data + "\n";
+            log.scrollTop = log.scrollHeight;
+        };
+
+        // start a scrape via POST /run with a JSON body
+        document.getElementById("run").onclick = () => {
+            fetch("/run", {
+                method: "POST",
+                headers: { "Content-Type": "application/json" },
+                body: JSON.stringify({ url: document.getElementById("url").value })
+            });
+        };
+    </script>
 </body>
 </html>