From dff30e5768f81cbf648ce1ea83d43151bc4a2698 Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Wed, 3 Dec 2025 11:42:24 +0100 Subject: [PATCH] text parsing optimization --- bookscraper/app.py | 17 ++- bookscraper/scraper/replacements/encoding.txt | 37 ++++++ bookscraper/scraper/replacements/html.txt | 27 +++++ bookscraper/scraper/replacements/junk.txt | 77 +++++++++++++ bookscraper/scraper/tasks/audio_tasks.py | 6 +- bookscraper/scraper/tasks/parse_tasks.py | 107 +++++++++++++----- bookscraper/scraper/tasks/scraping.py | 11 +- bookscraper/scraper/ui_log.py | 10 ++ bookscraper/scraper/utils.py | 94 +++++++++++---- bookscraper/templates/result.html | 34 +++++- 10 files changed, 358 insertions(+), 62 deletions(-) create mode 100644 bookscraper/scraper/replacements/encoding.txt create mode 100644 bookscraper/scraper/replacements/html.txt create mode 100644 bookscraper/scraper/replacements/junk.txt diff --git a/bookscraper/app.py b/bookscraper/app.py index 9840714..bf758c8 100644 --- a/bookscraper/app.py +++ b/bookscraper/app.py @@ -17,7 +17,7 @@ from scraper.abort import set_abort from scraper.progress import get_progress # UI LOGS (GLOBAL — no book_id) -from scraper.ui_log import get_ui_logs +from scraper.ui_log import get_ui_logs, reset_ui_logs # <-- ADDED from celery.result import AsyncResult @@ -58,6 +58,11 @@ def start_scraping(): if not url: return render_template("result.html", error="Geen URL opgegeven.") + # --------------------------------------------------------- + # NEW: Clear UI log buffer when starting a new scrape + # --------------------------------------------------------- + reset_ui_logs() + log_debug(f"[WEB] Scraping via Celery: {url}") async_result = celery_app.send_task( @@ -70,11 +75,19 @@ def start_scraping(): "result.html", message="Scraping gestart.", scraping_task_id=async_result.id, - # voor result.html cover rendering book_title=None, ) +# ===================================================== +# CLEAR UI LOGS MANUALLY (NEW) +# ===================================================== +@app.route("/clear-logs", methods=["POST"]) +def clear_logs(): + reset_ui_logs() + return jsonify({"status": "ok", "message": "UI logs cleared"}) + + # ===================================================== # ABORT (per book_id) # ===================================================== diff --git a/bookscraper/scraper/replacements/encoding.txt b/bookscraper/scraper/replacements/encoding.txt new file mode 100644 index 0000000..6343bfc --- /dev/null +++ b/bookscraper/scraper/replacements/encoding.txt @@ -0,0 +1,37 @@ +#scraper/replacements/encoding.txt +# --- fix common encoding artifacts --- +\u3000= # IDEOGRAPHIC SPACE → empty +\u00A0= # non-breaking space → empty + +# full-width punctuation +,=, +。=. +!=! +?=? +;=; +:=: +(=( +)=) +【=[ +】=] +《=< +》=> + +# hyphen variants +–=- +—=- +―=- +\u3000= +\u00A0= + = + = += +—=— +“=" +”=" +’=' +…=… +•=* +▁= +▲= + = diff --git a/bookscraper/scraper/replacements/html.txt b/bookscraper/scraper/replacements/html.txt new file mode 100644 index 0000000..57dd8e8 --- /dev/null +++ b/bookscraper/scraper/replacements/html.txt @@ -0,0 +1,27 @@ +#scraper/replacements/html.txt +
<br>=\n +<br/>=\n +<br />
=\n + = +  = +   = + = + = + = +“=" +”=" +‘=' +’=' +<=< +>=> +©= +®= +™= +fontbigbigbig= +fontbigbig= +font1= +font2= +font3= +strongstrong= +divdiv= +spanspan= diff --git a/bookscraper/scraper/replacements/junk.txt b/bookscraper/scraper/replacements/junk.txt new file mode 100644 index 0000000..b5fc95b --- /dev/null +++ b/bookscraper/scraper/replacements/junk.txt @@ -0,0 +1,77 @@ +#scraper/replacements/junk.txt +# --- Navigation --- +上一章= +下一章= +上一頁= +下一頁= +返回顶部= +返回目录= +返回书页= +章节目录= +章节列表= +快捷键= +(快捷键 ←)= +(快捷键 →)= +(快捷键)= +(快捷键 ←)= +(快捷键 →)= +上一页= +下一页= +手机阅读= +返回= +上一页阅读= +下一页阅读= + +# --- Booksite footer disclaimers --- +重要声明= +所有的文字= +均由网友发表= +均由网友上传= +本站立场无关= +阅读更多小说= +返回飘天文学网= +小说阅读网= +最新章节请返回= +永久地址= +All rights reserved= +Copyright= +飘天文学= + +# --- Piaotia specific --- +请记住本书域名= +请收藏本书= +加入书签= +加入书架= +收藏本书= +推荐本书= +本章未完= +请稍后= +最新网址= +小说网= +小说阅读= +将本书加入书架= +章节出错= +点此举报= +举报原因= + +# --- Ads / QR / watermark --- +关注公众号= +微信扫一扫= +扫码阅读= +二维码= +QQ交流群= +加QQ群= +广告= +广告位= +sponsor= +sponsored= +ADVERTISEMENT= +Advertisment= +Adblock= + +# --- Mode / UI related --- +选择背景颜色= +选择字体大小= +繁體中文= +模式选择= +阅读模式= diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py index c1a3ff0..f735516 100644 --- a/bookscraper/scraper/tasks/audio_tasks.py +++ b/bookscraper/scraper/tasks/audio_tasks.py @@ -98,7 +98,7 @@ def generate_audio( # ============================================================ container_path = chapter_text - log(f"[AUDIO] CH{chapter_number}: container_path={container_path}") + # log(f"[AUDIO] CH{chapter_number}: container_path={container_path}") # 1) Strip container prefix to get relative path: BOOK/VOLUME/FILE if container_path.startswith(CONTAINER_PREFIX): @@ -120,7 +120,7 @@ def generate_audio( # 2) Construct real host path host_path = os.path.join(HOST_PATH, relative_path) - log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}") + # log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}") # ============================================================ # PREPARE OUTPUT DIR (always correct) @@ -132,7 +132,7 @@ def generate_audio( safe_num = f"{chapter_number:04d}" audio_file = os.path.join(base_dir, f"{safe_num}.m4a") - log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}") + # log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}") if os.path.exists(audio_file): log(f"[AUDIO] Skip CH{chapter_number} → already exists") diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py index ddea90e..52066f9 100644 --- a/bookscraper/scraper/tasks/parse_tasks.py +++ b/bookscraper/scraper/tasks/parse_tasks.py @@ -1,50 +1,35 @@ # ========================================================= # File: scraper/tasks/parse_tasks.py # Purpose: Parse downloaded HTML into clean chapter text. 
-# -# Abort Behavior: -# - parse MUST ALWAYS RUN once download has started -# - even if the user triggers abort afterwards -# - (abort only prevents new chapters from starting) -# -# Logging: -# - Same unified log_msg(book_id, message) as download_tasks -# - publisher.log → console -# - ui_log.push_ui → GUI +# Enhanced version: Piaotia H1→content extractor + clean pipeline +# NO HARDCODED REPLACEMENTS — everything comes from replacement files # ========================================================= from celery_app import celery_app from bs4 import BeautifulSoup -from scraper.utils import clean_text, load_replacements +from scraper.utils import clean_text, load_all_replacements from scraper.tasks.download_tasks import log_msg # unified logger -print(">>> [IMPORT] parse_tasks.py loaded") +print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)") @celery_app.task(bind=True, queue="parse", ignore_result=False) def parse_chapter(self, download_result: dict, meta: dict): - """ - Parse raw HTML returned by download_chapter into clean chapter text. - """ - # Extract book_id stored by download_tasks book_id = download_result.get("book_id", "NOBOOK") # ------------------------------------------------------------ - # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS + # SKIPPED DOWNLOAD → SKIP PARSE # ------------------------------------------------------------ if download_result.get("skipped"): chapter = download_result.get("chapter") log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)") - - # Ensure book_id is present in the returned dict download_result["book_id"] = book_id - return download_result # ------------------------------------------------------------ - # 2) Normal Parsing + # NORMAL PARSE # ------------------------------------------------------------ chapter_num = download_result["chapter"] chapter_url = download_result["url"] @@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict): soup = BeautifulSoup(html, "lxml") + # ------------------------------------------------------------ + # STRICT SELECTORS (direct content blocks) + # ------------------------------------------------------------ selectors = [ "#content", - ".content", "div#content", + ".content", "div.content", + "#chaptercontent", "div#chaptercontent", "#chapterContent", ".read-content", + "div.read-content", ] node = None @@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict): node = tmp break - raw = node.get_text() if node else soup.get_text() + # ------------------------------------------------------------ + # PIAOTIA FALLBACK: + # Extract content between
the <h1> heading
and the "bottomlink" block. + # ------------------------------------------------------------ + raw = None + if node is None: + h1 = soup.find("h1") + if h1: + content_parts = [] + for sib in h1.next_siblings: + + # stop at bottom navigation/footer block + sib_class = getattr(sib, "get", lambda *_: None)("class") + if sib_class and ( + "bottomlink" in sib_class or sib_class == "bottomlink" + ): + break + + # ignore typical noise containers + if getattr(sib, "name", None) in ["script", "style", "center"]: + continue + + if hasattr(sib, "get_text"): + content_parts.append(sib.get_text(separator="\n")) + else: + content_parts.append(str(sib)) + + raw = "\n".join(content_parts) # ------------------------------------------------------------ - # Apply global replacements + # FINAL FALLBACK # ------------------------------------------------------------ - REPL = load_replacements() - text = clean_text(raw, REPL) + if raw is None: + if node: + raw = node.get_text(separator="\n") + else: + # drop scripts & styles + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + + raw = soup.get_text(separator="\n") # ------------------------------------------------------------ - # Chapter 1 gets full header + # MULTIPASS CLEANING via replacement files ONLY + # ------------------------------------------------------------ + REPL = load_all_replacements() + + text = raw + for _ in range(5): # like the C# CleanText loop + text = clean_text(text, REPL) + + # ------------------------------------------------------------ + # Collapse excessive empty lines + # ------------------------------------------------------------ + cleaned = [] + prev_blank = False + + for line in text.split("\n"): + stripped = line.rstrip() + if stripped == "": + if prev_blank: + continue + prev_blank = True + cleaned.append("") + else: + prev_blank = False + cleaned.append(stripped) + + text = "\n".join(cleaned) + + # ------------------------------------------------------------ + # Add header to chapter 1 # ------------------------------------------------------------ if chapter_num == 1: book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN" - header = ( f"{meta.get('title','')}\n" f"Author: {meta.get('author','')}\n" diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py index 8b0b9fe..0694089 100644 --- a/bookscraper/scraper/tasks/scraping.py +++ b/bookscraper/scraper/tasks/scraping.py @@ -12,6 +12,7 @@ import redis from scraper.sites import BookSite from scraper.book_scraper import BookScraper from scraper.abort import clear_abort # no circular deps +from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT print(">>> [IMPORT] scraping.py loaded") @@ -24,6 +25,11 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True) def start_scrape_book(self, url: str): """Scrapes metadata + chapters and prepares download tracking.""" + # ------------------------------------------------------------ + # NEW: clear UI log buffer at start of new run + # ------------------------------------------------------------ + reset_ui_logs() + log(f"[SCRAPING] Start scraping for: {url}") # ------------------------------------------------------------ @@ -50,10 +56,10 @@ def start_scrape_book(self, url: str): log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters") # ------------------------------------------------------------ - # BOOK RUN ID (CHANGED: use book title instead of UUID) + # BOOK RUN ID (using title as ID) # ------------------------------------------------------------ title 
= result.get("title") or "UnknownBook" - book_id = title # ← your requirement: title is unique and consistent + book_id = title # user requirement result["book_id"] = book_id @@ -74,7 +80,6 @@ def start_scrape_book(self, url: str): # ------------------------------------------------------------ # DISPATCH DOWNLOAD CONTROLLER # ------------------------------------------------------------ - # controller task signature = launch_downloads(book_id, scrape_result) celery_app.send_task( "scraper.tasks.controller_tasks.launch_downloads", args=[book_id, result], diff --git a/bookscraper/scraper/ui_log.py b/bookscraper/scraper/ui_log.py index 18db819..312e20e 100644 --- a/bookscraper/scraper/ui_log.py +++ b/bookscraper/scraper/ui_log.py @@ -34,3 +34,13 @@ def get_ui_logs(limit: int = None): limit = LOG_BUFFER_SIZE return r.lrange(UI_LOG_KEY, -limit, -1) + + +def reset_ui_logs(): + """ + Clear the entire UI log buffer. + Used by: + - Clear button in GUI + - Auto-clear when new book scraping starts + """ + r.delete(UI_LOG_KEY) diff --git a/bookscraper/scraper/utils.py b/bookscraper/scraper/utils.py index 08e45f0..0bdd2f9 100644 --- a/bookscraper/scraper/utils.py +++ b/bookscraper/scraper/utils.py @@ -1,19 +1,30 @@ +# ============================================================ +# File: scraper/utils.py +# Purpose: +# Centralised replacement loader + text cleaner +# using 3 replacement categories: +# 1) HTML replacements +# 2) Encoding replacements +# 3) Junk-term replacements (generic "noise" phrases) +# +# Nothing in this file contains hardcoded cleanup rules. +# EVERYTHING comes from replacement files ONLY. +# ============================================================ + import os import re from pathlib import Path # ------------------------------------------------------------ -# Load replacements from text_replacements.txt (optional file) +# Generic key=value replacement loader # ------------------------------------------------------------ -def load_replacements(filepath="text_replacements.txt") -> dict: +def load_replacement_file(path: Path) -> dict: """ - Load key=value style replacements. - Empty or missing file → return {}. - Lines starting with '#' are ignored. + Loads key=value pairs from a file. + Missing file → {}. + Ignores empty lines and lines starting with '#'. """ - path = Path(filepath) - if not path.exists(): return {} @@ -22,8 +33,10 @@ def load_replacements(filepath="text_replacements.txt") -> dict: with open(path, "r", encoding="utf-8") as f: for line in f: line = line.strip() + if not line or line.startswith("#"): continue + if "=" in line: key, val = line.split("=", 1) repl[key.strip()] = val.strip() @@ -32,36 +45,69 @@ def load_replacements(filepath="text_replacements.txt") -> dict: # ------------------------------------------------------------ -# Clean extracted HTML text +# Load all categories (HTML → encoding → junk) +# Order matters: later overrides earlier. 
+# ------------------------------------------------------------ +def load_all_replacements() -> dict: + root = Path(__file__).parent / "replacements" + + html_file = root / "html.txt" + enc_file = root / "encoding.txt" + junk_file = root / "junk.txt" + + repl = {} + repl.update(load_replacement_file(html_file)) + repl.update(load_replacement_file(enc_file)) + repl.update(load_replacement_file(junk_file)) + + return repl + + +# ------------------------------------------------------------ +# Legacy compatibility wrapper +# Many modules still import: from scraper.utils import load_replacements +# This wrapper keeps everything working. # ------------------------------------------------------------ -def clean_text(raw: str, repl_dict: dict = None) -> str: +def load_replacements(filepath=None) -> dict: """ - Normalize whitespace, remove junk, apply replacements. - repl_dict is optional → {} if none provided. + Backward-compatible alias. + - If called with no filepath → return merged replacements. + - If called with a filepath → load that one file only. """ - if repl_dict is None: - repl_dict = {} + if filepath is None: + return load_all_replacements() + else: + # Allow explicit loading of a single file + path = Path(filepath) + return load_replacement_file(path) - txt = raw.replace("\r", "") # normalize CRLF - # Collapse 3+ blank lines → max 1 empty line - txt = re.sub(r"\n{3,}", "\n\n", txt) +# ------------------------------------------------------------ +# Clean text using loaded replacements +# ------------------------------------------------------------ +def clean_text(raw: str, repl: dict) -> str: + """ + Apply replacements and basic whitespace normalisation. + No hardcoded rules live here. + """ + if not raw: + return "" - # Apply replacements - for key, val in repl_dict.items(): + txt = raw.replace("\r", "") + + # Apply loaded replacements + for key, val in repl.items(): txt = txt.replace(key, val) + # Collapse 3+ blank lines → max 1 + txt = re.sub(r"\n{3,}", "\n\n", txt) + return txt.strip() # ------------------------------------------------------------ -# Determine save path for a chapter (shared by download & save) +# Determine chapter save path # ------------------------------------------------------------ def get_save_path(chapter_num: int, base_path: str) -> str: - """ - Returns the filesystem path where this chapter should be saved. - Formats the filename as 0001.txt, 0002.txt, ... - """ - filename = f"{chapter_num:04d}.txt" return os.path.join(base_path, filename) diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html index 81a12c8..57aabf9 100644 --- a/bookscraper/templates/result.html +++ b/bookscraper/templates/result.html @@ -31,6 +31,21 @@ border-radius: 6px; font-size: 13px; } + + /* NEW: Clear button */ + #clearLogBtn { + margin-bottom: 10px; + padding: 8px 16px; + background: #777; + color: white; + border: none; + border-radius: 6px; + cursor: pointer; + } + #clearLogBtn:hover { + background: #555; + } + #abortBtn { padding: 12px 20px; background: #d9534f; @@ -68,7 +83,7 @@
{{ message }} {% endif %} - + {% if book_title %} Cover:
@@ -103,7 +118,11 @@ - Live log: + Live log: + + +
@@ -204,6 +223,17 @@ }) .catch(() => setTimeout(pollLogs, 1500)); } + + // ========================================================= + // NEW: Clear logs button handler + // ========================================================= + function clearLogs() { + fetch("/clear-logs", { method: "POST" }) + .then(() => { + document.getElementById("logbox").innerHTML = ""; + }) + .catch((e) => console.error("Clear logs failed:", e)); + }
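Usage sketch (illustrative, not part of the diff): this is how the new replacement pipeline from scraper/utils.py is driven, mirroring the multipass loop in parse_chapter. It assumes the bookscraper package is importable; the sample text is invented.

    from scraper.utils import load_all_replacements, clean_text

    # Merged mapping: html.txt, then encoding.txt, then junk.txt
    # (later files override earlier ones, as noted in load_all_replacements).
    REPL = load_all_replacements()

    raw = "上一章 第一章 这是正文。 下一章"   # invented sample of scraped text
    text = raw
    for _ in range(5):                        # multipass cleaning, as in parse_chapter
        text = clean_text(text, REPL)

    # Navigation phrases listed in junk.txt (上一章 / 下一章) are stripped and
    # whitespace is normalised; what remains is the chapter text itself.
    print(text)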
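A second sketch (also illustrative) of the <h1>-to-"bottomlink" fallback that parse_chapter uses when none of the strict content selectors match. The HTML snippet is invented, and bs4 with the lxml parser is assumed, as in the project.

    from bs4 import BeautifulSoup

    html = (
        "<h1>第一章</h1>"
        "<p>正文第一段</p><p>正文第二段</p>"
        '<div class="bottomlink">上一章 下一章</div>'
    )
    soup = BeautifulSoup(html, "lxml")

    parts = []
    h1 = soup.find("h1")
    for sib in h1.next_siblings:
        cls = getattr(sib, "get", lambda *_: None)("class")
        if cls and "bottomlink" in cls:      # stop at the footer/navigation block
            break
        if getattr(sib, "name", None) in ("script", "style", "center"):
            continue                         # skip typical noise containers
        parts.append(sib.get_text("\n") if hasattr(sib, "get_text") else str(sib))

    print("\n".join(parts))                  # -> 正文第一段 / 正文第二段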