From dff30e5768f81cbf648ce1ea83d43151bc4a2698 Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Wed, 3 Dec 2025 11:42:24 +0100 Subject: [PATCH] text parsing optimization --- bookscraper/app.py | 17 ++- bookscraper/scraper/replacements/encoding.txt | 37 ++++++ bookscraper/scraper/replacements/html.txt | 27 +++++ bookscraper/scraper/replacements/junk.txt | 77 +++++++++++++ bookscraper/scraper/tasks/audio_tasks.py | 6 +- bookscraper/scraper/tasks/parse_tasks.py | 107 +++++++++++++----- bookscraper/scraper/tasks/scraping.py | 11 +- bookscraper/scraper/ui_log.py | 10 ++ bookscraper/scraper/utils.py | 94 +++++++++++---- bookscraper/templates/result.html | 34 +++++- 10 files changed, 358 insertions(+), 62 deletions(-) create mode 100644 bookscraper/scraper/replacements/encoding.txt create mode 100644 bookscraper/scraper/replacements/html.txt create mode 100644 bookscraper/scraper/replacements/junk.txt diff --git a/bookscraper/app.py b/bookscraper/app.py index 9840714..bf758c8 100644 --- a/bookscraper/app.py +++ b/bookscraper/app.py @@ -17,7 +17,7 @@ from scraper.abort import set_abort from scraper.progress import get_progress # UI LOGS (GLOBAL — no book_id) -from scraper.ui_log import get_ui_logs +from scraper.ui_log import get_ui_logs, reset_ui_logs # <-- ADDED from celery.result import AsyncResult @@ -58,6 +58,11 @@ def start_scraping(): if not url: return render_template("result.html", error="Geen URL opgegeven.") + # --------------------------------------------------------- + # NEW: Clear UI log buffer when starting a new scrape + # --------------------------------------------------------- + reset_ui_logs() + log_debug(f"[WEB] Scraping via Celery: {url}") async_result = celery_app.send_task( @@ -70,11 +75,19 @@ def start_scraping(): "result.html", message="Scraping gestart.", scraping_task_id=async_result.id, - # voor result.html cover rendering book_title=None, ) +# ===================================================== +# CLEAR UI LOGS MANUALLY (NEW) +# ===================================================== +@app.route("/clear-logs", methods=["POST"]) +def clear_logs(): + reset_ui_logs() + return jsonify({"status": "ok", "message": "UI logs cleared"}) + + # ===================================================== # ABORT (per book_id) # ===================================================== diff --git a/bookscraper/scraper/replacements/encoding.txt b/bookscraper/scraper/replacements/encoding.txt new file mode 100644 index 0000000..6343bfc --- /dev/null +++ b/bookscraper/scraper/replacements/encoding.txt @@ -0,0 +1,37 @@ +#scraper/replacements/encoding.txt +# --- fix common encoding artifacts --- +\u3000= # IDEOGRAPHIC SPACE → empty +\u00A0= # non-breaking space → empty + +# full-width punctuation +,=, +。=. +!=! +?=? +;=; +:=: +(=( +)=) +【=[ +】=] +《=< +》=> + +# hyphen variants +–=- +—=- +―=- +\u3000= +\u00A0= + = + = += +—=— +“=" +”=" +’=' +…=… +•=* +▁= +▲= + = diff --git a/bookscraper/scraper/replacements/html.txt b/bookscraper/scraper/replacements/html.txt new file mode 100644 index 0000000..57dd8e8 --- /dev/null +++ b/bookscraper/scraper/replacements/html.txt @@ -0,0 +1,27 @@ +#scraper/replacements/html.txt +
<br>=\n +<br/>=\n +<br />
=\n + = +  = +   = + = + = + = +“=" +”=" +‘=' +’=' +<=< +>=> +©= +®= +™= +fontbigbigbig= +fontbigbig= +font1= +font2= +font3= +strongstrong= +divdiv= +spanspan= diff --git a/bookscraper/scraper/replacements/junk.txt b/bookscraper/scraper/replacements/junk.txt new file mode 100644 index 0000000..b5fc95b --- /dev/null +++ b/bookscraper/scraper/replacements/junk.txt @@ -0,0 +1,77 @@ +#scraper/replacements/junk.txt +# --- Navigation --- +上一章= +下一章= +上一頁= +下一頁= +返回顶部= +返回目录= +返回书页= +章节目录= +章节列表= +快捷键= +(快捷键 ←)= +(快捷键 →)= +(快捷键)= +(快捷键 ←)= +(快捷键 →)= +上一页= +下一页= +手机阅读= +返回= +上一页阅读= +下一页阅读= + +# --- Booksite footer disclaimers --- +重要声明= +所有的文字= +均由网友发表= +均由网友上传= +本站立场无关= +阅读更多小说= +返回飘天文学网= +小说阅读网= +最新章节请返回= +永久地址= +All rights reserved= +Copyright= +飘天文学= + +# --- Piaotia specific --- +请记住本书域名= +请收藏本书= +加入书签= +加入书架= +收藏本书= +推荐本书= +本章未完= +请稍后= +最新网址= +小说网= +小说阅读= +将本书加入书架= +章节出错= +点此举报= +举报原因= + +# --- Ads / QR / watermark --- +关注公众号= +微信扫一扫= +扫码阅读= +二维码= +QQ交流群= +加QQ群= +广告= +广告位= +sponsor= +sponsored= +ADVERTISEMENT= +Advertisment= +Adblock= + +# --- Mode / UI related --- +选择背景颜色= +选择字体大小= +繁體中文= +模式选择= +阅读模式= diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py index c1a3ff0..f735516 100644 --- a/bookscraper/scraper/tasks/audio_tasks.py +++ b/bookscraper/scraper/tasks/audio_tasks.py @@ -98,7 +98,7 @@ def generate_audio( # ============================================================ container_path = chapter_text - log(f"[AUDIO] CH{chapter_number}: container_path={container_path}") + # log(f"[AUDIO] CH{chapter_number}: container_path={container_path}") # 1) Strip container prefix to get relative path: BOOK/VOLUME/FILE if container_path.startswith(CONTAINER_PREFIX): @@ -120,7 +120,7 @@ def generate_audio( # 2) Construct real host path host_path = os.path.join(HOST_PATH, relative_path) - log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}") + # log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}") # ============================================================ # PREPARE OUTPUT DIR (always correct) @@ -132,7 +132,7 @@ def generate_audio( safe_num = f"{chapter_number:04d}" audio_file = os.path.join(base_dir, f"{safe_num}.m4a") - log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}") + # log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}") if os.path.exists(audio_file): log(f"[AUDIO] Skip CH{chapter_number} → already exists") diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py index ddea90e..52066f9 100644 --- a/bookscraper/scraper/tasks/parse_tasks.py +++ b/bookscraper/scraper/tasks/parse_tasks.py @@ -1,50 +1,35 @@ # ========================================================= # File: scraper/tasks/parse_tasks.py # Purpose: Parse downloaded HTML into clean chapter text. 
-# -# Abort Behavior: -# - parse MUST ALWAYS RUN once download has started -# - even if the user triggers abort afterwards -# - (abort only prevents new chapters from starting) -# -# Logging: -# - Same unified log_msg(book_id, message) as download_tasks -# - publisher.log → console -# - ui_log.push_ui → GUI +# Enhanced version: Piaotia H1→content extractor + clean pipeline +# NO HARDCODED REPLACEMENTS — everything comes from replacement files # ========================================================= from celery_app import celery_app from bs4 import BeautifulSoup -from scraper.utils import clean_text, load_replacements +from scraper.utils import clean_text, load_all_replacements from scraper.tasks.download_tasks import log_msg # unified logger -print(">>> [IMPORT] parse_tasks.py loaded") +print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)") @celery_app.task(bind=True, queue="parse", ignore_result=False) def parse_chapter(self, download_result: dict, meta: dict): - """ - Parse raw HTML returned by download_chapter into clean chapter text. - """ - # Extract book_id stored by download_tasks book_id = download_result.get("book_id", "NOBOOK") # ------------------------------------------------------------ - # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS + # SKIPPED DOWNLOAD → SKIP PARSE # ------------------------------------------------------------ if download_result.get("skipped"): chapter = download_result.get("chapter") log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)") - - # Ensure book_id is present in the returned dict download_result["book_id"] = book_id - return download_result # ------------------------------------------------------------ - # 2) Normal Parsing + # NORMAL PARSE # ------------------------------------------------------------ chapter_num = download_result["chapter"] chapter_url = download_result["url"] @@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict): soup = BeautifulSoup(html, "lxml") + # ------------------------------------------------------------ + # STRICT SELECTORS (direct content blocks) + # ------------------------------------------------------------ selectors = [ "#content", - ".content", "div#content", + ".content", "div.content", + "#chaptercontent", "div#chaptercontent", "#chapterContent", ".read-content", + "div.read-content", ] node = None @@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict): node = tmp break - raw = node.get_text() if node else soup.get_text() + # ------------------------------------------------------------ + # PIAOTIA FALLBACK: + # Extract content between
the <h1> heading
and the "bottomlink" block. + # ------------------------------------------------------------ + raw = None + if node is None: + h1 = soup.find("h1") + if h1: + content_parts = [] + for sib in h1.next_siblings: + + # stop at bottom navigation/footer block + sib_class = getattr(sib, "get", lambda *_: None)("class") + if sib_class and ( + "bottomlink" in sib_class or sib_class == "bottomlink" + ): + break + + # ignore typical noise containers + if getattr(sib, "name", None) in ["script", "style", "center"]: + continue + + if hasattr(sib, "get_text"): + content_parts.append(sib.get_text(separator="\n")) + else: + content_parts.append(str(sib)) + + raw = "\n".join(content_parts) # ------------------------------------------------------------ - # Apply global replacements + # FINAL FALLBACK # ------------------------------------------------------------ - REPL = load_replacements() - text = clean_text(raw, REPL) + if raw is None: + if node: + raw = node.get_text(separator="\n") + else: + # drop scripts & styles + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + + raw = soup.get_text(separator="\n") # ------------------------------------------------------------ - # Chapter 1 gets full header + # MULTIPASS CLEANING via replacement files ONLY + # ------------------------------------------------------------ + REPL = load_all_replacements() + + text = raw + for _ in range(5): # like the C# CleanText loop + text = clean_text(text, REPL) + + # ------------------------------------------------------------ + # Collapse excessive empty lines + # ------------------------------------------------------------ + cleaned = [] + prev_blank = False + + for line in text.split("\n"): + stripped = line.rstrip() + if stripped == "": + if prev_blank: + continue + prev_blank = True + cleaned.append("") + else: + prev_blank = False + cleaned.append(stripped) + + text = "\n".join(cleaned) + + # ------------------------------------------------------------ + # Add header to chapter 1 # ------------------------------------------------------------ if chapter_num == 1: book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN" - header = ( f"{meta.get('title','')}\n" f"Author: {meta.get('author','')}\n" diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py index 8b0b9fe..0694089 100644 --- a/bookscraper/scraper/tasks/scraping.py +++ b/bookscraper/scraper/tasks/scraping.py @@ -12,6 +12,7 @@ import redis from scraper.sites import BookSite from scraper.book_scraper import BookScraper from scraper.abort import clear_abort # no circular deps +from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT print(">>> [IMPORT] scraping.py loaded") @@ -24,6 +25,11 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True) def start_scrape_book(self, url: str): """Scrapes metadata + chapters and prepares download tracking.""" + # ------------------------------------------------------------ + # NEW: clear UI log buffer at start of new run + # ------------------------------------------------------------ + reset_ui_logs() + log(f"[SCRAPING] Start scraping for: {url}") # ------------------------------------------------------------ @@ -50,10 +56,10 @@ def start_scrape_book(self, url: str): log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters") # ------------------------------------------------------------ - # BOOK RUN ID (CHANGED: use book title instead of UUID) + # BOOK RUN ID (using title as ID) # ------------------------------------------------------------ title 
= result.get("title") or "UnknownBook" - book_id = title # ← your requirement: title is unique and consistent + book_id = title # user requirement result["book_id"] = book_id @@ -74,7 +80,6 @@ def start_scrape_book(self, url: str): # ------------------------------------------------------------ # DISPATCH DOWNLOAD CONTROLLER # ------------------------------------------------------------ - # controller task signature = launch_downloads(book_id, scrape_result) celery_app.send_task( "scraper.tasks.controller_tasks.launch_downloads", args=[book_id, result], diff --git a/bookscraper/scraper/ui_log.py b/bookscraper/scraper/ui_log.py index 18db819..312e20e 100644 --- a/bookscraper/scraper/ui_log.py +++ b/bookscraper/scraper/ui_log.py @@ -34,3 +34,13 @@ def get_ui_logs(limit: int = None): limit = LOG_BUFFER_SIZE return r.lrange(UI_LOG_KEY, -limit, -1) + + +def reset_ui_logs(): + """ + Clear the entire UI log buffer. + Used by: + - Clear button in GUI + - Auto-clear when new book scraping starts + """ + r.delete(UI_LOG_KEY) diff --git a/bookscraper/scraper/utils.py b/bookscraper/scraper/utils.py index 08e45f0..0bdd2f9 100644 --- a/bookscraper/scraper/utils.py +++ b/bookscraper/scraper/utils.py @@ -1,19 +1,30 @@ +# ============================================================ +# File: scraper/utils.py +# Purpose: +# Centralised replacement loader + text cleaner +# using 3 replacement categories: +# 1) HTML replacements +# 2) Encoding replacements +# 3) Junk-term replacements (generic "noise" phrases) +# +# Nothing in this file contains hardcoded cleanup rules. +# EVERYTHING comes from replacement files ONLY. +# ============================================================ + import os import re from pathlib import Path # ------------------------------------------------------------ -# Load replacements from text_replacements.txt (optional file) +# Generic key=value replacement loader # ------------------------------------------------------------ -def load_replacements(filepath="text_replacements.txt") -> dict: +def load_replacement_file(path: Path) -> dict: """ - Load key=value style replacements. - Empty or missing file → return {}. - Lines starting with '#' are ignored. + Loads key=value pairs from a file. + Missing file → {}. + Ignores empty lines and lines starting with '#'. """ - path = Path(filepath) - if not path.exists(): return {} @@ -22,8 +33,10 @@ def load_replacements(filepath="text_replacements.txt") -> dict: with open(path, "r", encoding="utf-8") as f: for line in f: line = line.strip() + if not line or line.startswith("#"): continue + if "=" in line: key, val = line.split("=", 1) repl[key.strip()] = val.strip() @@ -32,36 +45,69 @@ def load_replacements(filepath="text_replacements.txt") -> dict: # ------------------------------------------------------------ -# Clean extracted HTML text +# Load all categories (HTML → encoding → junk) +# Order matters: later overrides earlier. 
+# ------------------------------------------------------------ +def load_all_replacements() -> dict: + root = Path(__file__).parent / "replacements" + + html_file = root / "html.txt" + enc_file = root / "encoding.txt" + junk_file = root / "junk.txt" + + repl = {} + repl.update(load_replacement_file(html_file)) + repl.update(load_replacement_file(enc_file)) + repl.update(load_replacement_file(junk_file)) + + return repl + + +# ------------------------------------------------------------ +# Legacy compatibility wrapper +# Many modules still import: from scraper.utils import load_replacements +# This wrapper keeps everything working. # ------------------------------------------------------------ -def clean_text(raw: str, repl_dict: dict = None) -> str: +def load_replacements(filepath=None) -> dict: """ - Normalize whitespace, remove junk, apply replacements. - repl_dict is optional → {} if none provided. + Backward-compatible alias. + - If called with no filepath → return merged replacements. + - If called with a filepath → load that one file only. """ - if repl_dict is None: - repl_dict = {} + if filepath is None: + return load_all_replacements() + else: + # Allow explicit loading of a single file + path = Path(filepath) + return load_replacement_file(path) - txt = raw.replace("\r", "") # normalize CRLF - # Collapse 3+ blank lines → max 1 empty line - txt = re.sub(r"\n{3,}", "\n\n", txt) +# ------------------------------------------------------------ +# Clean text using loaded replacements +# ------------------------------------------------------------ +def clean_text(raw: str, repl: dict) -> str: + """ + Apply replacements and basic whitespace normalisation. + No hardcoded rules live here. + """ + if not raw: + return "" - # Apply replacements - for key, val in repl_dict.items(): + txt = raw.replace("\r", "") + + # Apply loaded replacements + for key, val in repl.items(): txt = txt.replace(key, val) + # Collapse 3+ blank lines → max 1 + txt = re.sub(r"\n{3,}", "\n\n", txt) + return txt.strip() # ------------------------------------------------------------ -# Determine save path for a chapter (shared by download & save) +# Determine chapter save path # ------------------------------------------------------------ def get_save_path(chapter_num: int, base_path: str) -> str: - """ - Returns the filesystem path where this chapter should be saved. - Formats the filename as 0001.txt, 0002.txt, ... - """ - filename = f"{chapter_num:04d}.txt" return os.path.join(base_path, filename) diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html index 81a12c8..57aabf9 100644 --- a/bookscraper/templates/result.html +++ b/bookscraper/templates/result.html @@ -31,6 +31,21 @@ border-radius: 6px; font-size: 13px; } + + /* NEW: Clear button */ + #clearLogBtn { + margin-bottom: 10px; + padding: 8px 16px; + background: #777; + color: white; + border: none; + border-radius: 6px; + cursor: pointer; + } + #clearLogBtn:hover { + background: #555; + } + #abortBtn { padding: 12px 20px; background: #d9534f; @@ -68,7 +83,7 @@
{{ message }} {% endif %} - + {% if book_title %} Cover:
@@ -103,7 +118,11 @@ - Live log: + Live log: + + +
@@ -204,6 +223,17 @@ }) .catch(() => setTimeout(pollLogs, 1500)); } + + // ========================================================= + // NEW: Clear logs button handler + // ========================================================= + function clearLogs() { + fetch("/clear-logs", { method: "POST" }) + .then(() => { + document.getElementById("logbox").innerHTML = ""; + }) + .catch((e) => console.error("Clear logs failed:", e)); + }
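Usage sketch (illustrative, not part of the diff): this is how the new replacement pipeline from scraper/utils.py is driven, mirroring the multipass loop in parse_chapter. It assumes the bookscraper package is importable; the sample text is invented.

    from scraper.utils import load_all_replacements, clean_text

    # Merged mapping: html.txt, then encoding.txt, then junk.txt
    # (later files override earlier ones, as noted in load_all_replacements).
    REPL = load_all_replacements()

    raw = "上一章 第一章 这是正文。 下一章"   # invented sample of scraped text
    text = raw
    for _ in range(5):                        # multipass cleaning, as in parse_chapter
        text = clean_text(text, REPL)

    # Navigation phrases listed in junk.txt (上一章 / 下一章) are stripped and
    # whitespace is normalised; what remains is the chapter text itself.
    print(text)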
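A second sketch (also illustrative) of the <h1>-to-"bottomlink" fallback that parse_chapter uses when none of the strict content selectors match. The HTML snippet is invented, and bs4 with the lxml parser is assumed, as in the project.

    from bs4 import BeautifulSoup

    html = (
        "<h1>第一章</h1>"
        "<p>正文第一段</p><p>正文第二段</p>"
        '<div class="bottomlink">上一章 下一章</div>'
    )
    soup = BeautifulSoup(html, "lxml")

    parts = []
    h1 = soup.find("h1")
    for sib in h1.next_siblings:
        cls = getattr(sib, "get", lambda *_: None)("class")
        if cls and "bottomlink" in cls:      # stop at the footer/navigation block
            break
        if getattr(sib, "name", None) in ("script", "style", "center"):
            continue                         # skip typical noise containers
        parts.append(sib.get_text("\n") if hasattr(sib, "get_text") else str(sib))

    print("\n".join(parts))                  # -> 正文第一段 / 正文第二段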