text parsing optimization

feat/redis-state-model
peter.fong 2 weeks ago
parent 1a720fbea0
commit dff30e5768

@ -17,7 +17,7 @@ from scraper.abort import set_abort
from scraper.progress import get_progress
# UI LOGS (GLOBAL — no book_id)
from scraper.ui_log import get_ui_logs
from scraper.ui_log import get_ui_logs, reset_ui_logs # <-- ADDED
from celery.result import AsyncResult
@ -58,6 +58,11 @@ def start_scraping():
if not url:
return render_template("result.html", error="Geen URL opgegeven.")
# ---------------------------------------------------------
# NEW: Clear UI log buffer when starting a new scrape
# ---------------------------------------------------------
reset_ui_logs()
log_debug(f"[WEB] Scraping via Celery: {url}")
async_result = celery_app.send_task(
@ -70,11 +75,19 @@ def start_scraping():
"result.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
# for result.html cover rendering
book_title=None,
)
# =====================================================
# CLEAR UI LOGS MANUALLY (NEW)
# =====================================================
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
# =====================================================
# ABORT (per book_id)
# =====================================================

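A quick sketch of exercising the new /clear-logs route through Flask's test client; the import path of the Flask app object is assumed (the hunks above do not show the module name), and a running Redis instance is needed because reset_ui_logs() deletes the UI log key.

# assumes the Flask app object is importable as `app` from web.app (hypothetical path)
from web.app import app

def test_clear_logs():
    client = app.test_client()
    resp = client.post("/clear-logs")     # triggers reset_ui_logs() server-side
    assert resp.status_code == 200
    assert resp.get_json() == {"status": "ok", "message": "UI logs cleared"}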
@ -0,0 +1,37 @@
#scraper/replacements/encoding.txt
# --- fix common encoding artifacts ---
# IDEOGRAPHIC SPACE → empty
\u3000=
# non-breaking space → empty
\u00A0=
# full-width punctuation
，=,
。=.
！=!
？=?
；=;
：=:
（=(
）=)
【=[
】=]
《=<
》=>
# hyphen variants
–=-
—=-
―=-
\u3000=
\u00A0=
 =
 =
=
—=—
“="
”="
’='
…=…
•=*
▁=
▲=
 =

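These replacement files are plain key=value lines read by load_replacement_file() further down in this commit. A small sketch of what loading encoding.txt yields; note, as an observation rather than shown behaviour, that keys written as \uXXXX escape sequences are stored as the literal backslash text unless they are decoded explicitly.

from pathlib import Path
import codecs
from scraper.utils import load_replacement_file

repl = load_replacement_file(Path("scraper/replacements/encoding.txt"))
# "。=."  ->  repl["。"] == "."
# "【=["  ->  repl["【"] == "["
# Optional normalisation (assumption, not part of the loader shown below):
# turn literal "\u3000"-style keys into the real characters they name.
decoded = {
    (codecs.decode(k, "unicode_escape") if k.startswith("\\u") else k): v
    for k, v in repl.items()
}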
@ -0,0 +1,27 @@
#scraper/replacements/html.txt
<br>=\n
<br/>=\n
<br />=\n
&nbsp;=
&nbsp&nbsp=
&nbsp&nbsp&nbsp=
&emsp;=
&ensp;=
&thinsp;=
&ldquo;="
&rdquo;="
&lsquo;='
&rsquo;='
&lt;=<
&gt;=>
&copy;=
&reg;=
&trade;=
fontbigbigbig=
fontbigbig=
font1=
font2=
font3=
strongstrong=
divdiv=
spanspan=

@ -0,0 +1,77 @@
#scraper/replacements/junk.txt
# --- Navigation ---
上一章=
下一章=
上一頁=
下一頁=
返回顶部=
返回目录=
返回书页=
章节目录=
章节列表=
快捷键=
(快捷键 ←)=
(快捷键 →)=
(快捷键)=
(快捷键 ←)=
(快捷键 →)=
上一页=
下一页=
手机阅读=
返回=
上一页阅读=
下一页阅读=
# --- Booksite footer disclaimers ---
重要声明=
所有的文字=
均由网友发表=
均由网友上传=
本站立场无关=
阅读更多小说=
返回飘天文学网=
小说阅读网=
最新章节请返回=
永久地址=
All rights reserved=
Copyright=
飘天文学=
# --- Piaotia specific ---
请记住本书域名=
请收藏本书=
加入书签=
加入书架=
收藏本书=
推荐本书=
本章未完=
请稍后=
最新网址=
小说网=
小说阅读=
将本书加入书架=
章节出错=
点此举报=
举报原因=
# --- Ads / QR / watermark ---
关注公众号=
微信扫一扫=
扫码阅读=
二维码=
QQ交流群=
加QQ群=
广告=
广告位=
sponsor=
sponsored=
ADVERTISEMENT=
Advertisment=
Adblock=
# --- Mode / UI related ---
选择背景颜色=
选择字体大小=
繁體中文=
模式选择=
阅读模式=

@ -98,7 +98,7 @@ def generate_audio(
# ============================================================
container_path = chapter_text
log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
# log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
# 1) Strip container prefix to get relative path: BOOK/VOLUME/FILE
if container_path.startswith(CONTAINER_PREFIX):
@ -120,7 +120,7 @@ def generate_audio(
# 2) Construct real host path
host_path = os.path.join(HOST_PATH, relative_path)
log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
# log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
# ============================================================
# PREPARE OUTPUT DIR (always correct)
@ -132,7 +132,7 @@ def generate_audio(
safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4a")
log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
# log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
if os.path.exists(audio_file):
log(f"[AUDIO] Skip CH{chapter_number} → already exists")

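For readers of the hunks above, the container→host path mapping in one standalone sketch; the CONTAINER_PREFIX and HOST_PATH values are invented, only the string handling mirrors generate_audio().

import os

CONTAINER_PREFIX = "/data/books/"   # hypothetical value
HOST_PATH = "/srv/library"          # hypothetical value

def resolve_audio_paths(chapter_text: str, chapter_number: int, base_dir: str):
    container_path = chapter_text
    # 1) strip the container prefix to get the relative BOOK/VOLUME/FILE path
    relative_path = (
        container_path[len(CONTAINER_PREFIX):]
        if container_path.startswith(CONTAINER_PREFIX)
        else container_path
    )
    # 2) rebuild the real host path
    host_path = os.path.join(HOST_PATH, relative_path)
    # 3) zero-padded output name: 0001.m4a, 0002.m4a, ...
    audio_file = os.path.join(base_dir, f"{chapter_number:04d}.m4a")
    return host_path, audio_file

# resolve_audio_paths("/data/books/MyBook/Vol1/0001.txt", 1, "/srv/out")
# -> ("/srv/library/MyBook/Vol1/0001.txt", "/srv/out/0001.m4a")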
@ -1,50 +1,35 @@
# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#
# Abort Behavior:
# - parse MUST ALWAYS RUN once download has started
# - even if the user triggers abort afterwards
# - (abort only prevents new chapters from starting)
#
# Logging:
# - Same unified log_msg(book_id, message) as download_tasks
# - publisher.log → console
# - ui_log.push_ui → GUI
# Enhanced version: Piaotia H1→content extractor + clean pipeline
# NO HARDCODED REPLACEMENTS — everything comes from replacement files
# =========================================================
from celery_app import celery_app
from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_replacements
from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger
print(">>> [IMPORT] parse_tasks.py loaded")
print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
"""
Parse raw HTML returned by download_chapter into clean chapter text.
"""
# Extract book_id stored by download_tasks
book_id = download_result.get("book_id", "NOBOOK")
# ------------------------------------------------------------
# 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
# SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------
if download_result.get("skipped"):
chapter = download_result.get("chapter")
log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
# Ensure book_id is present in the returned dict
download_result["book_id"] = book_id
return download_result
# ------------------------------------------------------------
# 2) Normal Parsing
# NORMAL PARSE
# ------------------------------------------------------------
chapter_num = download_result["chapter"]
chapter_url = download_result["url"]
@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict):
soup = BeautifulSoup(html, "lxml")
# ------------------------------------------------------------
# STRICT SELECTORS (direct content blocks)
# ------------------------------------------------------------
selectors = [
"#content",
".content",
"div#content",
".content",
"div.content",
"#chaptercontent",
"div#chaptercontent",
"#chapterContent",
".read-content",
"div.read-content",
]
node = None
@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict):
node = tmp
break
raw = node.get_text() if node else soup.get_text()
# ------------------------------------------------------------
# PIAOTIA FALLBACK:
# Extract content between <H1> and the "bottomlink" block.
# ------------------------------------------------------------
raw = None
if node is None:
h1 = soup.find("h1")
if h1:
content_parts = []
for sib in h1.next_siblings:
# stop at bottom navigation/footer block
sib_class = getattr(sib, "get", lambda *_: None)("class")
if sib_class and (
"bottomlink" in sib_class or sib_class == "bottomlink"
):
break
# ignore typical noise containers
if getattr(sib, "name", None) in ["script", "style", "center"]:
continue
if hasattr(sib, "get_text"):
content_parts.append(sib.get_text(separator="\n"))
else:
content_parts.append(str(sib))
raw = "\n".join(content_parts)
# ------------------------------------------------------------
# Apply global replacements
# FINAL FALLBACK
# ------------------------------------------------------------
REPL = load_replacements()
text = clean_text(raw, REPL)
if raw is None:
if node:
raw = node.get_text(separator="\n")
else:
# drop scripts & styles
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
raw = soup.get_text(separator="\n")
# ------------------------------------------------------------
# Chapter 1 gets full header
# MULTIPASS CLEANING via replacement files ONLY
# ------------------------------------------------------------
REPL = load_all_replacements()
text = raw
for _ in range(5): # like the C# CleanText loop
text = clean_text(text, REPL)
# ------------------------------------------------------------
# Collapse excessive empty lines
# ------------------------------------------------------------
cleaned = []
prev_blank = False
for line in text.split("\n"):
stripped = line.rstrip()
if stripped == "":
if prev_blank:
continue
prev_blank = True
cleaned.append("")
else:
prev_blank = False
cleaned.append(stripped)
text = "\n".join(cleaned)
# ------------------------------------------------------------
# Add header to chapter 1
# ------------------------------------------------------------
if chapter_num == 1:
book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
header = (
f"{meta.get('title','')}\n"
f"Author: {meta.get('author','')}\n"

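To make the new Piaotia H1→bottomlink fallback concrete, a self-contained sketch on an invented HTML snippet; the walk over next_siblings is the same as in parse_chapter() above.

from bs4 import BeautifulSoup

html = """
<h1>第一章</h1>
<script>var ad = 1;</script>
<p>First paragraph of the chapter.</p>
<p>Second paragraph of the chapter.</p>
<div class="bottomlink"><a href="#">下一章</a></div>
"""

soup = BeautifulSoup(html, "lxml")
h1 = soup.find("h1")
content_parts = []
for sib in h1.next_siblings:
    sib_class = getattr(sib, "get", lambda *_: None)("class")
    if sib_class and "bottomlink" in sib_class:
        break                                    # stop at the footer/nav block
    if getattr(sib, "name", None) in ["script", "style", "center"]:
        continue                                 # skip noise containers
    if hasattr(sib, "get_text"):
        content_parts.append(sib.get_text(separator="\n"))
    else:
        content_parts.append(str(sib))
raw = "\n".join(content_parts)
# raw now holds only the two paragraphs; the ad script and nav block are gone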
@ -12,6 +12,7 @@ import redis
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
print(">>> [IMPORT] scraping.py loaded")
@ -24,6 +25,11 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def start_scrape_book(self, url: str):
"""Scrapes metadata + chapters and prepares download tracking."""
# ------------------------------------------------------------
# NEW: clear UI log buffer at start of new run
# ------------------------------------------------------------
reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------
@ -50,10 +56,10 @@ def start_scrape_book(self, url: str):
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------
# BOOK RUN ID (CHANGED: use book title instead of UUID)
# BOOK RUN ID (using title as ID)
# ------------------------------------------------------------
title = result.get("title") or "UnknownBook"
book_id = title # ← your requirement: title is unique and consistent
book_id = title # user requirement
result["book_id"] = book_id
@ -74,7 +80,6 @@ def start_scrape_book(self, url: str):
# ------------------------------------------------------------
# DISPATCH DOWNLOAD CONTROLLER
# ------------------------------------------------------------
# controller task signature = launch_downloads(book_id, scrape_result)
celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],

@ -34,3 +34,13 @@ def get_ui_logs(limit: int = None):
limit = LOG_BUFFER_SIZE
return r.lrange(UI_LOG_KEY, -limit, -1)
def reset_ui_logs():
"""
Clear the entire UI log buffer.
Used by:
- Clear button in GUI
- Auto-clear when new book scraping starts
"""
r.delete(UI_LOG_KEY)

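The UI log buffer is just a Redis list; a short lifecycle sketch below. The UI_LOG_KEY and LOG_BUFFER_SIZE values are assumed, and push_ui() is not part of this commit, so its RPUSH + LTRIM shape here is a guess at the producer side.

import redis

r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)
UI_LOG_KEY = "ui_log"        # assumed key name
LOG_BUFFER_SIZE = 500        # assumed cap

def push_ui(message: str) -> None:
    r.rpush(UI_LOG_KEY, message)                 # append at the tail
    r.ltrim(UI_LOG_KEY, -LOG_BUFFER_SIZE, -1)    # keep only the newest entries

push_ui("[WEB] Scraping via Celery: http://example.com/book/1")
print(r.lrange(UI_LOG_KEY, -10, -1))             # what get_ui_logs(10) returns
r.delete(UI_LOG_KEY)                             # what reset_ui_logs() does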
@ -1,19 +1,30 @@
# ============================================================
# File: scraper/utils.py
# Purpose:
# Centralised replacement loader + text cleaner
# using 3 replacement categories:
# 1) HTML replacements
# 2) Encoding replacements
# 3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================
import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
def load_replacement_file(path: Path) -> dict:
"""
Load key=value style replacements.
Empty or missing file return {}.
Lines starting with '#' are ignored.
Loads key=value pairs from a file.
Returns {} if the file is missing.
Ignores empty lines and lines starting with '#'.
"""
path = Path(filepath)
if not path.exists():
return {}
@ -22,8 +33,10 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, val = line.split("=", 1)
repl[key.strip()] = val.strip()
@ -32,36 +45,69 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
# ------------------------------------------------------------
# Clean extracted HTML text
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
root = Path(__file__).parent / "replacements"
html_file = root / "html.txt"
enc_file = root / "encoding.txt"
junk_file = root / "junk.txt"
repl = {}
repl.update(load_replacement_file(html_file))
repl.update(load_replacement_file(enc_file))
repl.update(load_replacement_file(junk_file))
return repl
# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
def load_replacements(filepath=None) -> dict:
"""
Normalize whitespace, remove junk, apply replacements.
repl_dict is optional {} if none provided.
Backward-compatible alias.
- If called with no filepath, return the merged replacements.
- If called with a filepath, load only that single file.
"""
if repl_dict is None:
repl_dict = {}
if filepath is None:
return load_all_replacements()
else:
# Allow explicit loading of a single file
path = Path(filepath)
return load_replacement_file(path)
txt = raw.replace("\r", "") # normalize CRLF
# Collapse 3+ blank lines → max 1 empty line
txt = re.sub(r"\n{3,}", "\n\n", txt)
# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
"""
Apply replacements and basic whitespace normalisation.
No hardcoded rules live here.
"""
if not raw:
return ""
# Apply replacements
for key, val in repl_dict.items():
txt = raw.replace("\r", "")
# Apply loaded replacements
for key, val in repl.items():
txt = txt.replace(key, val)
# Collapse 3+ blank lines → max 1
txt = re.sub(r"\n{3,}", "\n\n", txt)
return txt.strip()
# ------------------------------------------------------------
# Determine save path for a chapter (shared by download & save)
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
"""
Returns the filesystem path where this chapter should be saved.
Formats the filename as 0001.txt, 0002.txt, ...
"""
filename = f"{chapter_num:04d}.txt"
return os.path.join(base_path, filename)

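An end-to-end usage sketch of the reworked loader and cleaner, mirroring the multipass call pattern in parse_tasks.py; the sample input string is invented and the exact output depends on the contents of the three replacement files.

from scraper.utils import load_all_replacements, clean_text, get_save_path

REPL = load_all_replacements()      # html.txt + encoding.txt + junk.txt, merged in that order

raw = "第一章<br/>&nbsp;正文……\n\n\n\n上一章 下一章"
text = raw
for _ in range(5):                  # same loop as parse_chapter()
    text = clean_text(text, REPL)

print(text)                                  # junk keys replaced, blank runs collapsed
print(get_save_path(1, "/books/MyBook"))     # -> /books/MyBook/0001.txt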
@ -31,6 +31,21 @@
border-radius: 6px;
font-size: 13px;
}
/* NEW: Clear button */
#clearLogBtn {
margin-bottom: 10px;
padding: 8px 16px;
background: #777;
color: white;
border: none;
border-radius: 6px;
cursor: pointer;
}
#clearLogBtn:hover {
background: #555;
}
#abortBtn {
padding: 12px 20px;
background: #d9534f;
@ -68,7 +83,7 @@
<div class="box">{{ message }}</div>
{% endif %}
<!-- COVER WEERGAVE (toegevoegd) -->
<!-- COVER -->
{% if book_title %}
<div class="box">
<strong>Cover:</strong><br />
@ -103,7 +118,11 @@
</div>
<div class="box">
<strong>Live log:</strong>
<strong>Live log:</strong><br />
<!-- NEW BUTTON -->
<button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
<div id="logbox" class="logbox"></div>
</div>
@ -204,6 +223,17 @@
})
.catch(() => setTimeout(pollLogs, 1500));
}
// =========================================================
// NEW: Clear logs button handler
// =========================================================
function clearLogs() {
fetch("/clear-logs", { method: "POST" })
.then(() => {
document.getElementById("logbox").innerHTML = "";
})
.catch((e) => console.error("Clear logs failed:", e));
}
</script>
</body>
</html>
