@@ -1,50 +1,35 @@
 # =========================================================
 # File: scraper/tasks/parse_tasks.py
 # Purpose: Parse downloaded HTML into clean chapter text.
-#
-# Abort Behavior:
-# - parse MUST ALWAYS RUN once download has started
-# - even if the user triggers abort afterwards
-# - (abort only prevents new chapters from starting)
-#
-# Logging:
-# - Same unified log_msg(book_id, message) as download_tasks
-# - publisher.log → console
-# - ui_log.push_ui → GUI
+# Enhanced version: Piaotia H1→content extractor + clean pipeline
+# NO HARDCODED REPLACEMENTS — everything comes from replacement files
 # =========================================================
 
 from celery_app import celery_app
 from bs4 import BeautifulSoup
 
-from scraper.utils import clean_text, load_replacements
+from scraper.utils import clean_text, load_all_replacements
 from scraper.tasks.download_tasks import log_msg  # unified logger
 
-print(">>> [IMPORT] parse_tasks.py loaded")
+print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
 
 
 @celery_app.task(bind=True, queue="parse", ignore_result=False)
 def parse_chapter(self, download_result: dict, meta: dict):
-    """
-    Parse raw HTML returned by download_chapter into clean chapter text.
-    """
 
-    # Extract book_id stored by download_tasks
     book_id = download_result.get("book_id", "NOBOOK")
 
     # ------------------------------------------------------------
-    # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
+    # SKIPPED DOWNLOAD → SKIP PARSE
    # ------------------------------------------------------------
     if download_result.get("skipped"):
         chapter = download_result.get("chapter")
         log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
 
-        # Ensure book_id is present in the returned dict
         download_result["book_id"] = book_id
 
         return download_result
 
     # ------------------------------------------------------------
-    # 2) Normal Parsing
+    # NORMAL PARSE
     # ------------------------------------------------------------
     chapter_num = download_result["chapter"]
     chapter_url = download_result["url"]
@@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict):
     soup = BeautifulSoup(html, "lxml")
 
+    # ------------------------------------------------------------
+    # STRICT SELECTORS (direct content blocks)
+    # ------------------------------------------------------------
     selectors = [
         "#content",
-        ".content",
         "div#content",
+        ".content",
         "div.content",
+        "#chaptercontent",
         "div#chaptercontent",
         "#chapterContent",
         ".read-content",
+        "div.read-content",
     ]
 
     node = None
@@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict):
             node = tmp
             break
 
-    raw = node.get_text() if node else soup.get_text()
+    # ------------------------------------------------------------
+    # PIAOTIA FALLBACK:
+    # Extract content between <H1> and the "bottomlink" block.
+    # ------------------------------------------------------------
+    raw = None
+    if node is None:
+        h1 = soup.find("h1")
+        if h1:
+            content_parts = []
+            for sib in h1.next_siblings:
+
+                # stop at bottom navigation/footer block
+                sib_class = getattr(sib, "get", lambda *_: None)("class")
+                if sib_class and (
+                    "bottomlink" in sib_class or sib_class == "bottomlink"
+                ):
+                    break
+
+                # ignore typical noise containers
+                if getattr(sib, "name", None) in ["script", "style", "center"]:
+                    continue
+
+                if hasattr(sib, "get_text"):
+                    content_parts.append(sib.get_text(separator="\n"))
+                else:
+                    content_parts.append(str(sib))
+
+            raw = "\n".join(content_parts)
+
+    # ------------------------------------------------------------
+    # FINAL FALLBACK
+    # ------------------------------------------------------------
+    if raw is None:
+        if node:
+            raw = node.get_text(separator="\n")
+        else:
+            # drop scripts & styles
+            for tag in soup(["script", "style", "noscript"]):
+                tag.decompose()
+
+            raw = soup.get_text(separator="\n")
 
     # ------------------------------------------------------------
-    # Apply global replacements
+    # MULTIPASS CLEANING via replacement files ONLY
     # ------------------------------------------------------------
-    REPL = load_replacements()
-    text = clean_text(raw, REPL)
+    REPL = load_all_replacements()
 
+    text = raw
+    for _ in range(5):  # like the C# CleanText loop
+        text = clean_text(text, REPL)
 
     # ------------------------------------------------------------
-    # Chapter 1 gets full header
+    # Collapse excessive empty lines
     # ------------------------------------------------------------
+    cleaned = []
+    prev_blank = False
+
+    for line in text.split("\n"):
+        stripped = line.rstrip()
+        if stripped == "":
+            if prev_blank:
+                continue
+            prev_blank = True
+            cleaned.append("")
+        else:
+            prev_blank = False
+            cleaned.append(stripped)
+
+    text = "\n".join(cleaned)
+
+    # ------------------------------------------------------------
+    # Add header to chapter 1
+    # ------------------------------------------------------------
     if chapter_num == 1:
         book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
 
         header = (
             f"{meta.get('title','')}\n"
             f"Author: {meta.get('author','')}\n"