diff --git a/bookscraper/app.py b/bookscraper/app.py
index 9840714..bf758c8 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -17,7 +17,7 @@ from scraper.abort import set_abort
from scraper.progress import get_progress
# UI LOGS (GLOBAL — no book_id)
-from scraper.ui_log import get_ui_logs
+from scraper.ui_log import get_ui_logs, reset_ui_logs # <-- ADDED
from celery.result import AsyncResult
@@ -58,6 +58,11 @@ def start_scraping():
if not url:
return render_template("result.html", error="Geen URL opgegeven.")
+ # ---------------------------------------------------------
+ # NEW: Clear UI log buffer when starting a new scrape
+ # ---------------------------------------------------------
+ reset_ui_logs()
+
log_debug(f"[WEB] Scraping via Celery: {url}")
async_result = celery_app.send_task(
@@ -70,11 +75,19 @@ def start_scraping():
"result.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
- # voor result.html cover rendering
book_title=None,
)
+# =====================================================
+# CLEAR UI LOGS MANUALLY (NEW)
+# =====================================================
+@app.route("/clear-logs", methods=["POST"])
+def clear_logs():
+ reset_ui_logs()
+ return jsonify({"status": "ok", "message": "UI logs cleared"})
+
+
# =====================================================
# ABORT (per book_id)
# =====================================================
diff --git a/bookscraper/scraper/replacements/encoding.txt b/bookscraper/scraper/replacements/encoding.txt
new file mode 100644
index 0000000..6343bfc
--- /dev/null
+++ b/bookscraper/scraper/replacements/encoding.txt
@@ -0,0 +1,37 @@
+#scraper/replacements/encoding.txt
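+# Literal key=value replacements (no regex), applied by scraper.utils.clean_text.
+# \uXXXX notation is decoded to the real character when the files are loaded.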
+# --- fix common encoding artifacts ---
+# IDEOGRAPHIC SPACE (U+3000) → empty
+\u3000=
+# non-breaking space (U+00A0) → empty
+\u00A0=
+
+# full-width punctuation
+,=,
+。=.
+!=!
+?=?
+;=;
+:=:
+(=(
+)=)
+【=[
+】=]
+《=<
+》=>
+
+# hyphen variants
+–=-
+—=-
+―=-
+\u3000=
+\u00A0=
+ÃÂÃÂ =
+ÃÂ =
+â€”=—
+â€œ="
+â€="
+â€™='
+â€¦=…
+â€¢=*
+â€²=
+â=
+Â =
diff --git a/bookscraper/scraper/replacements/html.txt b/bookscraper/scraper/replacements/html.txt
new file mode 100644
index 0000000..57dd8e8
--- /dev/null
+++ b/bookscraper/scraper/replacements/html.txt
@@ -0,0 +1,27 @@
+#scraper/replacements/html.txt
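+# key=value pairs; an empty value deletes the key text; \n in a value becomes a newline.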
+<br>=\n
+<br/>=\n
+<br />=\n
+&nbsp;=
+&nbsp;&nbsp;=
+&nbsp;&nbsp;&nbsp;=
+&ensp;=
+&emsp;=
+&thinsp;=
+“="
+”="
+‘='
+’='
+&lt;=<
+&gt;=>
+©=
+®=
+™=
+fontbigbigbig=
+fontbigbig=
+font1=
+font2=
+font3=
+strongstrong=
+divdiv=
+spanspan=
diff --git a/bookscraper/scraper/replacements/junk.txt b/bookscraper/scraper/replacements/junk.txt
new file mode 100644
index 0000000..b5fc95b
--- /dev/null
+++ b/bookscraper/scraper/replacements/junk.txt
@@ -0,0 +1,77 @@
+#scraper/replacements/junk.txt
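+# Each key below is removed verbatim from chapter text (literal substring match, no regex).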
+# --- Navigation ---
+上一章=
+下一章=
+上一頁=
+下一頁=
+返回顶部=
+返回目录=
+返回书页=
+章节目录=
+章节列表=
+快捷键=
+(快捷键 ←)=
+(快捷键 →)=
+(快捷键)=
+(快捷键 ←)=
+(快捷键 →)=
+上一页=
+下一页=
+手机阅读=
+返回=
+上一页阅读=
+下一页阅读=
+
+# --- Booksite footer disclaimers ---
+重要声明=
+所有的文字=
+均由网友发表=
+均由网友上传=
+本站立场无关=
+阅读更多小说=
+返回飘天文学网=
+小说阅读网=
+最新章节请返回=
+永久地址=
+All rights reserved=
+Copyright=
+飘天文学=
+
+# --- Piaotia specific ---
+请记住本书域名=
+请收藏本书=
+加入书签=
+加入书架=
+收藏本书=
+推荐本书=
+本章未完=
+请稍后=
+最新网址=
+小说网=
+小说阅读=
+将本书加入书架=
+章节出错=
+点此举报=
+举报原因=
+
+# --- Ads / QR / watermark ---
+关注公众号=
+微信扫一扫=
+扫码阅读=
+二维码=
+QQ交流群=
+加QQ群=
+广告=
+广告位=
+sponsor=
+sponsored=
+ADVERTISEMENT=
+Advertisment=
+Adblock=
+
+# --- Mode / UI related ---
+选择背景颜色=
+选择字体大小=
+繁體中文=
+模式选择=
+阅读模式=
diff --git a/bookscraper/scraper/tasks/audio_tasks.py b/bookscraper/scraper/tasks/audio_tasks.py
index c1a3ff0..f735516 100644
--- a/bookscraper/scraper/tasks/audio_tasks.py
+++ b/bookscraper/scraper/tasks/audio_tasks.py
@@ -98,7 +98,7 @@ def generate_audio(
# ============================================================
container_path = chapter_text
- log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
+ # log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
# 1) Strip container prefix to get relative path: BOOK/VOLUME/FILE
if container_path.startswith(CONTAINER_PREFIX):
@@ -120,7 +120,7 @@ def generate_audio(
# 2) Construct real host path
host_path = os.path.join(HOST_PATH, relative_path)
- log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
+ # log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
# ============================================================
# PREPARE OUTPUT DIR (always correct)
@@ -132,7 +132,7 @@ def generate_audio(
safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4a")
- log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
+ # log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
if os.path.exists(audio_file):
log(f"[AUDIO] Skip CH{chapter_number} → already exists")
diff --git a/bookscraper/scraper/tasks/parse_tasks.py b/bookscraper/scraper/tasks/parse_tasks.py
index ddea90e..52066f9 100644
--- a/bookscraper/scraper/tasks/parse_tasks.py
+++ b/bookscraper/scraper/tasks/parse_tasks.py
@@ -1,50 +1,35 @@
# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
-#
-# Abort Behavior:
-# - parse MUST ALWAYS RUN once download has started
-# - even if the user triggers abort afterwards
-# - (abort only prevents new chapters from starting)
-#
-# Logging:
-# - Same unified log_msg(book_id, message) as download_tasks
-# - publisher.log → console
-# - ui_log.push_ui → GUI
+# Enhanced version: Piaotia H1→content extractor + clean pipeline
+# NO HARDCODED REPLACEMENTS — everything comes from replacement files
# =========================================================
from celery_app import celery_app
from bs4 import BeautifulSoup
-from scraper.utils import clean_text, load_replacements
+from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger
-print(">>> [IMPORT] parse_tasks.py loaded")
+print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
- """
- Parse raw HTML returned by download_chapter into clean chapter text.
- """
- # Extract book_id stored by download_tasks
book_id = download_result.get("book_id", "NOBOOK")
# ------------------------------------------------------------
- # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
+ # SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------
if download_result.get("skipped"):
chapter = download_result.get("chapter")
log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
-
- # Ensure book_id is present in the returned dict
download_result["book_id"] = book_id
-
return download_result
# ------------------------------------------------------------
- # 2) Normal Parsing
+ # NORMAL PARSE
# ------------------------------------------------------------
chapter_num = download_result["chapter"]
chapter_url = download_result["url"]
@@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict):
soup = BeautifulSoup(html, "lxml")
+ # ------------------------------------------------------------
+ # STRICT SELECTORS (direct content blocks)
+ # ------------------------------------------------------------
selectors = [
"#content",
- ".content",
"div#content",
+ ".content",
"div.content",
+ "#chaptercontent",
"div#chaptercontent",
"#chapterContent",
".read-content",
+ "div.read-content",
]
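+    # The first selector that returns a node is used; otherwise fall back to
+    # the Piaotia <h1> extraction below.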
node = None
@@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict):
node = tmp
break
- raw = node.get_text() if node else soup.get_text()
+ # ------------------------------------------------------------
+ # PIAOTIA FALLBACK:
+    #   Extract the content between the <h1> chapter title and the
+    #   "bottomlink" navigation block.
+ # ------------------------------------------------------------
+ raw = None
+ if node is None:
+ h1 = soup.find("h1")
+ if h1:
+ content_parts = []
+ for sib in h1.next_siblings:
+
+ # stop at bottom navigation/footer block
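+                # NavigableString siblings have no .get(); the getattr fallback returns None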
+ sib_class = getattr(sib, "get", lambda *_: None)("class")
+ if sib_class and (
+ "bottomlink" in sib_class or sib_class == "bottomlink"
+ ):
+ break
+
+ # ignore typical noise containers
+ if getattr(sib, "name", None) in ["script", "style", "center"]:
+ continue
+
+ if hasattr(sib, "get_text"):
+ content_parts.append(sib.get_text(separator="\n"))
+ else:
+ content_parts.append(str(sib))
+
+ raw = "\n".join(content_parts)
# ------------------------------------------------------------
- # Apply global replacements
+ # FINAL FALLBACK
# ------------------------------------------------------------
- REPL = load_replacements()
- text = clean_text(raw, REPL)
+ if raw is None:
+ if node:
+ raw = node.get_text(separator="\n")
+ else:
+ # drop scripts & styles
+ for tag in soup(["script", "style", "noscript"]):
+ tag.decompose()
+
+ raw = soup.get_text(separator="\n")
# ------------------------------------------------------------
- # Chapter 1 gets full header
+ # MULTIPASS CLEANING via replacement files ONLY
+ # ------------------------------------------------------------
+ REPL = load_all_replacements()
+
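+    # Several passes so that one replacement can expose text for another
+    # (e.g. a repaired encoding artifact revealing a junk phrase).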
+ text = raw
+ for _ in range(5): # like the C# CleanText loop
+ text = clean_text(text, REPL)
+
+ # ------------------------------------------------------------
+ # Collapse excessive empty lines
+ # ------------------------------------------------------------
+ cleaned = []
+ prev_blank = False
+
+ for line in text.split("\n"):
+ stripped = line.rstrip()
+ if stripped == "":
+ if prev_blank:
+ continue
+ prev_blank = True
+ cleaned.append("")
+ else:
+ prev_blank = False
+ cleaned.append(stripped)
+
+ text = "\n".join(cleaned)
+
+ # ------------------------------------------------------------
+ # Add header to chapter 1
# ------------------------------------------------------------
if chapter_num == 1:
book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
-
header = (
f"{meta.get('title','')}\n"
f"Author: {meta.get('author','')}\n"
diff --git a/bookscraper/scraper/tasks/scraping.py b/bookscraper/scraper/tasks/scraping.py
index 8b0b9fe..0694089 100644
--- a/bookscraper/scraper/tasks/scraping.py
+++ b/bookscraper/scraper/tasks/scraping.py
@@ -12,6 +12,7 @@ import redis
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps
+from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
print(">>> [IMPORT] scraping.py loaded")
@@ -24,6 +25,11 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def start_scrape_book(self, url: str):
"""Scrapes metadata + chapters and prepares download tracking."""
+ # ------------------------------------------------------------
+ # NEW: clear UI log buffer at start of new run
+ # ------------------------------------------------------------
+ reset_ui_logs()
+
log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------
@@ -50,10 +56,10 @@ def start_scrape_book(self, url: str):
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------
- # BOOK RUN ID (CHANGED: use book title instead of UUID)
+ # BOOK RUN ID (using title as ID)
# ------------------------------------------------------------
title = result.get("title") or "UnknownBook"
- book_id = title # ← your requirement: title is unique and consistent
+ book_id = title # user requirement
result["book_id"] = book_id
@@ -74,7 +80,6 @@ def start_scrape_book(self, url: str):
# ------------------------------------------------------------
# DISPATCH DOWNLOAD CONTROLLER
# ------------------------------------------------------------
- # controller task signature = launch_downloads(book_id, scrape_result)
celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],
diff --git a/bookscraper/scraper/ui_log.py b/bookscraper/scraper/ui_log.py
index 18db819..312e20e 100644
--- a/bookscraper/scraper/ui_log.py
+++ b/bookscraper/scraper/ui_log.py
@@ -34,3 +34,13 @@ def get_ui_logs(limit: int = None):
limit = LOG_BUFFER_SIZE
return r.lrange(UI_LOG_KEY, -limit, -1)
+
+
+def reset_ui_logs():
+ """
+ Clear the entire UI log buffer.
+ Used by:
+ - Clear button in GUI
+ - Auto-clear when new book scraping starts
+ """
+ r.delete(UI_LOG_KEY)
diff --git a/bookscraper/scraper/utils.py b/bookscraper/scraper/utils.py
index 08e45f0..0bdd2f9 100644
--- a/bookscraper/scraper/utils.py
+++ b/bookscraper/scraper/utils.py
@@ -1,19 +1,30 @@
+# ============================================================
+# File: scraper/utils.py
+# Purpose:
+# Centralised replacement loader + text cleaner
+# using 3 replacement categories:
+# 1) HTML replacements
+# 2) Encoding replacements
+# 3) Junk-term replacements (generic "noise" phrases)
+#
+# Nothing in this file contains hardcoded cleanup rules.
+# EVERYTHING comes from replacement files ONLY.
+# ============================================================
+
import os
import re
from pathlib import Path
# ------------------------------------------------------------
-# Load replacements from text_replacements.txt (optional file)
+# Generic key=value replacement loader
# ------------------------------------------------------------
-def load_replacements(filepath="text_replacements.txt") -> dict:
+def load_replacement_file(path: Path) -> dict:
"""
- Load key=value style replacements.
- Empty or missing file → return {}.
- Lines starting with '#' are ignored.
+ Loads key=value pairs from a file.
+ Missing file → {}.
+ Ignores empty lines and lines starting with '#'.
"""
- path = Path(filepath)
-
if not path.exists():
return {}
@@ -22,8 +33,10 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
+
if not line or line.startswith("#"):
continue
+
if "=" in line:
key, val = line.split("=", 1)
repl[key.strip()] = val.strip()
@@ -32,36 +45,69 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
# ------------------------------------------------------------
-# Clean extracted HTML text
+# Load all categories (HTML → encoding → junk)
+# Order matters: later overrides earlier.
+# ------------------------------------------------------------
+def load_all_replacements() -> dict:
+    root = Path(__file__).parent / "replacements"
+
+    html_file = root / "html.txt"
+    enc_file = root / "encoding.txt"
+    junk_file = root / "junk.txt"
+
+    repl = {}
+    repl.update(load_replacement_file(html_file))
+    repl.update(load_replacement_file(enc_file))
+    repl.update(load_replacement_file(junk_file))
+
+    # Decode \n / \uXXXX notation so entries such as "\u3000=" and "<br>=\n"
+    # match and produce the real characters they describe.
+    return {_decode_escapes(k): _decode_escapes(v) for k, v in repl.items()}
+
+
+def _decode_escapes(s: str) -> str:
+    # Only \n and \uXXXX are expanded; all other text stays literal.
+    s = s.replace("\\n", "\n")
+    return re.sub(r"\\u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), s)
+
+
+# ------------------------------------------------------------
+# Legacy compatibility wrapper
+# Many modules still import: from scraper.utils import load_replacements
+# This wrapper keeps everything working.
# ------------------------------------------------------------
-def clean_text(raw: str, repl_dict: dict = None) -> str:
+def load_replacements(filepath=None) -> dict:
"""
- Normalize whitespace, remove junk, apply replacements.
- repl_dict is optional → {} if none provided.
+ Backward-compatible alias.
+ - If called with no filepath → return merged replacements.
+ - If called with a filepath → load that one file only.
"""
- if repl_dict is None:
- repl_dict = {}
+ if filepath is None:
+ return load_all_replacements()
+ else:
+ # Allow explicit loading of a single file
+ path = Path(filepath)
+ return load_replacement_file(path)
- txt = raw.replace("\r", "") # normalize CRLF
- # Collapse 3+ blank lines → max 1 empty line
- txt = re.sub(r"\n{3,}", "\n\n", txt)
+# ------------------------------------------------------------
+# Clean text using loaded replacements
+# ------------------------------------------------------------
+def clean_text(raw: str, repl: dict) -> str:
+ """
+ Apply replacements and basic whitespace normalisation.
+ No hardcoded rules live here.
+ """
+ if not raw:
+ return ""
- # Apply replacements
- for key, val in repl_dict.items():
+ txt = raw.replace("\r", "")
+
+ # Apply loaded replacements
+ for key, val in repl.items():
txt = txt.replace(key, val)
+ # Collapse 3+ blank lines → max 1
+ txt = re.sub(r"\n{3,}", "\n\n", txt)
+
return txt.strip()
# ------------------------------------------------------------
-# Determine save path for a chapter (shared by download & save)
+# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
- """
- Returns the filesystem path where this chapter should be saved.
- Formats the filename as 0001.txt, 0002.txt, ...
- """
-
filename = f"{chapter_num:04d}.txt"
return os.path.join(base_path, filename)
diff --git a/bookscraper/templates/result.html b/bookscraper/templates/result.html
index 81a12c8..57aabf9 100644
--- a/bookscraper/templates/result.html
+++ b/bookscraper/templates/result.html
@@ -31,6 +31,21 @@
border-radius: 6px;
font-size: 13px;
}
+
+ /* NEW: Clear button */
+ #clearLogBtn {
+ margin-bottom: 10px;
+ padding: 8px 16px;
+ background: #777;
+ color: white;
+ border: none;
+ border-radius: 6px;
+ cursor: pointer;
+ }
+ #clearLogBtn:hover {
+ background: #555;
+ }
+
#abortBtn {
padding: 12px 20px;
background: #d9534f;
@@ -68,7 +83,7 @@
{{ message }}
{% endif %}
-
+
{% if book_title %}
Cover:
@@ -103,7 +118,11 @@
-    Live log:
+    Live log:
+
+    <!-- NEW: button to clear the UI log buffer -->
+    <button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
+
@@ -204,6 +223,17 @@
})
.catch(() => setTimeout(pollLogs, 1500));
}
+
+ // =========================================================
+ // NEW: Clear logs button handler
+ // =========================================================
+ function clearLogs() {
+ fetch("/clear-logs", { method: "POST" })
+ .then(() => {
+ document.getElementById("logbox").innerHTML = "";
+ })
+ .catch((e) => console.error("Clear logs failed:", e));
+ }