text parsing optimization

feat/redis-state-model
peter.fong 2 weeks ago
parent 1a720fbea0
commit dff30e5768

@ -17,7 +17,7 @@ from scraper.abort import set_abort
from scraper.progress import get_progress
# UI LOGS (GLOBAL — no book_id)
from scraper.ui_log import get_ui_logs
from scraper.ui_log import get_ui_logs, reset_ui_logs # <-- ADDED
from celery.result import AsyncResult
@ -58,6 +58,11 @@ def start_scraping():
if not url:
return render_template("result.html", error="Geen URL opgegeven.")
# ---------------------------------------------------------
# NEW: Clear UI log buffer when starting a new scrape
# ---------------------------------------------------------
reset_ui_logs()
log_debug(f"[WEB] Scraping via Celery: {url}")
async_result = celery_app.send_task(
@ -70,11 +75,19 @@ def start_scraping():
"result.html",
message="Scraping gestart.",
scraping_task_id=async_result.id,
# for result.html cover rendering
book_title=None,
)
# =====================================================
# CLEAR UI LOGS MANUALLY (NEW)
# =====================================================
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
reset_ui_logs()
return jsonify({"status": "ok", "message": "UI logs cleared"})
# =====================================================
# ABORT (per book_id)
# =====================================================

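A quick sketch of exercising the new /clear-logs route through Flask's test client; the import path of the Flask app object is assumed (the hunks above do not show the module name), and a running Redis instance is needed because reset_ui_logs() deletes the UI log key.

# assumes the Flask app object is importable as `app` from web.app (hypothetical path)
from web.app import app

def test_clear_logs():
    client = app.test_client()
    resp = client.post("/clear-logs")     # triggers reset_ui_logs() server-side
    assert resp.status_code == 200
    assert resp.get_json() == {"status": "ok", "message": "UI logs cleared"}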
@ -0,0 +1,37 @@
#scraper/replacements/encoding.txt
# --- fix common encoding artifacts ---
# IDEOGRAPHIC SPACE → empty
\u3000=
# non-breaking space → empty
\u00A0=
# full-width punctuation
，=,
。=.
！=!
？=?
；=;
：=:
（=(
）=)
【=[
】=]
《=<
》=>
# hyphen variants
–=-
—=-
―=-
\u3000=
\u00A0=
 =
 =
=
—=—
“="
”="
’='
…=…
•=*
▁=
▲=
 =

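These replacement files are plain key=value lines read by load_replacement_file() further down in this commit. A small sketch of what loading encoding.txt yields; note, as an observation rather than shown behaviour, that keys written as \uXXXX escape sequences are stored as the literal backslash text unless they are decoded explicitly.

from pathlib import Path
import codecs
from scraper.utils import load_replacement_file

repl = load_replacement_file(Path("scraper/replacements/encoding.txt"))
# "。=."  ->  repl["。"] == "."
# "【=["  ->  repl["【"] == "["
# Optional normalisation (assumption, not part of the loader shown below):
# turn literal "\u3000"-style keys into the real characters they name.
decoded = {
    (codecs.decode(k, "unicode_escape") if k.startswith("\\u") else k): v
    for k, v in repl.items()
}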
@ -0,0 +1,27 @@
#scraper/replacements/html.txt
<br>=\n
<br/>=\n
<br />=\n
&nbsp;=
&nbsp&nbsp=
&nbsp&nbsp&nbsp=
&emsp;=
&ensp;=
&thinsp;=
&ldquo;="
&rdquo;="
&lsquo;='
&rsquo;='
&lt;=<
&gt;=>
&copy;=
&reg;=
&trade;=
fontbigbigbig=
fontbigbig=
font1=
font2=
font3=
strongstrong=
divdiv=
spanspan=

@ -0,0 +1,77 @@
#scraper/replacements/junk.txt
# --- Navigation ---
上一章=
下一章=
上一頁=
下一頁=
返回顶部=
返回目录=
返回书页=
章节目录=
章节列表=
快捷键=
(快捷键 ←)=
(快捷键 →)=
(快捷键)=
(快捷键 ←)=
(快捷键 →)=
上一页=
下一页=
手机阅读=
返回=
上一页阅读=
下一页阅读=
# --- Booksite footer disclaimers ---
重要声明=
所有的文字=
均由网友发表=
均由网友上传=
本站立场无关=
阅读更多小说=
返回飘天文学网=
小说阅读网=
最新章节请返回=
永久地址=
All rights reserved=
Copyright=
飘天文学=
# --- Piaotia specific ---
请记住本书域名=
请收藏本书=
加入书签=
加入书架=
收藏本书=
推荐本书=
本章未完=
请稍后=
最新网址=
小说网=
小说阅读=
将本书加入书架=
章节出错=
点此举报=
举报原因=
# --- Ads / QR / watermark ---
关注公众号=
微信扫一扫=
扫码阅读=
二维码=
QQ交流群=
加QQ群=
广告=
广告位=
sponsor=
sponsored=
ADVERTISEMENT=
Advertisment=
Adblock=
# --- Mode / UI related ---
选择背景颜色=
选择字体大小=
繁體中文=
模式选择=
阅读模式=

@ -98,7 +98,7 @@ def generate_audio(
# ============================================================
container_path = chapter_text
log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
# log(f"[AUDIO] CH{chapter_number}: container_path={container_path}")
# 1) Strip container prefix to get relative path: BOOK/VOLUME/FILE
if container_path.startswith(CONTAINER_PREFIX):
@ -120,7 +120,7 @@ def generate_audio(
# 2) Construct real host path
host_path = os.path.join(HOST_PATH, relative_path)
log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
# log(f"[AUDIO] CH{chapter_number}: resolved_host_path={host_path}")
# ============================================================
# PREPARE OUTPUT DIR (always correct)
@ -132,7 +132,7 @@ def generate_audio(
safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4a")
log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
# log(f"[AUDIO] CH{chapter_number}: output_file={audio_file}")
if os.path.exists(audio_file):
log(f"[AUDIO] Skip CH{chapter_number} → already exists")

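For readers of the hunks above, the container→host path mapping in one standalone sketch; the CONTAINER_PREFIX and HOST_PATH values are invented, only the string handling mirrors generate_audio().

import os

CONTAINER_PREFIX = "/data/books/"   # hypothetical value
HOST_PATH = "/srv/library"          # hypothetical value

def resolve_audio_paths(chapter_text: str, chapter_number: int, base_dir: str):
    container_path = chapter_text
    # 1) strip the container prefix to get the relative BOOK/VOLUME/FILE path
    relative_path = (
        container_path[len(CONTAINER_PREFIX):]
        if container_path.startswith(CONTAINER_PREFIX)
        else container_path
    )
    # 2) rebuild the real host path
    host_path = os.path.join(HOST_PATH, relative_path)
    # 3) zero-padded output name: 0001.m4a, 0002.m4a, ...
    audio_file = os.path.join(base_dir, f"{chapter_number:04d}.m4a")
    return host_path, audio_file

# resolve_audio_paths("/data/books/MyBook/Vol1/0001.txt", 1, "/srv/out")
# -> ("/srv/library/MyBook/Vol1/0001.txt", "/srv/out/0001.m4a")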
@ -1,50 +1,35 @@
# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#
# Abort Behavior:
# - parse MUST ALWAYS RUN once download has started
# - even if the user triggers abort afterwards
# - (abort only prevents new chapters from starting)
#
# Logging:
# - Same unified log_msg(book_id, message) as download_tasks
# - publisher.log → console
# - ui_log.push_ui → GUI
# Enhanced version: Piaotia H1→content extractor + clean pipeline
# NO HARDCODED REPLACEMENTS — everything comes from replacement files
# =========================================================
from celery_app import celery_app
from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_replacements
from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger
print(">>> [IMPORT] parse_tasks.py loaded")
print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
"""
Parse raw HTML returned by download_chapter into clean chapter text.
"""
# Extract book_id stored by download_tasks
book_id = download_result.get("book_id", "NOBOOK")
# ------------------------------------------------------------
# 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
# SKIPPED DOWNLOAD → SKIP PARSE
# ------------------------------------------------------------
if download_result.get("skipped"):
chapter = download_result.get("chapter")
log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
# Ensure book_id is present in the returned dict
download_result["book_id"] = book_id
return download_result
# ------------------------------------------------------------
# 2) Normal Parsing
# NORMAL PARSE
# ------------------------------------------------------------
chapter_num = download_result["chapter"]
chapter_url = download_result["url"]
@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict):
soup = BeautifulSoup(html, "lxml")
# ------------------------------------------------------------
# STRICT SELECTORS (direct content blocks)
# ------------------------------------------------------------
selectors = [
"#content",
".content",
"div#content",
".content",
"div.content",
"#chaptercontent",
"div#chaptercontent",
"#chapterContent",
".read-content",
"div.read-content",
]
node = None
@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict):
node = tmp
break
raw = node.get_text() if node else soup.get_text()
# ------------------------------------------------------------
# PIAOTIA FALLBACK:
# Extract content between <H1> and the "bottomlink" block.
# ------------------------------------------------------------
raw = None
if node is None:
h1 = soup.find("h1")
if h1:
content_parts = []
for sib in h1.next_siblings:
# stop at bottom navigation/footer block
sib_class = getattr(sib, "get", lambda *_: None)("class")
if sib_class and (
"bottomlink" in sib_class or sib_class == "bottomlink"
):
break
# ignore typical noise containers
if getattr(sib, "name", None) in ["script", "style", "center"]:
continue
if hasattr(sib, "get_text"):
content_parts.append(sib.get_text(separator="\n"))
else:
content_parts.append(str(sib))
raw = "\n".join(content_parts)
# ------------------------------------------------------------
# Apply global replacements
# FINAL FALLBACK
# ------------------------------------------------------------
REPL = load_replacements()
text = clean_text(raw, REPL)
if raw is None:
if node:
raw = node.get_text(separator="\n")
else:
# drop scripts & styles
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
raw = soup.get_text(separator="\n")
# ------------------------------------------------------------
# Chapter 1 gets full header
# MULTIPASS CLEANING via replacement files ONLY
# ------------------------------------------------------------
REPL = load_all_replacements()
text = raw
for _ in range(5): # like the C# CleanText loop
text = clean_text(text, REPL)
# ------------------------------------------------------------
# Collapse excessive empty lines
# ------------------------------------------------------------
cleaned = []
prev_blank = False
for line in text.split("\n"):
stripped = line.rstrip()
if stripped == "":
if prev_blank:
continue
prev_blank = True
cleaned.append("")
else:
prev_blank = False
cleaned.append(stripped)
text = "\n".join(cleaned)
# ------------------------------------------------------------
# Add header to chapter 1
# ------------------------------------------------------------
if chapter_num == 1:
book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
header = (
f"{meta.get('title','')}\n"
f"Author: {meta.get('author','')}\n"

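To make the new Piaotia H1→bottomlink fallback concrete, a self-contained sketch on an invented HTML snippet; the walk over next_siblings is the same as in parse_chapter() above.

from bs4 import BeautifulSoup

html = """
<h1>第一章</h1>
<script>var ad = 1;</script>
<p>First paragraph of the chapter.</p>
<p>Second paragraph of the chapter.</p>
<div class="bottomlink"><a href="#">下一章</a></div>
"""

soup = BeautifulSoup(html, "lxml")
h1 = soup.find("h1")
content_parts = []
for sib in h1.next_siblings:
    sib_class = getattr(sib, "get", lambda *_: None)("class")
    if sib_class and "bottomlink" in sib_class:
        break                                    # stop at the footer/nav block
    if getattr(sib, "name", None) in ["script", "style", "center"]:
        continue                                 # skip noise containers
    if hasattr(sib, "get_text"):
        content_parts.append(sib.get_text(separator="\n"))
    else:
        content_parts.append(str(sib))
raw = "\n".join(content_parts)
# raw now holds only the two paragraphs; the ad script and nav block are gone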
@ -12,6 +12,7 @@ import redis
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
print(">>> [IMPORT] scraping.py loaded")
@ -24,6 +25,11 @@ r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def start_scrape_book(self, url: str):
"""Scrapes metadata + chapters and prepares download tracking."""
# ------------------------------------------------------------
# NEW: clear UI log buffer at start of new run
# ------------------------------------------------------------
reset_ui_logs()
log(f"[SCRAPING] Start scraping for: {url}")
# ------------------------------------------------------------
@ -50,10 +56,10 @@ def start_scrape_book(self, url: str):
log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
# ------------------------------------------------------------
# BOOK RUN ID (CHANGED: use book title instead of UUID)
# BOOK RUN ID (using title as ID)
# ------------------------------------------------------------
title = result.get("title") or "UnknownBook"
book_id = title # ← your requirement: title is unique and consistent
book_id = title # user requirement
result["book_id"] = book_id
@ -74,7 +80,6 @@ def start_scrape_book(self, url: str):
# ------------------------------------------------------------
# DISPATCH DOWNLOAD CONTROLLER
# ------------------------------------------------------------
# controller task signature = launch_downloads(book_id, scrape_result)
celery_app.send_task(
"scraper.tasks.controller_tasks.launch_downloads",
args=[book_id, result],

@ -34,3 +34,13 @@ def get_ui_logs(limit: int = None):
limit = LOG_BUFFER_SIZE
return r.lrange(UI_LOG_KEY, -limit, -1)
def reset_ui_logs():
"""
Clear the entire UI log buffer.
Used by:
- Clear button in GUI
- Auto-clear when new book scraping starts
"""
r.delete(UI_LOG_KEY)

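The UI log buffer is just a Redis list; a short lifecycle sketch below. The UI_LOG_KEY and LOG_BUFFER_SIZE values are assumed, and push_ui() is not part of this commit, so its RPUSH + LTRIM shape here is a guess at the producer side.

import redis

r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)
UI_LOG_KEY = "ui_log"        # assumed key name
LOG_BUFFER_SIZE = 500        # assumed cap

def push_ui(message: str) -> None:
    r.rpush(UI_LOG_KEY, message)                 # append at the tail
    r.ltrim(UI_LOG_KEY, -LOG_BUFFER_SIZE, -1)    # keep only the newest entries

push_ui("[WEB] Scraping via Celery: http://example.com/book/1")
print(r.lrange(UI_LOG_KEY, -10, -1))             # what get_ui_logs(10) returns
r.delete(UI_LOG_KEY)                             # what reset_ui_logs() does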
@ -1,19 +1,30 @@
# ============================================================
# File: scraper/utils.py
# Purpose:
# Centralised replacement loader + text cleaner
# using 3 replacement categories:
# 1) HTML replacements
# 2) Encoding replacements
# 3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================
import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
def load_replacement_file(path: Path) -> dict:
"""
Load key=value style replacements.
Empty or missing file return {}.
Lines starting with '#' are ignored.
Loads key=value pairs from a file.
Returns {} if the file is missing.
Ignores empty lines and lines starting with '#'.
"""
path = Path(filepath)
if not path.exists():
return {}
@ -22,8 +33,10 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, val = line.split("=", 1)
repl[key.strip()] = val.strip()
@ -32,36 +45,69 @@ def load_replacements(filepath="text_replacements.txt") -> dict:
# ------------------------------------------------------------
# Clean extracted HTML text
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
root = Path(__file__).parent / "replacements"
html_file = root / "html.txt"
enc_file = root / "encoding.txt"
junk_file = root / "junk.txt"
repl = {}
repl.update(load_replacement_file(html_file))
repl.update(load_replacement_file(enc_file))
repl.update(load_replacement_file(junk_file))
return repl
# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
def load_replacements(filepath=None) -> dict:
"""
Normalize whitespace, remove junk, apply replacements.
repl_dict is optional {} if none provided.
Backward-compatible alias.
- If called with no filepath, return the merged replacements.
- If called with a filepath, load only that single file.
"""
if repl_dict is None:
repl_dict = {}
if filepath is None:
return load_all_replacements()
else:
# Allow explicit loading of a single file
path = Path(filepath)
return load_replacement_file(path)
txt = raw.replace("\r", "") # normalize CRLF
# Collapse 3+ blank lines → max 1 empty line
txt = re.sub(r"\n{3,}", "\n\n", txt)
# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
"""
Apply replacements and basic whitespace normalisation.
No hardcoded rules live here.
"""
if not raw:
return ""
# Apply replacements
for key, val in repl_dict.items():
txt = raw.replace("\r", "")
# Apply loaded replacements
for key, val in repl.items():
txt = txt.replace(key, val)
# Collapse 3+ blank lines → max 1
txt = re.sub(r"\n{3,}", "\n\n", txt)
return txt.strip()
# ------------------------------------------------------------
# Determine save path for a chapter (shared by download & save)
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
"""
Returns the filesystem path where this chapter should be saved.
Formats the filename as 0001.txt, 0002.txt, ...
"""
filename = f"{chapter_num:04d}.txt"
return os.path.join(base_path, filename)

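An end-to-end usage sketch of the reworked loader and cleaner, mirroring the multipass call pattern in parse_tasks.py; the sample input string is invented and the exact output depends on the contents of the three replacement files.

from scraper.utils import load_all_replacements, clean_text, get_save_path

REPL = load_all_replacements()      # html.txt + encoding.txt + junk.txt, merged in that order

raw = "第一章<br/>&nbsp;正文……\n\n\n\n上一章 下一章"
text = raw
for _ in range(5):                  # same loop as parse_chapter()
    text = clean_text(text, REPL)

print(text)                                  # junk keys replaced, blank runs collapsed
print(get_save_path(1, "/books/MyBook"))     # -> /books/MyBook/0001.txt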
@ -31,6 +31,21 @@
border-radius: 6px;
font-size: 13px;
}
/* NEW: Clear button */
#clearLogBtn {
margin-bottom: 10px;
padding: 8px 16px;
background: #777;
color: white;
border: none;
border-radius: 6px;
cursor: pointer;
}
#clearLogBtn:hover {
background: #555;
}
#abortBtn {
padding: 12px 20px;
background: #d9534f;
@ -68,7 +83,7 @@
<div class="box">{{ message }}</div>
{% endif %}
<!-- COVER WEERGAVE (toegevoegd) -->
<!-- COVER -->
{% if book_title %}
<div class="box">
<strong>Cover:</strong><br />
@ -103,7 +118,11 @@
</div>
<div class="box">
<strong>Live log:</strong>
<strong>Live log:</strong><br />
<!-- NEW BUTTON -->
<button id="clearLogBtn" onclick="clearLogs()">Clear logs</button>
<div id="logbox" class="logbox"></div>
</div>
@ -204,6 +223,17 @@
})
.catch(() => setTimeout(pollLogs, 1500));
}
// =========================================================
// NEW: Clear logs button handler
// =========================================================
function clearLogs() {
fetch("/clear-logs", { method: "POST" })
.then(() => {
document.getElementById("logbox").innerHTML = "";
})
.catch((e) => console.error("Clear logs failed:", e));
}
</script>
</body>
</html>
