@@ -1,50 +1,35 @@
 # =========================================================
 # File: scraper/tasks/parse_tasks.py
 # Purpose: Parse downloaded HTML into clean chapter text.
-#
-# Abort Behavior:
-# - parse MUST ALWAYS RUN once download has started
-# - even if the user triggers abort afterwards
-# - (abort only prevents new chapters from starting)
-#
-# Logging:
-# - Same unified log_msg(book_id, message) as download_tasks
-# - publisher.log → console
-# - ui_log.push_ui → GUI
+# Enhanced version: Piaotia H1→content extractor + clean pipeline
+# NO HARDCODED REPLACEMENTS — everything comes from replacement files
 # =========================================================
 
 from celery_app import celery_app
 from bs4 import BeautifulSoup
 
-from scraper.utils import clean_text, load_replacements
+from scraper.utils import clean_text, load_all_replacements
 from scraper.tasks.download_tasks import log_msg  # unified logger
 
-print(">>> [IMPORT] parse_tasks.py loaded")
+print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
 
 
 @celery_app.task(bind=True, queue="parse", ignore_result=False)
 def parse_chapter(self, download_result: dict, meta: dict):
-    """
-    Parse raw HTML returned by download_chapter into clean chapter text.
-    """
 
-    # Extract book_id stored by download_tasks
     book_id = download_result.get("book_id", "NOBOOK")
 
     # ------------------------------------------------------------
-    # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
+    # SKIPPED DOWNLOAD → SKIP PARSE
    # ------------------------------------------------------------
     if download_result.get("skipped"):
         chapter = download_result.get("chapter")
         log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
 
-        # Ensure book_id is present in the returned dict
         download_result["book_id"] = book_id
 
         return download_result
 
     # ------------------------------------------------------------
-    # 2) Normal Parsing
+    # NORMAL PARSE
     # ------------------------------------------------------------
     chapter_num = download_result["chapter"]
     chapter_url = download_result["url"]
@@ -54,14 +39,19 @@ def parse_chapter(self, download_result: dict, meta: dict):
     soup = BeautifulSoup(html, "lxml")
 
+    # ------------------------------------------------------------
+    # STRICT SELECTORS (direct content blocks)
+    # ------------------------------------------------------------
     selectors = [
         "#content",
-        ".content",
         "div#content",
+        ".content",
         "div.content",
+        "#chaptercontent",
         "div#chaptercontent",
         "#chapterContent",
         ".read-content",
+        "div.read-content",
     ]
 
     node = None
@@ -71,20 +61,81 @@ def parse_chapter(self, download_result: dict, meta: dict):
             node = tmp
             break
 
-    raw = node.get_text() if node else soup.get_text()
+    # ------------------------------------------------------------
+    # PIAOTIA FALLBACK:
+    # Extract content between <H1> and the "bottomlink" block.
+    # ------------------------------------------------------------
+    raw = None
+    if node is None:
+        h1 = soup.find("h1")
+        if h1:
+            content_parts = []
+            for sib in h1.next_siblings:
+
+                # stop at bottom navigation/footer block
+                sib_class = getattr(sib, "get", lambda *_: None)("class")
+                if sib_class and (
+                    "bottomlink" in sib_class or sib_class == "bottomlink"
+                ):
+                    break
+
+                # ignore typical noise containers
+                if getattr(sib, "name", None) in ["script", "style", "center"]:
+                    continue
+
+                if hasattr(sib, "get_text"):
+                    content_parts.append(sib.get_text(separator="\n"))
+                else:
+                    content_parts.append(str(sib))
+
+            raw = "\n".join(content_parts)
+
+    # ------------------------------------------------------------
+    # FINAL FALLBACK
+    # ------------------------------------------------------------
+    if raw is None:
+        if node:
+            raw = node.get_text(separator="\n")
+        else:
+            # drop scripts & styles
+            for tag in soup(["script", "style", "noscript"]):
+                tag.decompose()
+
+            raw = soup.get_text(separator="\n")
 
     # ------------------------------------------------------------
-    # Apply global replacements
+    # MULTIPASS CLEANING via replacement files ONLY
     # ------------------------------------------------------------
-    REPL = load_replacements()
-    text = clean_text(raw, REPL)
+    REPL = load_all_replacements()
 
+    text = raw
+    for _ in range(5):  # like the C# CleanText loop
+        text = clean_text(text, REPL)
 
     # ------------------------------------------------------------
-    # Chapter 1 gets full header
+    # Collapse excessive empty lines
     # ------------------------------------------------------------
+    cleaned = []
+    prev_blank = False
+
+    for line in text.split("\n"):
+        stripped = line.rstrip()
+        if stripped == "":
+            if prev_blank:
+                continue
+            prev_blank = True
+            cleaned.append("")
+        else:
+            prev_blank = False
+            cleaned.append(stripped)
+
+    text = "\n".join(cleaned)
+
+    # ------------------------------------------------------------
+    # Add header to chapter 1
+    # ------------------------------------------------------------
     if chapter_num == 1:
         book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
 
         header = (
             f"{meta.get('title','')}\n"
             f"Author: {meta.get('author','')}\n"