# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================
from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment
from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done
print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
    """Extract chapter text from a Piaotia-style page.

    Piaotia chapter pages have no content container: the text sits as
    loose siblings between a layout <table> (right after the <h1> title)
    and the pager/footer markup. Returns the joined text, or None if the
    page does not match this layout.
    """
    h1 = soup.find("h1")
    if not h1:
        return None
    # Find the first <table> after the <h1> — the content follows it.
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break
    if not table:
        return None
    parts = []
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = sib.get_text(strip=True) if hasattr(sib, "get_text") else None
        # Stop conditions: pager comment, footer divs, legal/nav text.
        if isinstance(sib, Comment) and "翻页" in sib:  # 翻页 = "page turn" (pager)
            break
        if isinstance(sib, Comment):
            # Skip all other HTML comments so they never leak into the output
            # (a Comment is a NavigableString and would be accumulated below).
            continue
        if name == "div":
            sid = sib.get("id", "")
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break
        if text and ("重要声明" in text or "Copyright" in text):  # 重要声明 = "important notice"
            break
        if text and text.startswith(("推荐阅读", "目录", "目 录")):  # "recommended reading" / "table of contents"
            break
        if name in ("script", "style", "center"):
            continue
        # Accumulate loose strings and element text.
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)
    return "\n".join(parts).strip()
# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):
    if not payload:
        return {"skipped": True, "reason": "empty_payload"}
    # Payload v3 fields (book_idx refactor)
    book_idx = payload["book_idx"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}
    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")
    # ------------------------------------------------------------
    # DOWNLOAD SKIPPED → PARSE SKIP
    # ------------------------------------------------------------
    if payload.get("skipped"):
        log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload
    if not html:
        log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload
    log_msg(book_idx, f"[PARSE] Parsing chapter {num}")
    soup = BeautifulSoup(html, "lxml")
    # ------------------------------------------------------------
    # STRICT SELECTORS (common content containers across sources)
    # ------------------------------------------------------------
    selectors = [
        "#content",
        "div#content",
        ".content",
        "div.content",
        "#chaptercontent",
        "div#chaptercontent",
        "#chapterContent",
        ".read-content",
        "div.read-content",
    ]
    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break
    if node is not None:
        raw = node.get_text(separator="\n")
    else:
        # Strict selectors failed → try the Piaotia-specific extractor.
        raw = extract_piaotia_content(soup)
    # FINAL FALLBACK — strip noise tags and take the whole page text.
    # Note: `not raw` (rather than `raw is None`) also covers the case
    # where extraction matched the layout but yielded an empty string.
    if not raw:
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        raw = soup.get_text(separator="\n")
    # ------------------------------------------------------------
    # MULTIPASS CLEANING via replacement-block files
    # (several passes, since one replacement can expose another)
    # ------------------------------------------------------------
    REPL = load_all_replacements()
    text = raw
    for _ in range(5):
        text = clean_text(text, REPL)
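    # The fixed five passes are a heuristic bound. An equivalent fixpoint loop
    # (illustrative sketch — same clean_text/REPL as above, stops on convergence):
    #
    #   prev = None
    #   while text != prev:
    #       prev = text
    #       text = clean_text(text, REPL)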
    # ------------------------------------------------------------
    # Collapse runs of blank lines down to a single blank line
    # ------------------------------------------------------------
    cleaned = []
    prev_blank = False
    for line in text.split("\n"):
        s = line.rstrip()
        if s == "":
            if prev_blank:
                continue
            prev_blank = True
            cleaned.append("")
        else:
            prev_blank = False
            cleaned.append(s)
    text = "\n".join(cleaned)
text = f"{title}\n{text}"
# ------------------------------------------------------------
# Header on chapter 1
# ------------------------------------------------------------
if num == 1:
book_url = book_meta.get("book_url") or "UNKNOWN"
header = (
f"{book_meta.get('title','')}\n"
f"Author: {book_meta.get('author','')}\n"
f"Description:\n{book_meta.get('description','')}\n"
f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
)
text = header + text
log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars")
    # ------------------------------------------------------------
    # OUTPUT PAYLOAD
    # ------------------------------------------------------------
    payload["parsed"] = text
    payload["skipped"] = False
    inc_parsed_done(book_idx)
    return payload
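# Illustrative invocation (hypothetical payload shape, matching the fields
# this task reads; "..." values are placeholders):
#
#   parse_chapter.delay({
#       "book_idx": 7,
#       "chapter": {"num": 1, "title": "第一章 ..."},
#       "book_meta": {"title": "...", "author": "...",
#                     "description": "...", "book_url": "https://..."},
#       "html": "<html>...</html>",
#   })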