# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================

from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment

from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done


print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")


# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
    h1 = soup.find("h1")
    if not h1:
        return None

    # Find first table after <h1>
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break

    if not table:
        return None

    parts = []

    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = None

        if hasattr(sib, "get_text"):
            text = sib.get_text(strip=True)

        # Stop conditions
        if isinstance(sib, Comment) and ("翻页" in sib):
            break

        if name == "div":
            sid = sib.get("id", "")
            cls = sib.get("class", [])
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break

        if text and ("重要声明" in text or "Copyright" in text):
            break

        if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
            break

        if name in ("script", "style"):
            continue
        if name == "center":
            continue

        # Accumulate
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()
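

# ------------------------------------------------------------
# Payload shape (illustrative sketch): the keys below are inferred
# from the field accesses in parse_chapter; the sample values are
# hypothetical and the download stage may attach additional keys.
#
#   payload = {
#       "book_idx": 0,                                  # book index used for logging / counters
#       "chapter": {"num": 1, "title": "Chapter 1"},    # chapter metadata
#       "book_meta": {"title": "...", "author": "...",
#                     "description": "...", "book_url": "..."},
#       "html": "<html>...</html>",                     # raw HTML from the download task
#       "skipped": False,                               # set upstream when the download was skipped
#   }
# ------------------------------------------------------------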


# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):

    if not payload:
        return {"skipped": True, "reason": "empty_payload"}

    # NEW MODEL
    book_idx = payload["book_idx"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")

    # ------------------------------------------------------------
    # DOWNLOAD SKIPPED → PARSE SKIP
    # ------------------------------------------------------------
    if payload.get("skipped"):
        log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload

    if not html:
        log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload

    log_msg(book_idx, f"[PARSE] Parsing chapter {num}")

    soup = BeautifulSoup(html, "lxml")

    # ------------------------------------------------------------
    # STRICT SELECTORS
    # ------------------------------------------------------------
    selectors = [
        "#content",
        "div#content",
        ".content",
        "div.content",
        "#chaptercontent",
        "div#chaptercontent",
        "#chapterContent",
        ".read-content",
        "div.read-content",
    ]

    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break

    raw = None

    # strict selectors failed → piaotia extractor
    if node is None:
        raw = extract_piaotia_content(soup)
    else:
        raw = node.get_text(separator="\n")

    # FINAL FALLBACK
    if raw is None:
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        raw = soup.get_text(separator="\n")

    # ------------------------------------------------------------
    # MULTIPASS CLEANING VIA replacement-block files
    # ------------------------------------------------------------
    REPL = load_all_replacements()

    text = raw
    for _ in range(5):
        text = clean_text(text, REPL)

    # ------------------------------------------------------------
    # Collapse double blank lines
    # ------------------------------------------------------------
    cleaned = []
    prev_blank = False

    for line in text.split("\n"):
        s = line.rstrip()
        if s == "":
            if prev_blank:
                continue
            prev_blank = True
            cleaned.append("")
        else:
            prev_blank = False
            cleaned.append(s)

    text = "\n".join(cleaned)
    text = f"{title}\n{text}"

    # ------------------------------------------------------------
    # Header on chapter 1
    # ------------------------------------------------------------
    if num == 1:
        book_url = book_meta.get("book_url") or "UNKNOWN"
        header = (
            f"{book_meta.get('title','')}\n"
            f"Author: {book_meta.get('author','')}\n"
            f"Description:\n{book_meta.get('description','')}\n"
            f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
        )
        text = header + text

    log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars")

    # ------------------------------------------------------------
    # OUTPUT PAYLOAD
    # ------------------------------------------------------------
    payload["parsed"] = text
    payload["skipped"] = False

    inc_parsed_done(book_idx)

    return payload
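

# ------------------------------------------------------------
# Local smoke test: a minimal sketch, not part of the pipeline.
# The sample payload is hypothetical; Task.apply() runs the task
# eagerly in-process (no broker or worker needed), but log_msg()
# and inc_parsed_done() still assume their usual backends are
# reachable in this environment.
# ------------------------------------------------------------
if __name__ == "__main__":
    sample = {
        "book_idx": 0,
        "chapter": {"num": 2, "title": "Sample Chapter"},
        "book_meta": {},
        "html": "<html><body><div id='content'>第一行<br/>第二行</div></body></html>",
    }
    result = parse_chapter.apply(args=[sample]).get()
    print(result["parsed"][:200])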