@@ -11,9 +11,85 @@ from bs4 import BeautifulSoup

from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg  # unified logger
from bs4 import NavigableString, Comment

print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")


def extract_piaotia_content(soup):
    """
    Extract clean chapter content from Piaotia pages.

    Starts after the table that follows <h1> and ends before the
    nav/ads/footer/copyright blocks.
    """
    h1 = soup.find("h1")
    if not h1:
        return None

    # -------- Find the first table after <h1> --------
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break

    if not table:
        return None

    parts = []

    # -------- Iterate over the siblings after the table --------
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = None
        if hasattr(sib, "get_text"):
            text = sib.get_text(strip=True)

        # === STOP CONDITIONS ===

        # Ad-pagination comments like <!-- 翻页上AD开始 -->
        # ("page-turn ad start") mark the end of the chapter body.
        # Comments are NavigableStrings, so drop the rest here or
        # they would be accumulated as text below.
        if isinstance(sib, Comment):
            if "翻页" in sib:
                break
            continue

        # Explicit footer blocks
        if name == "div":
            sid = sib.get("id", "")
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break

        # Copyright block ("重要声明" = "important notice"), the
        # strongest stop signal
        if text and ("重要声明" in text or "Copyright" in text):
            break

        # Navigation: "推荐阅读" ("recommended reading") or
        # "目录" ("table of contents")
        if text and text.startswith(("推荐阅读", "目录", "目 录")):
            break

        # Skip scripts and styles
        if name in ("script", "style"):
            continue

        # Skip JS/ad containers like <center><script>...</script></center>
        if name == "center":
            continue

        # === ACCUMULATE TEXT ===
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()

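# A minimal usage sketch for extract_piaotia_content(); the sample HTML
# is a hypothetical, stripped-down Piaotia page, not a real capture:
#
#     soup = BeautifulSoup(
#         "<h1>第一章</h1><table></table>"
#         "正文第一段<br/>正文第二段"
#         "<div id='feit2'>footer</div>",
#         "html.parser",
#     )
#     extract_piaotia_content(soup)  # -> "正文第一段\n正文第二段"
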
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict):
    """
@@ -63,32 +139,38 @@ def parse_chapter(self, download_result: dict):
                node = tmp
                break

    # ------------------------------------------------------------
    # PIAOTIA FALLBACK:
    # The strict selector failed, so pull the chapter content that
    # sits between <h1> and the footer/navigation blocks.
    # ------------------------------------------------------------
    raw = None
    if node is None:
        raw = extract_piaotia_content(soup)

    # Secondary fallback: if the dedicated extractor found nothing,
    # walk the siblings of <h1> and stop at the "bottomlink"
    # navigation block.
    if node is None and not raw:
        h1 = soup.find("h1")
        if h1:
            content_parts = []
            for sib in h1.next_siblings:
                # Tag.get("class") returns a list of class names;
                # NavigableStrings have no .get, so the fallback
                # lambda yields None for them.
                sib_class = getattr(sib, "get", lambda *_: None)("class")
                if sib_class and "bottomlink" in sib_class:
                    break

                if getattr(sib, "name", None) in ("script", "style", "center"):
                    continue

                if hasattr(sib, "get_text"):
                    content_parts.append(sib.get_text(separator="\n"))
                else:
                    content_parts.append(str(sib))

            raw = "\n".join(content_parts)

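    # Precedence sketch: strict selector first, then the dedicated
    # extract_piaotia_content() pass, then the raw <h1>-sibling walk;
    # each later stage only runs when the earlier ones produced nothing.
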
    # ------------------------------------------------------------
    # FINAL FALLBACK