# kmftools/bookscraper/scraper/tasks/parse_tasks.py

# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3.
# ============================================================

from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment

from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done


# Runs at import time: writes a marker line to stdout whenever this module
# is loaded, so the import can be seen in the process output.
print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)")


# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original)
# ============================================================
def extract_piaotia_content(soup):
    """Extract chapter text from a page whose body is not wrapped in a container.

    The chapter body is taken to be every sibling that follows the first
    ``<table>`` found after the document's first ``<h1>``, up to (but not
    including) the first footer / navigation marker.

    Args:
        soup: Parsed document; only ``soup.find("h1")`` is used directly.

    Returns:
        The collected fragments joined with newlines, or ``None`` when the
        page has no ``<h1>``, no ``<table>`` sibling after it, or when no
        non-blank text was collected. Returning ``None`` (rather than an
        empty string) for an empty result lets a caller that tests
        ``is None`` move on to its next extraction strategy.
    """
    h1 = soup.find("h1")
    if h1 is None:
        return None

    # First <table> among the siblings that follow the heading.
    table = next(
        (sib for sib in h1.next_siblings if getattr(sib, "name", None) == "table"),
        None,
    )
    if table is None:
        return None

    footer_div_ids = ("thumb", "tags", "tips", "Commenddiv", "feit2")
    nav_prefixes = ("推荐阅读", "目录", "目 录")
    skipped_tags = ("script", "style", "center")

    parts = []
    for sib in table.next_siblings:
        # Comment is a NavigableString subclass, so it has to be handled
        # before the plain-text branch below; otherwise the comment body
        # would be appended to the chapter as if it were content.
        if isinstance(sib, Comment):
            if "翻页" in sib:  # pagination marker: end of chapter body
                break
            continue

        name = getattr(sib, "name", None)

        # Explicit footer blocks.
        if name == "div" and sib.get("id", "") in footer_div_ids:
            break

        # Text-based stop markers (copyright notice, navigation links).
        text = sib.get_text(strip=True) if hasattr(sib, "get_text") else None
        if text:
            if "重要声明" in text or "Copyright" in text:
                break
            if text.startswith(nav_prefixes):
                break

        # Non-content elements are skipped only after the stop checks, so a
        # stop marker inside one of them still ends the scan.
        if name in skipped_tags:
            continue

        if isinstance(sib, NavigableString):
            chunk = sib.strip()
        elif hasattr(sib, "get_text"):
            chunk = sib.get_text(separator="\n").strip()
        else:
            chunk = ""

        if chunk:
            parts.append(chunk)

    content = "\n".join(parts).strip()
    return content or None


# ============================================================
# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):
    """Turn a downloaded chapter's HTML into cleaned text on the payload.

    Args:
        payload: Pipeline dict. ``book_id`` and ``chapter`` are required;
            ``book_meta``, ``html`` and ``skipped`` are optional.

    Returns:
        ``{"skipped": True, "reason": "empty_payload"}`` for a falsy payload.
        Otherwise the same ``payload`` object:

        * returned untouched when ``payload["skipped"]`` is already truthy;
        * with ``parsed=None`` and ``skipped=True`` when there is no HTML;
        * with ``parsed=<text>`` and ``skipped=False`` after a successful
          parse, in which case ``inc_parsed_done(book_id)`` is also called.

    Raises:
        KeyError: If a non-empty payload lacks ``book_id`` or ``chapter``.
    """
    if not payload:
        return {"skipped": True, "reason": "empty_payload"}

    book_id = payload["book_id"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")

    # SKIPPED DOWNLOAD → SKIP PARSE
    if payload.get("skipped"):
        log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload

    if not html:
        log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload

    log_msg(book_id, f"[PARSE] Parsing chapter {num}")

    soup = BeautifulSoup(html, "lxml")
    raw = _extract_raw_text(soup)

    # Multipass cleaning via replacement files. Up to five passes, stopping
    # early once a pass leaves the text unchanged (further passes would then
    # repeat the same no-op, assuming clean_text is deterministic).
    replacements = load_all_replacements()
    text = raw
    for _ in range(5):
        cleaned = clean_text(text, replacements)
        if cleaned == text:
            break
        text = cleaned

    text = f"{title}\n{_collapse_blank_lines(text)}"

    # Only the first chapter carries the book-level header.
    if num == 1:
        text = _build_book_header(book_meta) + text

    log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars")

    # PAYLOAD OUTPUT (v3)
    payload["parsed"] = text
    payload["skipped"] = False

    inc_parsed_done(book_id)

    return payload


# Content containers tried in priority order. A bare "#id" / ".class"
# selector already matches a <div> carrying that id/class, so separate
# "div#id" / "div.class" entries could never match after the bare one missed.
_CONTENT_SELECTORS = (
    "#content",
    ".content",
    "#chaptercontent",
    "#chapterContent",
    ".read-content",
)


def _extract_raw_text(soup):
    """Return the chapter's raw text using the first strategy that applies.

    Order: known content containers, then ``extract_piaotia_content``, then
    the text of the whole document with script/style/noscript removed.
    """
    node = next(
        (
            hit
            for hit in map(soup.select_one, _CONTENT_SELECTORS)
            if hit is not None
        ),
        None,
    )
    if node is not None:
        return node.get_text(separator="\n")

    raw = extract_piaotia_content(soup)
    # A blank extraction counts as a failure too; otherwise the whole-page
    # fallback below would be skipped and the chapter would contain only
    # its title.
    if raw and raw.strip():
        return raw

    # FINAL FALLBACK: whole-page text. This mutates ``soup`` in place.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(separator="\n")


def _collapse_blank_lines(text):
    """Right-strip every line and squeeze runs of blank lines down to one."""
    lines = []
    for line in text.split("\n"):
        line = line.rstrip()
        if not line and lines and lines[-1] == "":
            continue
        lines.append(line)
    return "\n".join(lines)


def _build_book_header(book_meta):
    """Build the book-level header block that is prepended to chapter 1.

    Missing or ``None`` fields render as empty strings (``UNKNOWN`` for the
    URL) instead of the literal text ``None``.
    """
    book_url = book_meta.get("book_url") or "UNKNOWN"
    return (
        f"{book_meta.get('title') or ''}\n"
        f"Author: {book_meta.get('author') or ''}\n"
        f"Description:\n{book_meta.get('description') or ''}\n"
        f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
    )