# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
# Enhanced Piaotia extractor + selector fallback + clean pipeline.
# Compatible with payload pipeline v3 + book_idx refactor.
# ============================================================

from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment

from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done

print(">>> [IMPORT] parse_tasks.py loaded (book_idx + payload v3)")


# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR
# ============================================================
def extract_piaotia_content(soup):
    """Extract the chapter body from a Piaotia-style page.

    Returns the joined body text, or None when the page does not
    match the expected <h1> + layout-table shape.
    """
    h1 = soup.find("h1")
    if not h1:
        return None

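    # Assumed Piaotia page shape (a sketch, not verified against every
    # site template): <h1>chapter title</h1>, then a layout <table>,
    # then the chapter body as loose text and <br> siblings, terminated
    # by pager comments ("翻页"), footer divs, or copyright boilerplate.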
    # Find the first table after the <h1>.
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break
    if not table:
        return None

    parts = []
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = None
        if hasattr(sib, "get_text"):
            text = sib.get_text(strip=True)

        # Stop conditions: pager comment, known footer divs, boilerplate.
        if isinstance(sib, Comment) and ("翻页" in sib):
            break
        if name == "div":
            sid = sib.get("id", "")
            cls = sib.get("class", [])  # currently unused
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break
        if text and ("重要声明" in text or "Copyright" in text):
            break
        if text and text.startswith(("推荐阅读", "目录", "目 录")):
            break

        # Skip scripts, styles, and centered navigation blocks.
        if name in ("script", "style"):
            continue
        if name == "center":
            continue

        # Accumulate body text.
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()


# ============================================================
# PARSE TASK — PAYLOAD PIPELINE v3 (book_idx)
# ============================================================
@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):
    """Parse one downloaded chapter payload into clean text."""
    if not payload:
        return {"skipped": True, "reason": "empty_payload"}

    # Payload pipeline v3 fields (book_idx refactor).
    book_idx = payload["book_idx"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")

    # ------------------------------------------------------------
    # DOWNLOAD SKIPPED → PARSE SKIP
    # ------------------------------------------------------------
    if payload.get("skipped"):
        log_msg(book_idx, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload

    if not html:
        log_msg(book_idx, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload

    log_msg(book_idx, f"[PARSE] Parsing chapter {num}")
    soup = BeautifulSoup(html, "lxml")

    # ------------------------------------------------------------
    # STRICT SELECTORS
    # ------------------------------------------------------------
    selectors = [
        "#content", "div#content",
        ".content", "div.content",
        "#chaptercontent", "div#chaptercontent",
        "#chapterContent",
        ".read-content", "div.read-content",
    ]

    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break

    # Strict selectors failed → fall back to the Piaotia extractor.
    if node is None:
        raw = extract_piaotia_content(soup)
    else:
        raw = node.get_text(separator="\n")

    # FINAL FALLBACK: strip scripts/styles and take the whole page text.
    # `not raw` also covers an empty extraction result, not just None.
    if not raw:
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        raw = soup.get_text(separator="\n")

    # ------------------------------------------------------------
    # MULTIPASS CLEANING VIA replacement-block files
    # ------------------------------------------------------------
    REPL = load_all_replacements()
    text = raw
    for _ in range(5):
        text = clean_text(text, REPL)

    # ------------------------------------------------------------
    # Collapse runs of blank lines into a single blank line
    # ------------------------------------------------------------
    cleaned = []
    prev_blank = False
    for line in text.split("\n"):
        s = line.rstrip()
        if s == "":
            if prev_blank:
                continue
            prev_blank = True
            cleaned.append("")
        else:
            prev_blank = False
            cleaned.append(s)
    text = "\n".join(cleaned)

    text = f"{title}\n{text}"

    # ------------------------------------------------------------
    # Header on chapter 1
    # ------------------------------------------------------------
    if num == 1:
        book_url = book_meta.get("book_url") or "UNKNOWN"
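        # Prepended only to chapter 1, presumably so the assembled book
        # file opens with title/author/description metadata.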
f"{book_meta.get('title','')}\n" f"Author: {book_meta.get('author','')}\n" f"Description:\n{book_meta.get('description','')}\n" f"Book URL: {book_url}\n" + "-" * 50 + "\n\n" ) text = header + text log_msg(book_idx, f"[PARSE] Parsed chapter {num}: {len(text)} chars") # ------------------------------------------------------------ # OUTPUT PAYLOAD # ------------------------------------------------------------ payload["parsed"] = text payload["skipped"] = False inc_parsed_done(book_idx) return payload