# ============================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#          Enhanced Piaotia extractor + selector fallback + clean pipeline.
#          Compatible with payload pipeline v3.
# ============================================================

from celery_app import celery_app
from bs4 import BeautifulSoup, NavigableString, Comment

from scraper.tasks.download_tasks import log_msg
from scraper.utils.utils import clean_text, load_all_replacements
from scraper.logger_decorators import logcall
from db.repository import inc_parsed_done

print(">>> [IMPORT] parse_tasks.py loaded (RESTORED + payload v3)")
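# ------------------------------------------------------------
# Illustrative sketch only (not part of the original module): the assumed
# shape of the "payload v3" dict that flows from the download task into
# parse_chapter(), inferred from the field accesses below. Field names not
# read in this file are assumptions, not a guaranteed contract.
# ------------------------------------------------------------
from typing import Optional, TypedDict


class ChapterPayloadV3(TypedDict, total=False):
    book_id: int                 # used for logging and progress counters
    chapter: dict                # {"num": int, "title": str}
    book_meta: dict              # {"title", "author", "description", "book_url"}
    html: Optional[str]          # raw downloaded HTML; missing/None => skip parse
    skipped: bool                # set when the download was skipped or failed
    parsed: Optional[str]        # cleaned chapter text, written by this task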
# ============================================================
# PIAOTIA ADVANCED CONTENT EXTRACTOR (unchanged from original)
# ============================================================

def extract_piaotia_content(soup):
    h1 = soup.find("h1")
    if not h1:
        return None

    # Find the first table after the <h1> title
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break
    if not table:
        return None

    parts = []
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)

        text = None
        if hasattr(sib, "get_text"):
            text = sib.get_text(strip=True)

        # STOP CONDITIONS
        if isinstance(sib, Comment) and ("翻页" in sib):
            break

        # explicit footer blocks
        if name == "div":
            sid = sib.get("id", "")
            cls = sib.get("class", [])
            if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break

        # copyright block
        if text and ("重要声明" in text or "Copyright" in text):
            break

        # navigation blocks
        if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
            break

        if name in ("script", "style"):
            continue
        if name == "center":
            continue

        # ACCUMULATE
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()


# ============================================================
# PARSE TASK — PAYLOAD PIPELINE (CORRECT v3 FORMAT)
# ============================================================

@celery_app.task(bind=True, queue="parse", ignore_result=False)
@logcall
def parse_chapter(self, payload: dict):
    if not payload:
        return {"skipped": True, "reason": "empty_payload"}

    book_id = payload["book_id"]
    chapter = payload["chapter"]
    book_meta = payload.get("book_meta") or {}

    num = chapter.get("num")
    title = chapter.get("title") or f"Chapter {num}"
    html = payload.get("html")

    # SKIPPED DOWNLOAD → SKIP PARSE
    if payload.get("skipped"):
        log_msg(book_id, f"[PARSE] SKIP chapter {num} (download skipped)")
        return payload

    if not html:
        log_msg(book_id, f"[PARSE] Missing HTML for chapter {num} → SKIP")
        payload["parsed"] = None
        payload["skipped"] = True
        return payload

    log_msg(book_id, f"[PARSE] Parsing chapter {num}")
    soup = BeautifulSoup(html, "lxml")

    # ============================================================
    # STRICT SELECTORS
    # ============================================================
    selectors = [
        "#content", "div#content",
        ".content", "div.content",
        "#chaptercontent", "div#chaptercontent",
        "#chapterContent",
        ".read-content", "div.read-content",
    ]

    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break

    raw = None

    # --- STRICT SELECTOR FAILED → Piaotia extractor ---
    if node is None:
        raw = extract_piaotia_content(soup)
    else:
        raw = node.get_text(separator="\n")

    # FINAL FALLBACK
    if raw is None:
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        raw = soup.get_text(separator="\n")

    # ============================================================
    # MULTIPASS CLEANING via replacement files
    # ============================================================
    REPL = load_all_replacements()
    text = raw
    for _ in range(5):
        text = clean_text(text, REPL)

    # ============================================================
    # Collapse double blank lines
    # ============================================================
    cleaned = []
    prev_blank = False
    for line in text.split("\n"):
        stripped = line.rstrip()
        if stripped == "":
            if prev_blank:
                continue
            prev_blank = True
            cleaned.append("")
        else:
            prev_blank = False
            cleaned.append(stripped)

    text = "\n".join(cleaned)
    text = f"{title}\n{text}"

    # ============================================================
    # Add header to chapter 1
    # ============================================================
    if num == 1:
        book_url = book_meta.get("book_url") or "UNKNOWN"
        header = (
            f"{book_meta.get('title', '')}\n"
            f"Author: {book_meta.get('author', '')}\n"
            f"Description:\n{book_meta.get('description', '')}\n"
            f"Book URL: {book_url}\n"
            + "-" * 50 + "\n\n"
        )
        text = header + text

    log_msg(book_id, f"[PARSE] Parsed chapter {num}: {len(text)} chars")

    # ============================================================
    # PAYLOAD OUTPUT (v3)
    # ============================================================
    payload["parsed"] = text
    payload["skipped"] = False
    inc_parsed_done(book_id)
    return payload
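

# ------------------------------------------------------------
# Illustrative sketch only (assumption, not part of the original module):
# a minimal self-check of extract_piaotia_content() on a synthetic,
# Piaotia-style page (an <h1> title, a navigation <table>, body text,
# then a footer <div id="thumb"> that should stop extraction).
# Celery workers importing this module are unaffected; the block runs only
# when the file is executed directly inside the project environment.
# ------------------------------------------------------------
if __name__ == "__main__":
    _sample_html = """
    <html><body>
      <h1>第一章 测试章节</h1>
      <table><tr><td>navigation</td></tr></table>
      <br/>正文第一段。
      <br/>正文第二段。
      <div id="thumb">footer / recommendations</div>
    </body></html>
    """
    _soup = BeautifulSoup(_sample_html, "lxml")
    # Expected output: the two body paragraphs, one per line, footer excluded.
    print(extract_piaotia_content(_soup))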