# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#
# Abort Behavior:
#   - parse MUST ALWAYS RUN once download has started
#   - even if the user triggers abort afterwards
#   - (abort only prevents new chapters from starting)
#
# Logging:
#   - Same unified log_msg(book_id, message) as download_tasks
#   - publisher.log → console
#   - ui_log.push_ui → GUI
# =========================================================
from celery_app import celery_app
from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_replacements
from scraper.tasks.download_tasks import log_msg  # unified logger

print(">>> [IMPORT] parse_tasks.py loaded")

# Candidate containers for the chapter body, tried in order; first match
# wins.  NOTE: "#content"/".content" already match any tag with that
# id/class, so the former "div#content"/"div.content" entries were dead
# (strict subsets tried later) and have been dropped.
_CONTENT_SELECTORS = (
    "#content",
    ".content",
    "div#chaptercontent",
    "#chapterContent",
    ".read-content",
)


def _book_header(meta: dict) -> str:
    """Build the metadata header prepended to chapter 1's text.

    Falls back key-by-key to empty strings / "UNKNOWN" so a sparse
    `meta` dict never raises.
    """
    book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
    return (
        f"{meta.get('title','')}\n"
        f"Author: {meta.get('author','')}\n"
        f"Description:\n{meta.get('description','')}\n"
        f"Book URL: {book_url}\n"
        + "-" * 50
        + "\n\n"
    )


@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict) -> dict:
    """Parse raw HTML returned by download_chapter into clean chapter text.

    Args:
        download_result: Payload produced by download_chapter.  Expected
            keys: "book_id", plus either a truthy "skipped" flag (the
            whole payload is then passed through untouched) or
            "chapter" / "url" / "html" for a normal parse.
        meta: Book-level metadata (title, author, description,
            book_url/url); consulted only to build the chapter-1 header.

    Returns:
        The unmodified ``download_result`` when the download was skipped,
        otherwise a dict with "book_id", "chapter", "url", "text" and
        "length" (character count of the cleaned text).
    """
    book_id = download_result.get("book_id", "NOBOOK")

    # ------------------------------------------------------------
    # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
    #    Pass the payload through so the chain's result shape stays
    #    consistent for downstream tasks.
    # ------------------------------------------------------------
    if download_result.get("skipped"):
        chapter = download_result.get("chapter")
        log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
        return download_result

    # ------------------------------------------------------------
    # 2) Normal Parsing
    # ------------------------------------------------------------
    chapter_num = download_result["chapter"]
    chapter_url = download_result["url"]
    html = download_result["html"]

    log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")

    soup = BeautifulSoup(html, "lxml")

    # First known content container wins; `select_one` returns None on a
    # miss, so after an exhausted loop `node` is None and we fall back to
    # the whole document's text.
    node = None
    for selector in _CONTENT_SELECTORS:
        node = soup.select_one(selector)
        if node is not None:
            break

    raw = node.get_text() if node is not None else soup.get_text()

    # ------------------------------------------------------------
    # Apply global replacements (site-wide text cleanup rules).
    # Loaded per task so rule edits take effect without a restart.
    # ------------------------------------------------------------
    replacements = load_replacements()
    text = clean_text(raw, replacements)

    # ------------------------------------------------------------
    # Chapter 1 gets full header
    # ------------------------------------------------------------
    if chapter_num == 1:
        text = _book_header(meta) + text

    log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")

    return {
        "book_id": book_id,
        "chapter": chapter_num,
        "url": chapter_url,
        "text": text,
        "length": len(text),
    }