# scraper/tasks/parse_tasks.py
from bs4 import BeautifulSoup

from celery_app import celery_app
from logbus.publisher import log
from scraper.utils import clean_text, load_replacements

print(">>> [IMPORT] parse_tasks.py loaded")


@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
    """
    Extract and clean the chapter text from downloaded HTML.

    download_result: {"chapter": int, "url": str, "html": str}
    meta:            {"title": str, "author": str, "description": str}
    """
    chapter_num = download_result["chapter"]
    url = download_result["url"]
    html = download_result["html"]

    log(f"[PARSE] Parsing chapter {chapter_num}")

    soup = BeautifulSoup(html, "lxml")

    # Common content-container selectors, tried in priority order.
    selectors = [
        "#content",
        ".content",
        "div#content",
        "div.content",
        "div#chaptercontent",
        "#chapterContent",
        ".read-content",
    ]

    # Use the first selector that matches; fall back to the whole page.
    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break

    raw = node.get_text() if node else soup.get_text()

    # Apply the configured text replacements and normalize the raw text.
    repl = load_replacements()
    text = clean_text(raw, repl)

    # ---------------------------------------------------
    # Prepend the book header to chapter 1 only.
    # ---------------------------------------------------
    if chapter_num == 1:
        header = (
            f"{meta.get('title', '')}\n"
            f"Author: {meta.get('author', '')}\n"
            f"Description:\n{meta.get('description', '')}\n"
            f"URL: {url}\n" + "-" * 50 + "\n\n"
        )
        text = header + text

    log(f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")

    return {
        "chapter": chapter_num,
        "url": url,
        "text": text,
        "length": len(text),
    }
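

# --- Usage sketch (assumptions, not part of this module) --------------------
# A minimal illustration of how parse_chapter might be chained after a
# download task. The module path `scraper.tasks.download_tasks` and the task
# name `download_chapter` are assumptions for illustration only; substitute
# whatever task actually produces the {"chapter", "url", "html"} dict. Given
# the queue="parse" routing above, a worker must also consume that queue,
# e.g. `celery -A celery_app worker -Q parse`.
#
#   from celery import chain
#
#   from scraper.tasks.download_tasks import download_chapter  # hypothetical
#   from scraper.tasks.parse_tasks import parse_chapter
#
#   meta = {"title": "...", "author": "...", "description": "..."}
#   chain(
#       download_chapter.s(chapter=1, url="https://example.com/book/1"),
#       # The chain passes download_chapter's return value as the first
#       # positional argument (download_result); meta is bound here.
#       parse_chapter.s(meta=meta),
#   ).apply_async()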