# scraper/tasks/parse_tasks.py
from celery import shared_task
from logbus.publisher import log
from scraper.utils import clean_text
from bs4 import BeautifulSoup


@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, html: str, chapter_url: str) -> dict:
    """Parse downloaded chapter HTML into clean text.

    Tries a list of known content-container CSS selectors; if none
    matches, falls back to the full document text rather than failing.

    Args:
        html: Raw chapter HTML as downloaded.
        chapter_url: URL the HTML came from (echoed back in the result
            and used for log context only).

    Returns:
        A dict: {"url": chapter_url, "text": "...parsed text..."}

    Raises:
        Exception: any parsing error is logged and re-raised so Celery
            records the task as failed (and can retry if configured).
    """
    try:
        log(f"[PARSE] Start parsing: {chapter_url}")

        soup = BeautifulSoup(html, "html.parser")

        # Many Chinese novel sites wrap the chapter body in one of these
        # containers. NOTE: the former "div#content" / "div.content"
        # entries were removed — they were unreachable: if "#content"
        # (resp. ".content") found no element at all, no <div> variant
        # could match either.
        possible_blocks = [
            "#content",
            ".content",
            "div#chaptercontent",
            "#chapterContent",
        ]

        # First selector that matches wins; None if none do.
        node = next(
            (m for sel in possible_blocks if (m := soup.select_one(sel))),
            None,
        )

        if node is None:
            # Best effort: use the whole document text instead of failing.
            log(
                f"[PARSE] WARNING: no known content block found in {chapter_url}")
            text = clean_text(soup.get_text())
        else:
            text = clean_text(node.get_text())

        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")

        return {
            "url": chapter_url,
            "text": text,
        }
    except Exception as exc:
        # Log with URL context, then re-raise so Celery marks the failure.
        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
        raise