You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.4 KiB
58 lines
1.4 KiB
# scraper/tasks/parse_tasks.py
|
|
|
|
from celery import shared_task
|
|
from logbus.publisher import log
|
|
from scraper.utils import clean_text
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, html: str, chapter_url: str):
    """
    Parse downloaded chapter HTML into clean text.

    Args:
        html: Raw HTML of a downloaded chapter page.
        chapter_url: URL the HTML came from; used for logging and echoed
            back in the result so downstream tasks can correlate it.

    Returns a dict:

        {
            "url": chapter_url,
            "text": "...parsed text..."
        }

    Raises:
        Exception: any parsing error is logged and re-raised so Celery
            records the task as failed.
    """
    try:
        log(f"[PARSE] Start parsing: {chapter_url}")

        soup = BeautifulSoup(html, "html.parser")

        # Many Chinese novel sites wrap the chapter body in one of these
        # containers. The original list also had "div#content" and
        # "div.content", but those were unreachable: any element matching
        # them also matches the earlier "#content"/".content" selectors,
        # which are tried first.
        possible_blocks = [
            "#content",
            ".content",
            "div#chaptercontent",
            "#chapterContent",
        ]

        # First selector that matches wins.
        node = next(
            (hit for sel in possible_blocks if (hit := soup.select_one(sel))),
            None,
        )

        if not node:
            # Best effort: fall back to the whole page's text rather than
            # failing the task outright.
            log(
                f"[PARSE] WARNING: no known content block found in {chapter_url}")
            text = clean_text(soup.get_text())
        else:
            text = clean_text(node.get_text())

        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")

        return {
            "url": chapter_url,
            "text": text,
        }

    except Exception as exc:
        # Log with context, then re-raise so Celery marks the task failed
        # (and can retry if the caller configured retries).
        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
        raise