# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#
# Abort Behavior:
#   - parse MUST ALWAYS RUN once download has started
#   - even if the user triggers abort afterwards
#   - (abort only prevents new chapters from starting)
#
# Parsing avoids skipping except when download_result indicates skip.
# =========================================================

from celery_app import celery_app
from logbus.publisher import log
from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_replacements

# Import-time trace: this statement sits at module top level, so it writes
# one line to stdout when the module body is executed on import.
print(">>> [IMPORT] parse_tasks.py loaded")


# CSS selectors tried in priority order to locate the chapter body.
# "#content" and ".content" already match any element carrying that id/class,
# so separate "div#content" / "div.content" entries placed after them could
# never be reached and are not listed.
_CONTENT_SELECTORS = (
    "#content",
    ".content",
    "div#chaptercontent",
    "#chapterContent",
    ".read-content",
)


def _find_content_node(soup):
    """Return the first element matching ``_CONTENT_SELECTORS``, or ``None``."""
    for selector in _CONTENT_SELECTORS:
        candidate = soup.select_one(selector)
        if candidate is not None:
            return candidate
    return None


def _build_book_header(meta):
    """Build the book-information header that is prepended to chapter 1.

    Missing keys and keys whose value is ``None`` are rendered as empty
    strings, so the literal text "None" never ends up in the output. A
    ``meta`` of ``None`` is treated as an empty mapping.
    """
    meta = meta or {}

    def field(key):
        value = meta.get(key)
        return "" if value is None else value

    book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"

    return (
        f"{field('title')}\n"
        f"Author: {field('author')}\n"
        f"Description:\n{field('description')}\n"
        f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
    )


@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
    """
    Parse raw HTML returned by download_chapter into clean chapter text.

    Args:
        download_result: Mapping produced by the download step. When its
            "skipped" entry is truthy it is returned unchanged; otherwise
            the keys "chapter", "url" and "html" are read from it.
        meta: Book metadata. Only consulted for chapter 1, where "title",
            "author", "description" and "book_url" (falling back to "url")
            are used to build a header. May be ``None``.

    Returns:
        Either ``download_result`` itself (skip case) or a dict with the
        keys "chapter", "url", "text" and "length" (``len(text)``).

    Raises:
        KeyError: If a non-skipped ``download_result`` lacks "chapter",
            "url" or "html".
    """

    # ------------------------------------------------------------
    # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
    # (This is the ONLY valid skip in parse)
    # ------------------------------------------------------------
    if download_result.get("skipped"):
        chapter = download_result.get("chapter")
        log(f"[PARSE] SKIP chapter {chapter} (download skipped)")
        return download_result

    # ------------------------------------------------------------
    # 2) Normal Parsing
    # ------------------------------------------------------------
    chapter_num = download_result["chapter"]
    chapter_url = download_result["url"]
    html = download_result["html"]

    log(f"[PARSE] Parsing chapter {chapter_num}")

    soup = BeautifulSoup(html, "lxml")

    # Prefer a known content container; fall back to the whole document
    # when none of the selectors matches.
    node = _find_content_node(soup)
    # NOTE(review): get_text() is called without a separator, so text from
    # adjacent elements is joined directly — confirm clean_text copes.
    raw = node.get_text() if node is not None else soup.get_text()

    # ------------------------------------------------------------
    # Apply global replacements (from text_replacements file)
    # ------------------------------------------------------------
    replacements = load_replacements()
    text = clean_text(raw, replacements)

    # ------------------------------------------------------------
    # Chapter 1 is prefixed with a header carrying the book metadata
    # ------------------------------------------------------------
    # NOTE(review): this compares against the int 1; a chapter number
    # delivered as the string "1" would not get the header — verify the
    # type produced by the download step.
    if chapter_num == 1:
        text = _build_book_header(meta) + text

    log(f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")

    return {
        "chapter": chapter_num,
        "url": chapter_url,
        "text": text,
        "length": len(text),
    }