# kmftools/bookscraper/scraper/engine/parser.py

# ============================================================
# File: scraper/engine/parser.py
# Purpose:
#   High-level scraping API coordinating metadata extraction
#   and chapter extraction using pluggable SiteScraper classes.
#
#   This is the new central engine:
#     - extract_metadata_only() used by INIT flow
#     - extract_metadata_full() used by full scraping pipeline
# ============================================================

from scraper.engine.fetcher import fetch_html


def extract_metadata_only(url: str, site_scraper):
    """
    Fetch the book page once and return its lightweight metadata.

    The page at ``url`` is fetched using ``site_scraper.encoding`` and passed
    to the scraper's ``parse_*`` hooks. No chapter list is fetched here.

    Returned dict keys:
      - title, author, description, cover_url: values from the site scraper
      - chapters_total: always 0 in this function
      - book_url: the ``url`` argument, unchanged
    """
    page = fetch_html(url, site_scraper.encoding)

    # Dict values are evaluated top to bottom, so the parse hooks run in the
    # order they are listed below.
    return {
        "title": site_scraper.parse_title(page),
        "author": site_scraper.parse_author(page),
        "description": site_scraper.parse_description(page),
        "cover_url": site_scraper.parse_cover(page, url),
        "chapters_total": 0,
        "book_url": url,
    }


def extract_metadata_full(url: str, site_scraper):
    """
    Full scraping (metadata + chapter list).
    Used by the scraping Celery pipeline.

    The book page is fetched once and reused both for metadata parsing and
    for locating the chapter-list page; a second request then fetches the
    chapter-list page itself.

    Returns a dict with the same keys as extract_metadata_only() plus:
      - chapters: the result of site_scraper.parse_chapter_list()
    """
    soup = fetch_html(url, site_scraper.encoding)

    # metadata: parsed from the soup fetched above rather than by calling
    # extract_metadata_only(url, ...), which would download the same page a
    # second time. Keep these keys in sync with extract_metadata_only().
    meta = {
        "title": site_scraper.parse_title(soup),
        "author": site_scraper.parse_author(soup),
        "description": site_scraper.parse_description(soup),
        "cover_url": site_scraper.parse_cover(soup, url),
        # NOTE(review): chapters_total stays 0 even though the chapter list is
        # known below; presumably callers derive the count themselves. Confirm.
        "chapters_total": 0,
        "book_url": url,
    }

    # chapter list
    chapter_page_url = site_scraper.extract_chapter_page_url(soup)
    chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
    meta["chapters"] = site_scraper.parse_chapter_list(chapter_page_soup)

    return meta


def build_book_id(title: str) -> str:
    """
    Return the canonical book_id for a book.

    The SCRAPE flow currently uses the title itself as the ID, so the title
    is passed through unchanged to preserve that behavior.
    """
    book_id = title
    return book_id