# ============================================================
# File: scraper/engine/parser.py
# Purpose:
#     High-level scraping API coordinating metadata extraction
#     and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
#   - extract_metadata_only()  used by the INIT flow
#   - extract_metadata_full()  used by the full scraping pipeline
# ============================================================

from scraper.engine.fetcher import fetch_html


def extract_metadata_only(url: str, site_scraper):
    """
    Extract ONLY lightweight metadata:
      - title
      - author
      - description
      - cover_url
      - chapters_total = 0
    """
    soup = fetch_html(url, site_scraper.encoding)

    title = site_scraper.parse_title(soup)
    author = site_scraper.parse_author(soup)
    description = site_scraper.parse_description(soup)
    cover_url = site_scraper.parse_cover(soup, url)

    return {
        "title": title,
        "author": author,
        "description": description,
        "cover_url": cover_url,
        "chapters_total": 0,
        "book_url": url,
    }


def extract_metadata_full(url: str, site_scraper):
    """
    Full scraping (metadata + chapter list).
    Used by the scraping Celery pipeline.
    """
    soup = fetch_html(url, site_scraper.encoding)

    # Metadata. Note: extract_metadata_only() performs its own fetch_html()
    # call; the `soup` above is only used to locate the chapter page.
    meta = extract_metadata_only(url, site_scraper)

    # Chapter list: resolve the chapter index page, fetch it, and parse it.
    chapter_page_url = site_scraper.extract_chapter_page_url(soup)
    chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
    chapters = site_scraper.parse_chapter_list(chapter_page_soup)

    meta["chapters"] = chapters
    return meta


def build_book_id(title: str) -> str:
    """
    Canonical book_id generator.
    SCRAPE currently uses the title as the ID → preserve that behavior.
    """
    return title
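

# ============================================================
# Usage sketch (illustrative only, not part of the engine).
# It assumes fetch_html() returns a BeautifulSoup document, as the
# parse_* calls above suggest, and shows the duck-typed SiteScraper
# interface this module expects. The class name, CSS selectors,
# URL, and chapter dict shape below are hypothetical placeholders,
# not a real site adapter.
# ============================================================
#
#   from urllib.parse import urljoin
#
#   class ExampleSiteScraper:
#       encoding = "utf-8"
#
#       def parse_title(self, soup):
#           return soup.select_one("h1.book-title").get_text(strip=True)
#
#       def parse_author(self, soup):
#           return soup.select_one(".author").get_text(strip=True)
#
#       def parse_description(self, soup):
#           return soup.select_one(".intro").get_text(strip=True)
#
#       def parse_cover(self, soup, base_url):
#           img = soup.select_one(".cover img")
#           return urljoin(base_url, img["src"]) if img else ""
#
#       def extract_chapter_page_url(self, soup):
#           return soup.select_one("a.chapter-index")["href"]
#
#       def parse_chapter_list(self, soup):
#           return [
#               {"title": a.get_text(strip=True), "url": a["href"]}
#               for a in soup.select(".chapter-list a")
#           ]
#
#   # INIT flow: lightweight metadata only.
#   meta = extract_metadata_only("https://example.com/book/1", ExampleSiteScraper())
#
#   # Full pipeline: metadata plus chapter list.
#   full = extract_metadata_full("https://example.com/book/1", ExampleSiteScraper())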