You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/engine/parser.py

66 lines
1.8 KiB

# ============================================================
# File: scraper/engine/parser.py
# Purpose:
# High-level scraping API coordinating metadata extraction
# and chapter extraction using pluggable SiteScraper classes.
#
# This is the new central engine:
# - extract_metadata_only() used by INIT flow
# - extract_metadata_full() used by full scraping pipeline
# ============================================================
from scraper.engine.fetcher import fetch_html
def extract_metadata_only(url: str, site_scraper):
    """
    Fetch the book page at *url* and extract lightweight metadata only.

    Used by the INIT flow. The chapter list is NOT fetched here, so
    ``chapters_total`` is always 0 in the returned dict.

    Returns a dict with keys: title, author, description, cover_url,
    chapters_total (always 0), and book_url (the *url* passed in).
    """
    page = fetch_html(url, site_scraper.encoding)
    # Delegate each field to the pluggable SiteScraper implementation.
    return {
        "title": site_scraper.parse_title(page),
        "author": site_scraper.parse_author(page),
        "description": site_scraper.parse_description(page),
        "cover_url": site_scraper.parse_cover(page, url),
        "chapters_total": 0,
        "book_url": url,
    }
def extract_metadata_full(url: str, site_scraper):
    """
    Full scrape: metadata plus the chapter list.

    Used by the scraping Celery pipeline.

    Returns the metadata dict (title, author, description, cover_url,
    chapters_total, book_url) with an added ``chapters`` key holding the
    parsed chapter list.

    Fix: the original implementation fetched *url* twice — once here and
    once again inside extract_metadata_only() — doubling the network cost
    per book. The book page is now fetched exactly once and the metadata
    is built from that single soup, preserving the same parse-call order.
    """
    soup = fetch_html(url, site_scraper.encoding)

    # Metadata, extracted from the single fetch above (same fields and
    # call order as extract_metadata_only()).
    meta = {
        "title": site_scraper.parse_title(soup),
        "author": site_scraper.parse_author(soup),
        "description": site_scraper.parse_description(soup),
        "cover_url": site_scraper.parse_cover(soup, url),
        "chapters_total": 0,
        "book_url": url,
    }

    # The chapter list lives on a separate page; locate, fetch and parse it.
    chapter_page_url = site_scraper.extract_chapter_page_url(soup)
    chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
    meta["chapters"] = site_scraper.parse_chapter_list(chapter_page_soup)
    return meta
def build_book_id(title: str) -> str:
    """
    Canonical book_id generator.

    SCRAPE currently uses the raw title as the identifier; this helper
    exists so that behavior can be changed in one place later.
    """
    book_id = title
    return book_id