You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.8 KiB
66 lines
1.8 KiB
# ============================================================
|
|
# File: scraper/engine/parser.py
|
|
# Purpose:
|
|
# High-level scraping API coordinating metadata extraction
|
|
# and chapter extraction using pluggable SiteScraper classes.
|
|
#
|
|
# This is the new central engine:
|
|
# - extract_metadata_only() used by INIT flow
|
|
# - extract_metadata_full() used by full scraping pipeline
|
|
# ============================================================
|
|
|
|
from scraper.engine.fetcher import fetch_html
|
|
|
|
|
|
def extract_metadata_only(url: str, site_scraper):
    """
    Fetch the book page and collect only its lightweight metadata.

    The page at ``url`` is fetched once, using ``site_scraper.encoding``,
    and passed to the scraper's ``parse_*`` methods. No chapter list is
    retrieved here, so ``chapters_total`` is always reported as 0.

    Returns a dict with the keys:
      - title
      - author
      - description
      - cover_url
      - chapters_total (always 0)
      - book_url (the ``url`` that was passed in)
    """
    page = fetch_html(url, site_scraper.encoding)

    # Dict values are evaluated top to bottom, so the parsers still run in
    # the order: title, author, description, cover.
    return {
        "title": site_scraper.parse_title(page),
        "author": site_scraper.parse_author(page),
        "description": site_scraper.parse_description(page),
        "cover_url": site_scraper.parse_cover(page, url),
        "chapters_total": 0,
        "book_url": url,
    }
|
|
|
|
|
|
def _metadata_from_soup(soup, url: str, site_scraper) -> dict:
    """Build the lightweight metadata dict from an already-fetched book page."""
    return {
        "title": site_scraper.parse_title(soup),
        "author": site_scraper.parse_author(soup),
        "description": site_scraper.parse_description(soup),
        "cover_url": site_scraper.parse_cover(soup, url),
        "chapters_total": 0,
        "book_url": url,
    }


def extract_metadata_full(url: str, site_scraper):
    """
    Full scraping (metadata + chapter list).

    Used by the scraping Celery pipeline.

    The book page at ``url`` is fetched exactly once; both the metadata and
    the chapter-page URL are derived from that single document. The chapter
    page is then fetched separately and parsed into the chapter list.

    Returns the same dict as extract_metadata_only() (title, author,
    description, cover_url, chapters_total, book_url) with one extra key,
    ``chapters``, holding whatever ``site_scraper.parse_chapter_list``
    returns.

    NOTE(review): ``chapters_total`` is left at 0 here, as before — confirm
    whether callers expect it to reflect the parsed chapter list.
    """
    soup = fetch_html(url, site_scraper.encoding)

    # Resolve the chapter page before any metadata parsing so this lookup
    # sees the document untouched, in case a parser modifies the tree.
    chapter_page_url = site_scraper.extract_chapter_page_url(soup)

    # Reuse the page fetched above instead of calling
    # extract_metadata_only(), which would download the same URL again.
    meta = _metadata_from_soup(soup, url, site_scraper)

    # chapter list
    chapter_page_soup = fetch_html(chapter_page_url, site_scraper.encoding)
    meta["chapters"] = site_scraper.parse_chapter_list(chapter_page_soup)
    return meta
|
|
|
|
|
|
def build_book_id(title: str) -> str:
    """
    Return the canonical book_id for a book.

    SCRAPE currently identifies books by their title, so the title is
    passed through unchanged to preserve that behavior.
    """
    book_id = title
    return book_id
|