You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.0 KiB
75 lines
2.0 KiB
# ============================================================
# File: scraper/services/init_service.py
# Purpose:
#   Orchestrate INIT-flow:
#   - resolve site
#   - fetch minimal metadata
#   - derive book_id
#   - register in SQLite
#   - store main cover
# ============================================================
|
|
|
|
import re
|
|
from scraper.services.site_resolver import SiteResolver
|
|
from scraper.services.scrape_engine import ScrapeEngine
|
|
from scraper.services.cover_service import CoverService
|
|
|
|
from db.repository import register_book
|
|
|
|
|
|
class InitService:
    """Orchestrate the INIT flow for a single book URL.

    The flow resolves the site, fetches metadata only, derives a book id,
    registers the book through ``register_book`` and asks ``CoverService``
    to download the main cover.
    """

    # Original, strict form: the URL ends with "/{digits}.html".
    _ID_AT_END = re.compile(r"/(\d+)\.html$")
    # Tolerant form: "/{digits}.html" is the last path segment and is
    # followed by a query string ("?...") or a fragment ("#...").
    _ID_BEFORE_QUERY = re.compile(r"[^?#]*/(\d+)\.html(?=[?#])")

    @staticmethod
    def derive_book_id(url: str) -> str:
        """Return the book id encoded in *url*.

        PTWXZ URLs end with ``/{id}.html``; the numeric ``id`` is returned.
        Surrounding whitespace is ignored, and a trailing query string or
        fragment after ``/{id}.html`` no longer prevents the match.

        If no numeric id can be found, the fallback is the (stripped) URL
        with every ``/`` replaced by ``_``. No other characters are
        altered by the fallback.
        """
        cleaned = url.strip()

        # Try the strict pattern first so every URL that matched before
        # this change keeps exactly the same id.
        found = InitService._ID_AT_END.search(cleaned)
        if found is None:
            found = InitService._ID_BEFORE_QUERY.match(cleaned)
        if found is not None:
            return found.group(1)

        return cleaned.replace("/", "_")

    @staticmethod
    def execute(url: str) -> dict:
        """Run the INIT flow for *url* and return the registration result.

        Returns a dict with the keys ``book_id``, ``title``, ``author``,
        ``description``, ``cover_url`` and ``status`` (always
        ``"registered"``). ``title`` falls back to ``"Unknown"`` when the
        metadata has no truthy title; the other metadata fields are passed
        through as returned by ``meta.get`` and may be ``None``.
        """
        # 1) Determine which BookSite applies
        site = SiteResolver.resolve(url)

        # 2) Metadata only (no chapters)
        meta = ScrapeEngine.fetch_metadata_only(site, url)

        title = meta.get("title") or "Unknown"
        author = meta.get("author")
        description = meta.get("description")
        cover_url = meta.get("cover_url")

        # 3) Determine book_id
        book_id = InitService.derive_book_id(url)

        # 4) SQLite registration
        register_book(
            book_id=book_id,
            title=title,
            author=author,
            description=description,
            cover_url=cover_url,
        )

        # 5) Download UI cover
        # NOTE(review): cover_url is None when the metadata lacks it;
        # presumably CoverService tolerates that - confirm.
        CoverService.download_main_cover(cover_url, book_id)

        # 6) Structured output for UI
        return {
            "book_id": book_id,
            "title": title,
            "author": author,
            "description": description,
            "cover_url": cover_url,
            "status": "registered",
        }
|