You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/services/init_service.py

96 lines
2.8 KiB

# ============================================================
# File: scraper/services/init_service.py
# Purpose:
# Orchestrate INIT-flow:
# - resolve site
# - fetch minimal metadata
# - derive book_idx
# - register in SQLite
# - store main cover
# ============================================================
import re
from scraper.services.site_resolver import SiteResolver
from scraper.services.scrape_engine import ScrapeEngine
from scraper.services.cover_service import CoverService
from db.repository import register_book
from scraper.logger_decorators import logcall
class InitService:
    """Orchestrates the INIT flow for a book URL.

    Steps: resolve the site handler, derive a stable ``book_idx``,
    fetch minimal metadata, persist the main cover, and register the
    book in SQLite. ``book_idx`` is the sole primary identifier.
    """

    # ------------------------------------------------------------
    # BOOK IDX DERIVATION
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def derive_book_id(url: str) -> str:
        """Derive a deterministic book_idx from *url*.

        PTWXZ-style URLs end in ``/{id}.html``; the numeric id is used
        directly. Any other format falls back to a sanitized copy of
        the full URL so the result is still deterministic.

        Returns:
            book_idx (string)
        """
        matched = re.search(r"/(\d+)\.html$", url)
        if matched is None:
            # Fallback — deterministic ID for unrecognized URL formats.
            return url.replace("/", "_").replace(":", "_")
        return matched.group(1)

    # ------------------------------------------------------------
    # MAIN INIT FLOW
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def execute(url: str) -> dict:
        """INIT entry point.

        Resolves the site, scrapes minimal metadata, stores the cover,
        registers the book, and returns the combined metadata plus a
        registration status for the UI / API.
        """
        # 1) Resolve site handler for this URL.
        handler = SiteResolver.resolve(url)

        # 2) Create the unified book_idx.
        book_idx = InitService.derive_book_id(url)
        # Legacy compatibility: some site objects historically expect
        # .book_id — we set it but DO NOT rely on it.
        handler.book_id = book_idx

        # 3) Fetch initial metadata (title/author/description/cover).
        metadata = ScrapeEngine.fetch_metadata_only(handler, url)
        title = metadata.get("title") or "Unknown"
        author = metadata.get("author")
        description = metadata.get("description")
        cover_url = metadata.get("cover_url")

        # 4) Download & store the main cover for the UI.
        cover_path = CoverService.download_main_cover(cover_url, book_idx)

        # 5) Register in SQLite (book_idx is the SOLE primary ID).
        register_book(
            book_idx=book_idx,
            title=title,
            author=author,
            description=description,
            cover_url=cover_url,
            cover_path=cover_path,
            book_url=url,
        )

        # 6) Return metadata for UI / API consumption.
        return {
            "book_idx": book_idx,
            "title": title,
            "author": author,
            "description": description,
            "cover_url": cover_url,
            "cover_path": cover_path,
            "status": "registered",
        }