You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
2.8 KiB
96 lines
2.8 KiB
# ============================================================
|
|
# File: scraper/services/init_service.py
|
|
# Purpose:
|
|
# Orchestrate INIT-flow:
|
|
# - resolve site
|
|
# - fetch minimal metadata
|
|
# - derive book_idx
|
|
# - register in SQLite
|
|
# - store main cover
|
|
# ============================================================
|
|
|
|
import re
|
|
from scraper.services.site_resolver import SiteResolver
|
|
from scraper.services.scrape_engine import ScrapeEngine
|
|
from scraper.services.cover_service import CoverService
|
|
|
|
from db.repository import register_book
|
|
|
|
from scraper.logger_decorators import logcall
|
|
|
|
|
|
class InitService:
    """Orchestrator for the INIT flow.

    Steps: resolve the site handler for a URL, fetch minimal metadata,
    derive the canonical book_idx, register the book in SQLite, and
    download the main cover image for the UI.
    """

    # ------------------------------------------------------------
    # BOOK IDX DERIVATION
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def derive_book_id(url: str) -> str:
        """
        Derive the unified book_idx from a book URL.

        PTWXZ-style URLs end in /{id}.html; that numeric id is used
        directly. Any other format falls back to a sanitized copy of
        the URL so the result stays deterministic.

        Returns:
            book_idx (string)
        """
        match = re.search(r"/(\d+)\.html$", url)
        if match is not None:
            return match.group(1)

        # Fallback — same URL always yields the same ID for unknown formats.
        return re.sub(r"[/:]", "_", url)

    # ------------------------------------------------------------
    # MAIN INIT FLOW
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def execute(url: str) -> dict:
        """
        INIT entry point: run the full registration flow for one URL.

        Returns:
            dict with the fetched metadata plus registration status.
        """
        # 1) Resolve site handler
        handler = SiteResolver.resolve(url)

        # 2) Create unified book_idx
        book_idx = InitService.derive_book_id(url)

        # Some site objects historically expect .book_id — we support it but DO NOT rely on it.
        handler.book_id = book_idx

        # 3) Fetch initial metadata (title/author/description/cover)
        metadata = ScrapeEngine.fetch_metadata_only(handler, url)

        title = metadata.get("title") or "Unknown"
        author = metadata.get("author")
        description = metadata.get("description")
        cover_url = metadata.get("cover_url")

        # 4) Download & store main cover for UI
        cover_path = CoverService.download_main_cover(cover_url, book_idx)

        # 5) Register in SQLite (book_idx is the SOLE primary ID)
        register_book(
            book_idx=book_idx,
            title=title,
            author=author,
            description=description,
            cover_url=cover_url,
            cover_path=cover_path,
            book_url=url,
        )

        # 6) Return metadata for UI / API
        result = {
            "book_idx": book_idx,
            "title": title,
            "author": author,
            "description": description,
            "cover_url": cover_url,
            "cover_path": cover_path,
            "status": "registered",
        }
        return result
|