You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
2.8 KiB
96 lines
2.8 KiB
# ============================================================
|
|
# File: scraper/services/init_service.py
|
|
# Purpose:
|
|
# Orchestrate INIT-flow:
|
|
# - resolve site
|
|
# - fetch minimal metadata
|
|
# - derive book_idx
|
|
# - register in SQLite
|
|
# - store main cover
|
|
# ============================================================
|
|
|
|
import re
|
|
from scraper.services.site_resolver import SiteResolver
|
|
from scraper.services.scrape_engine import ScrapeEngine
|
|
from scraper.services.cover_service import CoverService
|
|
|
|
from db.repository import register_book
|
|
|
|
from scraper.logger_decorators import logcall
|
|
|
|
|
|
class InitService:
    """Orchestrator for the INIT flow.

    Steps: resolve the site handler for a URL, fetch minimal metadata,
    derive the canonical book_idx, register the book in SQLite, and
    download the main cover image for the UI.
    """

    # ------------------------------------------------------------
    # BOOK IDX DERIVATION
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def derive_book_id(url: str) -> str:
        """
        Derive the unified book_idx from a book URL.

        PTWXZ-style URLs end in /{id}.html; that numeric id is used
        directly. Any other format falls back to a sanitized copy of
        the URL so the result stays deterministic.

        Returns:
            book_idx (string)
        """
        match = re.search(r"/(\d+)\.html$", url)
        if match is not None:
            return match.group(1)

        # Fallback — same URL always yields the same ID for unknown formats.
        return re.sub(r"[/:]", "_", url)

    # ------------------------------------------------------------
    # MAIN INIT FLOW
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def execute(url: str) -> dict:
        """
        INIT entry point: run the full registration flow for one URL.

        Returns:
            dict with the fetched metadata plus registration status.
        """
        # 1) Resolve site handler
        handler = SiteResolver.resolve(url)

        # 2) Create unified book_idx
        book_idx = InitService.derive_book_id(url)

        # Some site objects historically expect .book_id — we support it but DO NOT rely on it.
        handler.book_id = book_idx

        # 3) Fetch initial metadata (title/author/description/cover)
        metadata = ScrapeEngine.fetch_metadata_only(handler, url)

        title = metadata.get("title") or "Unknown"
        author = metadata.get("author")
        description = metadata.get("description")
        cover_url = metadata.get("cover_url")

        # 4) Download & store main cover for UI
        cover_path = CoverService.download_main_cover(cover_url, book_idx)

        # 5) Register in SQLite (book_idx is the SOLE primary ID)
        register_book(
            book_idx=book_idx,
            title=title,
            author=author,
            description=description,
            cover_url=cover_url,
            cover_path=cover_path,
            book_url=url,
        )

        # 6) Return metadata for UI / API
        result = {
            "book_idx": book_idx,
            "title": title,
            "author": author,
            "description": description,
            "cover_url": cover_url,
            "cover_path": cover_path,
            "status": "registered",
        }
        return result
|