# scraper/book_scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
from scraper.models.book_state import Chapter


class BookScraper:
    """
    Minimal scraper: only metadata + chapter list.
    The DownloadController handles Celery pipelines for:
      - download
      - parse
      - save
    """
    def __init__(self, site, url):
        self.site = site
        self.url = url
        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""
        self.chapter_base = None
        self.chapters = []
        # Load custom replacements
        extra = load_replacements("replacements.txt")
        self.site.replacements.update(extra)
    # ------------------------------------------------------------
    def execute(self):
        """Main entry point. Returns metadata + chapter URLs."""
        soup = self._fetch(self.url)
        self._parse_title(soup)
        self._parse_author(soup)
        self._parse_description(soup)
        self._parse_cover(soup)
        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)
        log_debug("[BookScraper] Completed metadata parse")
        return {
            "title": self.book_title,
            "author": self.book_author,
            "description": self.book_description,
            "cover_url": self.cover_url,  # ← used by DownloadController
            "book_url": self.url,
            "chapters": [
                {"num": ch.number, "title": ch.title, "url": ch.url}
                for ch in self.chapters
            ],
        }
    # ------------------------------------------------------------
    def _fetch(self, url):
        log_debug(f"[BookScraper] Fetch: {url}")
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.encoding = self.site.encoding
        return BeautifulSoup(resp.text, "lxml")
    # ------------------------------------------------------------
    def _parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"[BookScraper] Title = {self.book_title}")
    def _parse_author(self, soup):
        # "作者" (author label) and the full-width colon "：" are assumed
        # markers; the original marker strings were lost in the source listing.
        td = soup.find("td", string=lambda t: t and "作者" in t)
        raw = td.get_text(strip=True) if td else ""
        self.book_author = raw.split("：")[1] if "：" in raw else "UnknownAuthor"
        log_debug(f"[BookScraper] Author = {self.book_author}")
    def _parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            self.book_description = ""
            log_debug("[BookScraper] Description not found")
            return
        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        self.book_description = clean_text("\n".join(parts), self.site.replacements)
        log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
    # ------------------------------------------------------------
    def _parse_cover(self, soup):
        """
        Extract the correct cover based on book_id path logic.
        1. primary:  match "/files/article/image/{vol}/{book_id}/"
        2. fallback: endswith "/{book_id}s.jpg"
        """
        # Extract book_id from URL
        m = re.search(r"/(\d+)\.html$", self.url)
        if not m:
            log_debug("[BookScraper] No book_id found in URL → cannot match cover")
            return
        book_id = m.group(1)
        # Extract vol folder from URL (bookinfo/<vol>/<id>.html)
        m2 = re.search(r"/bookinfo/(\d+)/", self.url)
        volume = m2.group(1) if m2 else None
        log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")
        imgs = soup.find_all("img", src=True)
        chosen = None
        # --------------------------------------------------------
        # PRIORITY 1: Path-match
        #   /files/article/image/{vol}/{book_id}/
        # --------------------------------------------------------
        if volume:
            target_path = f"/files/article/image/{volume}/{book_id}/"
            for img in imgs:
                src = img["src"]
                if target_path in src:
                    chosen = src
                    log_debug(f"[BookScraper] Cover matched by PATH: {src}")
                    break
        # --------------------------------------------------------
        # PRIORITY 2: endswith "/{book_id}s.jpg"
        # --------------------------------------------------------
        if not chosen:
            target_suffix = f"/{book_id}s.jpg"
            for img in imgs:
                src = img["src"]
                if src.endswith(target_suffix):
                    chosen = src
                    log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
                    break
        # --------------------------------------------------------
        # No match
        # --------------------------------------------------------
        if not chosen:
            log_debug("[BookScraper] No matching cover found")
            return
        self.cover_url = urljoin(self.site.root, chosen)
        log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        """Return BeautifulSoup of the main chapter list page."""
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        chapter_url = urljoin(self.site.root, href)
        # base for chapter links
        parts = chapter_url.rsplit("/", 1)
        self.chapter_base = parts[0] + "/"
        return self._fetch(chapter_url)
    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        cont = soup.select_one(self.site.chapter_list_selector)
        items = cont.select("ul li a[href]")
        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1
        log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")