kmftools/bookscraper/scraper/book_scraper.py

import requests
import os
import time
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from scraper.logger import setup_logger, LOG_BUFFER
from scraper.utils import clean_text, load_replacements
load_dotenv()
logger = setup_logger()


class Chapter:
    def __init__(self, number, title, url):
        self.number = number
        self.title = title
        self.url = url
        self.text = ""


class BookScraper:
    def __init__(self, site, url):
        self.site = site
        self.url = url
        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""
        self.chapters = []
        self.chapter_base = None
        self.base_path = None
        # ENV settings
        self.DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
        self.TEST_CHAPTER_LIMIT = int(os.getenv("TEST_CHAPTER_LIMIT", "10"))
        self.MAX_VOL_SIZE = int(os.getenv("MAX_VOL_SIZE", "1500"))
        self.MAX_DL_PER_SEC = int(os.getenv("MAX_DL_PER_SEC", "2"))
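        # Environment knobs read above:
        #   DRY_RUN            - "1" restricts the run to TEST_CHAPTER_LIMIT chapters
        #   TEST_CHAPTER_LIMIT - chapters fetched during a dry run (default 10)
        #   MAX_VOL_SIZE       - chapters written per volume folder (default 1500)
        #   MAX_DL_PER_SEC     - request rate cap used by get_document (default 2)
        # OUTPUT_DIR (default "./output") is read later in prepare_output_folder().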
        # Load text replacements
        self.replacements = load_replacements("replacements.txt")

    # -----------------------------------------------------
    def execute(self):
        # Clear the shared in-memory log buffer so the debug output returned
        # below covers only this run.
        LOG_BUFFER.seek(0)
        LOG_BUFFER.truncate(0)
        logger.debug("Starting scraper for %s", self.url)
        soup = self.get_document(self.url)
        self.parse_title(soup)
        self.parse_author(soup)
        self.parse_description(soup)
        self.parse_cover(soup)
        self.prepare_output_folder()
        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)
        if self.DRY_RUN:
            logger.debug(
                "DRY RUN → downloading only first %s chapters", self.TEST_CHAPTER_LIMIT)
            self.get_some_chapters(self.TEST_CHAPTER_LIMIT)
        else:
            self.get_all_chapters()
            self.split_into_volumes()
        return {
            "title": self.book_title,
            "debug": LOG_BUFFER.getvalue()
        }

    # -----------------------------------------------------
    # NETWORK
    # -----------------------------------------------------
    def get_document(self, url):
        logger.debug("GET %s", url)
        # Simple rate limit: pause 1 / MAX_DL_PER_SEC seconds before each request
        # (0.5 s with the default of 2 downloads per second).
        time.sleep(1 / max(1, self.MAX_DL_PER_SEC))
        resp = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        # Decode with the encoding configured for this site before parsing.
        resp.encoding = self.site.encoding
        logger.debug("HTTP %s for %s", resp.status_code, url)
        return BeautifulSoup(resp.text, "lxml")

    # -----------------------------------------------------
    # BASIC PARSERS (piaotia structure)
    # -----------------------------------------------------
    def parse_title(self, soup):
        h1 = soup.find("h1")
        if h1:
            self.book_title = h1.get_text(strip=True)
        else:
            self.book_title = "UnknownTitle"
        logger.debug("Book title: %s", self.book_title)

    def parse_author(self, soup):
        # Assumes piaotia's "作    者：XXX" metadata cell; the label characters and
        # the full-width colon were dropped in the published copy and are
        # reconstructed here from that layout.
        td = soup.find("td", string=lambda t: t and "作" in t and "者" in t)
        if td:
            raw = td.get_text(strip=True)
            if "：" in raw:
                self.book_author = raw.split("：", 1)[1].strip()
            else:
                self.book_author = "UnknownAuthor"
        else:
            self.book_author = "UnknownAuthor"
        logger.debug("Book author: %s", self.book_author)

    def parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            self.book_description = ""
            return
        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            txt = sib.strip() if isinstance(sib, str) else sib.get_text(strip=True)
            if txt:
                parts.append(txt)
        self.book_description = "\n".join(parts)
        logger.debug("Description parsed (%s chars)",
                     len(self.book_description))

    def parse_cover(self, soup):
        selector = (
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table "
            "> tr:nth-of-type(4) > td:nth-of-type(1) > table > tr:nth-of-type(1) "
            "> td:nth-of-type(2) > a:nth-of-type(1) > img"
        )
        img = soup.select_one(selector)
        if img:
            self.cover_url = urljoin(self.site.root, img.get("src"))
        else:
            logger.debug("Cover not found!")
        logger.debug("Cover URL = %s", self.cover_url)

    # -----------------------------------------------------
    def prepare_output_folder(self):
        output_root = os.getenv("OUTPUT_DIR", "./output")
        self.base_path = Path(output_root) / self.book_title / self.site.name
        self.base_path.mkdir(parents=True, exist_ok=True)
        logger.debug("Output directory: %s", self.base_path)
        if self.cover_url:
            self.save_image(self.cover_url, self.base_path / "cover.jpg")

    def save_image(self, url, path):
        logger.debug("Downloading cover: %s", url)
        resp = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if resp.status_code == 200:
            img = Image.open(BytesIO(resp.content))
            img.save(path)
            logger.debug("Cover saved to %s", path)

    # -----------------------------------------------------
    # CHAPTER PAGE
    # -----------------------------------------------------
    def get_chapter_page(self, soup):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table")
        link = node.select_one("a")
        href = link.get("href")
        chapter_url = urljoin(self.site.root, href)
        # Keep the directory of the chapter index so relative chapter hrefs
        # can be resolved against it later.
        parsed = urlparse(chapter_url)
        base = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{base}"
        logger.debug("Chapter index URL = %s", chapter_url)
        logger.debug("CHAPTER_BASE = %s", self.chapter_base)
        return self.get_document(chapter_url)

    def parse_chapter_links(self, soup):
        container = soup.select_one("div.centent")
        links = container.select("ul li a[href]")
        for i, a in enumerate(links, 1):
            href = a.get("href")
            # Only *.html links are treated as chapter pages.
            if not href.endswith(".html"):
                continue
            abs_url = urljoin(self.chapter_base, href)
            title = a.get_text(strip=True)
            self.chapters.append(Chapter(i, title, abs_url))
        logger.debug("Total chapters: %s", len(self.chapters))

    # -----------------------------------------------------
    # DOWNLOAD CHAPTERS
    # -----------------------------------------------------
    def get_all_chapters(self):
        for ch in self.chapters:
            ch.text = self.fetch_chapter(ch)
            logger.debug("CH %s length = %s", ch.number, len(ch.text))

    def get_some_chapters(self, limit):
        for ch in self.chapters[:limit]:
            ch.text = self.fetch_chapter(ch)
            filename = self.base_path / f"{ch.number:05d}_{ch.title}.txt"
            filename.write_text(ch.text, encoding="utf-8")
            logger.debug("Saved test chapter: %s", filename)

    def fetch_chapter(self, ch):
        soup = self.get_document(ch.url)
        text = self.parse_chapter_text(soup)
        return clean_text(text, self.replacements)

    def parse_chapter_text(self, soup):
        body = soup.body
        h1 = body.find("h1")
        parts = []
        collecting = False
        for sib in h1.next_siblings:
            # Chapter text sits between the first <br> after the <h1> and the
            # "bottomlink" navigation block.
            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
                break
            if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
                continue
            if getattr(sib, "name", None) in ["script", "style"]:
                continue
            if not collecting:
                # Skip everything until the first <br> is seen.
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue
            txt = sib.strip() if isinstance(sib, str) else sib.get_text("\n", strip=True)
            if txt:
                parts.append(txt)
        return "\n".join(parts).strip()

    # -----------------------------------------------------
    # SPLIT VOLUMES
    # -----------------------------------------------------
    def split_into_volumes(self):
        logger.debug(
            "Splitting into volumes (max %s chapters per volume)", self.MAX_VOL_SIZE)
        chapters = len(self.chapters)
        volume = 1
        index = 0
        while index < chapters:
            chunk = self.chapters[index:index + self.MAX_VOL_SIZE]
            volume_dir = self.base_path / f"v{volume}"
            volume_dir.mkdir(exist_ok=True)
            for ch in chunk:
                filename = volume_dir / f"{ch.number:05d}_{ch.title}.txt"
                filename.write_text(ch.text, encoding="utf-8")
            logger.debug("Volume %s saved (%s chapters)", volume, len(chunk))
            volume += 1
            index += self.MAX_VOL_SIZE
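

# Hypothetical usage sketch (not part of the original module): BookScraper only
# relies on site.name, site.root and site.encoding in the code above, so a bare
# namespace with those attributes is enough to drive it. The URL and encoding
# values below are placeholders, not taken from the real project configuration.
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_site = SimpleNamespace(
        name="piaotia",                    # used for the output sub-folder
        root="https://www.example.com/",   # placeholder site root
        encoding="utf-8",                  # whatever the target site actually serves
    )
    result = BookScraper(demo_site, "https://www.example.com/book/1/").execute()
    print(result["title"])
    print(result["debug"])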