# kmftools/bookscraper/scraper/book_scraper.py
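"""Scrape an online book into per-volume text files.

BookScraper walks a book's index page, collects chapter links, and writes
each chapter as a numbered .txt file under output/<title>/<site>/v<N>/,
throttling requests and backing off on HTTP 429/403/500 responses.
"""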
import requests
import os
import time
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO
from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements


class Chapter:
    def __init__(self, num, title, url):
        self.number = num
        self.title = title
        self.url = url
        self.text = ""
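

# Behaviour is tuned through environment variables (all optional):
#   DRY_RUN                → "1" (default) downloads only TEST_LIMIT chapters
#   TEST_LIMIT             → number of chapters fetched in a dry run (default 10)
#   MAX_DOWNLOADS_PER_SEC  → request rate cap enforced by throttle() (default 1)
#   MAX_VOL_SIZE           → chapters per volume folder (default 200)
#   CHAPTER_DELAY          → extra sleep between chapters in seconds (default 2)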
class BookScraper:
    def __init__(self, site, url):
        self.site = site
        self.url = url
        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""
        self.chapters = []
        self.base_path = None
        self.chapter_base = None
        # ENV
        self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
        self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
        self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
        self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
        self._last_download_time = 0
        # replacements.txt
        fp = os.path.join(os.getcwd(), "replacements.txt")
        extra = load_replacements(fp)
        self.site.replacements.update(extra)
        self.start_time = None
        self.total_chapters = 0
        self.volume_dirs = {}

    # ------------------------------------------------------------
    # RATE LIMITER
    # ------------------------------------------------------------
    def throttle(self):
        now = time.time()
        elapsed = now - self._last_download_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self._last_download_time = time.time()

    # ------------------------------------------------------------
    def execute(self):
        log_debug(f"Starting scraper for {self.url}")
        self.start_time = time.time()
        soup = self.get_doc_with_retry(self.url)
        self.parse_title(soup)
        self.parse_author(soup)
        self.parse_description(soup)
        self.parse_cover(soup)
        self.prepare_output_folder()
        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)
        self.prepare_volume_folders()
        if self.DRY_RUN:
            self.download_some(self.TEST_LIMIT)
        else:
            self.download_all()
        return {"title": self.book_title}

    # ------------------------------------------------------------
    # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        attempt = 1
        while True:
            self.throttle()
            log_debug(f"GET {url} (attempt {attempt})")
            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"Network error {e} → retry in {attempt + 1}s")
                time.sleep(attempt + 1)
                attempt += 1
                continue
            code = resp.status_code
            log_debug(f"HTTP {code} for {url}")
            # 429 → hard cooldown with countdown
            if code == 429:
                cooldown = 60
                log_debug(f"429 detected — cooldown {cooldown}s")
                for i in range(cooldown, 0, -1):
                    log_debug(f"429 cooldown… {i}s remaining")
                    time.sleep(1)
                attempt += 1
                continue
            # recoverable
            if code in (403, 500):
                wait = min(5 * attempt, 30)
                log_debug(f"HTTP {code} → retry in {wait}s")
                time.sleep(wait)
                attempt += 1
                continue
            if code == 200:
                resp.encoding = self.site.encoding
                return BeautifulSoup(resp.text, "lxml")
            # unexpected
            wait = attempt + 1
            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
            time.sleep(wait)
            attempt += 1
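
    # Retry policy summary (see get_doc_with_retry above):
    #   network error      → sleep attempt+1 seconds, retry
    #   HTTP 429           → fixed 60 s cooldown with a per-second countdown
    #   HTTP 403 / 500     → sleep min(5 * attempt, 30) seconds
    #   any other non-200  → sleep attempt+1 seconds
    # The loop only returns once a 200 response has been parsed with lxml.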

    # ------------------------------------------------------------
    def parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"Book title = {self.book_title}")

    def parse_author(self, soup):
        # The marker characters were lost in this copy of the file; a table
        # cell such as "作者：<name>" is assumed here when locating and
        # splitting the author field.
        td = soup.find("td", string=lambda t: t and "作者" in t)
        self.book_author = (
            td.get_text(strip=True).split("：", 1)[1]
            if td and "：" in td.get_text()
            else "UnknownAuthor"
        )
        log_debug(f"Book author = {self.book_author}")

    def parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            log_debug("No description found")
            self.book_description = ""
            return
        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        self.book_description = "\n".join(parts)
        log_debug(f"Description length = {len(self.book_description)}")

    # ------------------------------------------------------------
    def parse_cover(self, soup):
        cover = soup.find(
            "img", src=lambda v: v and "files/article/image" in v)
        if not cover:
            log_debug("Cover not found")
            return
        self.cover_url = urljoin(self.site.root, cover.get("src"))
        log_debug(f"Cover URL = {self.cover_url}")

    # ------------------------------------------------------------
    def prepare_output_folder(self):
        self.base_path = Path("output") / self.book_title / self.site.name
        self.base_path.mkdir(parents=True, exist_ok=True)
        if self.cover_url:
            self.download_cover()

    def download_cover(self):
        log_debug(f"Downloading cover: {self.cover_url}")
        resp = requests.get(
            self.cover_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        if resp.status_code != 200:
            return
        if "html" in resp.headers.get("Content-Type", ""):
            return
        try:
            img = Image.open(BytesIO(resp.content))
        except Exception:
            # Not a decodable image (e.g. an error page); skip the cover.
            return
        img.save(self.base_path / "cover.jpg")
        log_debug("Cover saved")

    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        url = urljoin(self.site.root, href)
        parsed = urlparse(url)
        bp = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"
        return self.get_doc_with_retry(url)

    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        cont = soup.select_one(self.site.chapter_list_selector)
        items = cont.select("ul li a[href]")
        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1
        self.total_chapters = len(self.chapters)
        log_debug(f"Found {self.total_chapters} chapters")

    # ------------------------------------------------------------
    def prepare_volume_folders(self):
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        num_vols = (self.total_chapters + max_size - 1) // max_size
        for v in range(1, num_vols + 1):
            d = self.base_path / f"v{v}"
            d.mkdir(parents=True, exist_ok=True)
            self.volume_dirs[v] = d
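
    # Output layout produced by the methods below:
    #   output/<book_title>/<site_name>/v<volume>/<NNNNN>_<chapter title>.txt
    # where volume = ((chapter_number - 1) // MAX_VOL_SIZE) + 1 and NNNNN is
    # the chapter number zero-padded to five digits. A chapter is skipped if a
    # file with the expected name already exists and is larger than 300 bytes.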

    # ------------------------------------------------------------
    def download_all(self):
        for ch in self.chapters:
            self.download_chapter(ch)

    def download_some(self, limit):
        for ch in self.chapters[:limit]:
            self.download_chapter(ch)

    # ------------------------------------------------------------
    def download_chapter(self, ch):
        # Determine volume + filename
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        volume = ((ch.number - 1) // max_size) + 1
        vdir = self.volume_dirs.get(volume, self.base_path)
        expected_name = f"{ch.number:05d}_{ch.title}.txt"
        fname = vdir / expected_name
        expected_full_path = str(fname.resolve())
        # STRICT SKIP CHECK
        if fname.exists() and fname.is_file():
            actual_size = fname.stat().st_size
            # correct name?
            if fname.name == expected_name:
                expected_dir = str(vdir.resolve())
                actual_dir = str(fname.parent.resolve())
                if expected_dir == actual_dir:
                    if actual_size > 300:
                        log_debug(
                            f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
                            f" Path: {expected_full_path}\n"
                            f" Size: {actual_size} bytes"
                        )
                        return
                    else:
                        log_debug(
                            f"Existing file too small ({actual_size} bytes), redownloading: {expected_full_path}"
                        )
                else:
                    log_debug(
                        f"Directory mismatch for chapter {ch.number}, redownloading"
                    )
            else:
                log_debug(
                    f"Filename mismatch for chapter {ch.number}, redownloading\n"
                    f" Expected: {expected_name}\n"
                    f" Found: {fname.name}"
                )
        # PROGRESS INFO
        percent = (ch.number / self.total_chapters) * 100
        elapsed = time.time() - self.start_time
        avg_time = elapsed / max(ch.number - 1, 1)
        remaining = self.total_chapters - ch.number
        eta_seconds = max(0, remaining * avg_time)
        eta_min = int(eta_seconds // 60)
        eta_sec = int(eta_seconds % 60)
        log_debug(
            f"Fetching chapter {ch.number}/{self.total_chapters} "
            f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
            f"{ch.title}"
        )
        # RETRY EMPTY CONTENT
        attempt = 1
        while True:
            soup = self.get_doc_with_retry(ch.url)
            text = self.parse_chapter_text(soup)
            if text.strip():
                ch.text = text
                break
            wait = min(10 + attempt, 30)
            log_debug(f"Empty chapter → retry in {wait}s")
            time.sleep(wait)
            attempt += 1
        fname.write_text(ch.text, encoding="utf-8")
        log_debug(f"Saved chapter to v{volume}: {fname}")
        chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
        log_debug(f"Throttling {chapter_delay}s before next chapter")
        time.sleep(chapter_delay)

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        body = soup.body
        if not body:
            return ""
        h1 = body.find("h1")
        if not h1:
            return ""
        parts = []
        collecting = False
        for sib in h1.next_siblings:
            # Tags expose their CSS classes via .attrs; plain strings have none.
            # (getattr(sib, "class", None) never returns the class list in bs4.)
            classes = getattr(sib, "attrs", {}).get("class")
            if classes == ["toplink"]:
                continue
            if classes == ["bottomlink"]:
                break
            if getattr(sib, "name", None) in ["script", "style"]:
                continue
            if not collecting:
                # Chapter text starts after the first <br> following the <h1>.
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue
            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        raw = "\n".join(parts)
        raw = clean_text(raw, self.site.replacements)
        return raw.strip()
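
# Example usage (sketch, not part of the original file): the `site` argument is
# expected to provide the attributes accessed above (.name, .root, .encoding,
# .replacements, .chapter_list_selector). A hypothetical invocation:
#
#     site = SomeSiteConfig()          # supplies the attributes listed above
#     scraper = BookScraper(site, "https://example.com/book/12345/")
#     result = scraper.execute()       # returns {"title": <book title>}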