# scraper/book_scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements
from scraper.models.book_state import Chapter


class BookScraper:
    """
    Minimal scraper: only metadata + chapter list.

    The DownloadController handles Celery pipelines for:
        - download
        - parse
        - save
    """

    def __init__(self, site, url):
        self.site = site
        self.url = url
        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""
        self.chapter_base = None
        self.chapters = []

        # Load custom replacements
        extra = load_replacements("replacements.txt")
        self.site.replacements.update(extra)

    # ------------------------------------------------------------
    def execute(self):
        """Main entry point. Returns metadata + chapter URLs."""
        soup = self._fetch(self.url)

        self._parse_title(soup)
        self._parse_author(soup)
        self._parse_description(soup)
        self._parse_cover(soup)

        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)

        log_debug("[BookScraper] Completed metadata parse")

        return {
            "title": self.book_title,
            "author": self.book_author,
            "description": self.book_description,
            "cover_url": self.cover_url,  # ← used by DownloadController
            "book_url": self.url,
            "chapters": [
                {"num": ch.number, "title": ch.title, "url": ch.url}
                for ch in self.chapters
            ],
        }

    # ------------------------------------------------------------
    def _fetch(self, url):
        log_debug(f"[BookScraper] Fetch: {url}")
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.encoding = self.site.encoding
        return BeautifulSoup(resp.text, "lxml")

    # ------------------------------------------------------------
    def _parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"[BookScraper] Title = {self.book_title}")

    def _parse_author(self, soup):
        td = soup.find("td", string=lambda t: t and "作" in t)
        raw = td.get_text(strip=True) if td else ""
        # Normalize full-width colons so both "作者：xxx" and "作者:xxx" split correctly.
        raw = raw.replace("：", ":")
        self.book_author = raw.split(":")[1] if ":" in raw else "UnknownAuthor"
        log_debug(f"[BookScraper] Author = {self.book_author}")

    def _parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            self.book_description = ""
            log_debug("[BookScraper] Description not found")
            return

        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        self.book_description = clean_text("\n".join(parts), self.site.replacements)
        log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
    # ------------------------------------------------------------
    def _parse_cover(self, soup):
        """
        Extract the correct cover based on book_id path logic.

        1. primary:  match "/files/article/image/{vol}/{book_id}/"
        2. fallback: endswith "/{book_id}s.jpg"
        """
        # Extract book_id from URL
        m = re.search(r"/(\d+)\.html$", self.url)
        if not m:
            log_debug("[BookScraper] No book_id found in URL → cannot match cover")
            return
        book_id = m.group(1)

        # Extract the volume folder from the URL (e.g. /bookinfo/{vol}/{book_id}.html)
        m2 = re.search(r"/bookinfo/(\d+)/", self.url)
        volume = m2.group(1) if m2 else None

        log_debug(f"[BookScraper] Book ID = {book_id}, Volume = {volume}")

        imgs = soup.find_all("img", src=True)
        chosen = None

        # --------------------------------------------------------
        # PRIORITY 1: Path-match
        #   /files/article/image/{vol}/{book_id}/
        # --------------------------------------------------------
        if volume:
            target_path = f"/files/article/image/{volume}/{book_id}/"
            for img in imgs:
                src = img["src"]
                if target_path in src:
                    chosen = src
                    log_debug(f"[BookScraper] Cover matched by PATH: {src}")
                    break

        # --------------------------------------------------------
        # PRIORITY 2: endswith "/{book_id}s.jpg"
        # --------------------------------------------------------
        if not chosen:
            target_suffix = f"/{book_id}s.jpg"
            for img in imgs:
                src = img["src"]
                if src.endswith(target_suffix):
                    chosen = src
                    log_debug(f"[BookScraper] Cover matched by SUFFIX: {src}")
                    break

        # --------------------------------------------------------
        # No match
        # --------------------------------------------------------
        if not chosen:
            log_debug("[BookScraper] No matching cover found")
            return

        self.cover_url = urljoin(self.site.root, chosen)
        log_debug(f"[BookScraper] Cover URL = {self.cover_url}")

    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        """Return BeautifulSoup of the main chapter list page."""
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        chapter_url = urljoin(self.site.root, href)

        # base for chapter links
        parts = chapter_url.rsplit("/", 1)
        self.chapter_base = parts[0] + "/"

        return self._fetch(chapter_url)

    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        cont = soup.select_one(self.site.chapter_list_selector)
        items = cont.select("ul li a[href]")

        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1

        log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")
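

# ----------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the production
# pipeline): in the real project the `site` object comes from the
# site configuration, and the DownloadController consumes the dict
# returned by execute(). Here a stand-in with the attributes this
# class reads (root, encoding, replacements, chapter_list_selector)
# is faked with SimpleNamespace; the URL and selector values are
# placeholders, only the call shape is illustrative.
# ----------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    # Hypothetical site config for illustration only.
    site = SimpleNamespace(
        root="https://example.com/",
        encoding="gbk",
        replacements={},
        chapter_list_selector="div.listmain",
    )

    scraper = BookScraper(site, "https://example.com/bookinfo/12/12345.html")
    meta = scraper.execute()
    print(meta["title"], meta["author"], len(meta["chapters"]))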