# scraper/book_scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

from scraper.logger import log_debug
from scraper.utils import load_replacements
from scraper.models.book_state import Chapter


class BookScraper:
    """
    Lightweight scraper: only metadata + chapter list.
    All downloading/parsing/saving is handled by Celery tasks.
    """

    def __init__(self, site, url):
        self.site = site
        self.url = url
        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""
        self.chapters = []
        self.chapter_base = None

        # Load custom replacements and merge them into the site config
        extra = load_replacements("replacements.txt")
        self.site.replacements.update(extra)

    # ------------------------------------------------------------
    def parse_book_info(self):
        """Parse title, author, description, cover from the main page."""
        soup = self._fetch(self.url)

        self._parse_title(soup)
        self._parse_author(soup)
        self._parse_description(soup)
        self._parse_cover(soup)

        # Parse the chapter-list page and collect chapter links
        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)

    # ------------------------------------------------------------
    def _fetch(self, url):
        """Simple fetch (no retry); DownloadController handles errors."""
        log_debug(f"[BookScraper] Fetch: {url}")
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()  # surface HTTP errors to the caller
        resp.encoding = self.site.encoding
        return BeautifulSoup(resp.text, "lxml")

    # ------------------------------------------------------------
    def _parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"[BookScraper] Title = {self.book_title}")

    def _parse_author(self, soup):
        # The author sits in a <td> like "作者:Some Name" ("作" = "author")
        td = soup.find("td", string=lambda t: t and "作" in t)
        raw = td.get_text(strip=True) if td else ""
        self.book_author = (
            raw.split(":", 1)[1].strip() if ":" in raw else "UnknownAuthor"
        )
        log_debug(f"[BookScraper] Author = {self.book_author}")

    def _parse_description(self, soup):
        # "内容简介" is the "synopsis" heading that precedes the description
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            self.book_description = ""
            log_debug("[BookScraper] Description not found")
            return

        parts = []
        for sib in span.next_siblings:
            # Stop when the next section's <span> heading begins
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        self.book_description = "\n".join(parts)
        log_debug(
            f"[BookScraper] Description length = {len(self.book_description)} characters"
        )

    # ------------------------------------------------------------
    def _parse_cover(self, soup):
        cover = soup.find("img", src=lambda v: v and "files/article/image" in v)
        if not cover:
            log_debug("[BookScraper] No cover found")
            return
        self.cover_url = urljoin(self.site.root, cover.get("src"))
        log_debug(f"[BookScraper] Cover URL = {self.cover_url}")

    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        """Return BeautifulSoup of the main chapter-list page."""
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        link = node.select_one("a") if node else None
        if link is None:
            raise ValueError(
                "Chapter-list link not found; the page layout may have changed"
            )
        url = urljoin(self.site.root, link.get("href"))
        parsed = urlparse(url)
        # Keep the directory part of the path so relative chapter
        # links can be resolved against it later.
        bp = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"
        return self._fetch(url)
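
    # Worked example of the base-URL derivation above (illustrative
    # values only): for a chapter-list URL of
    #     https://example.com/book/123/index.html
    # urlparse() gives path "/book/123/index.html", so chapter_base
    # becomes "https://example.com/book/123/"; a relative href such as
    # "456.html" then resolves via urljoin() to
    #     https://example.com/book/123/456.html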

    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        """Collect Chapter entries from the chapter-list page."""
        cont = soup.select_one(self.site.chapter_list_selector)
        if cont is None:
            raise ValueError(
                "Chapter-list container not found; check chapter_list_selector"
            )
        items = cont.select("ul li a[href]")
        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            # Skip non-chapter links (pagination, ads, etc.)
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1
        log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")

    # ------------------------------------------------------------
    def get_chapter_list(self):
        """Return the chapter list (DownloadController reads this)."""
        return self.chapters
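

# ------------------------------------------------------------
# Usage sketch (illustrative only). This mirrors how DownloadController
# is expected to drive the class. The SimpleNamespace below is a
# hypothetical stand-in for the project's real site config object: the
# root URL, encoding, and CSS selector are placeholder values, and
# load_replacements() in __init__ expects a replacements.txt in the
# working directory.
if __name__ == "__main__":
    from types import SimpleNamespace

    site = SimpleNamespace(
        root="https://example.com/",        # placeholder site root
        encoding="utf-8",                   # placeholder page encoding
        chapter_list_selector="div.list",   # placeholder CSS selector
        replacements={},
    )
    scraper = BookScraper(site, "https://example.com/book/1/")
    scraper.parse_book_info()               # metadata + chapter links
    for ch in scraper.get_chapter_list():
        print(ch)                           # Chapter(idx, title, url)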