# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
#   Concrete SiteScraper implementation for ptwxz.com (Piaotian).
#   Moves all parsing logic out of BookScraper.
# ============================================================

import re
from typing import Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from scraper.sites.base import SiteScraper


class PiaotianScraper(SiteScraper):
    root = "https://www.ptwxz.com"
    encoding = "GB18030"
    # Note: "centent" is the class name actually used in the site's markup.
    chapter_list_selector = "div.centent"

    # ------------------------------------------------------------
    # METADATA PARSING
    # ------------------------------------------------------------
    def parse_title(self, soup: BeautifulSoup) -> str:
        h1 = soup.find("h1")
        return h1.get_text(strip=True) if h1 else "UnknownBook"

    def parse_author(self, soup: BeautifulSoup) -> str:
        td = soup.find("td", string=lambda t: t and "作" in t)
        raw = td.get_text(strip=True) if td else ""
        # The label may use either a fullwidth or an ASCII colon.
        raw = raw.replace("：", ":")
        return raw.split(":")[1].strip() if ":" in raw else "UnknownAuthor"

    def parse_description(self, soup: BeautifulSoup) -> str:
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            return ""
        parts = []
        for sib in span.next_siblings:
            # Stop when the next <span> label appears.
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        return "\n".join(parts)

    # ------------------------------------------------------------
    # COVER PARSING
    # (ported unchanged from BookScraper._parse_cover)
    # ------------------------------------------------------------
    def parse_cover(self, soup: BeautifulSoup, url: str) -> Optional[str]:
        # Extract book_id from the URL.
        m = re.search(r"/(\d+)\.html$", url)
        if not m:
            return None
        book_id = m.group(1)

        # Extract the volume directory from /bookinfo/<vol>/<id>.html.
        m2 = re.search(r"/bookinfo/(\d+)/", url)
        volume = m2.group(1) if m2 else None

        imgs = soup.find_all("img", src=True)
        chosen = None

        # Priority 1: match "/files/article/image/{vol}/{book_id}/".
        if volume:
            target_path = f"/files/article/image/{volume}/{book_id}/"
            for img in imgs:
                src = img["src"]
                if target_path in src:
                    chosen = src
                    break

        # Priority 2: src ends with "/{book_id}s.jpg".
        if not chosen:
            target_suffix = f"/{book_id}s.jpg"
            for img in imgs:
                src = img["src"]
                if src.endswith(target_suffix):
                    chosen = src
                    break

        if not chosen:
            return None
        return urljoin(self.root, chosen)

    # ------------------------------------------------------------
    # CHAPTER EXTRACTION
    # ------------------------------------------------------------
    def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
        # The book info page links to the chapter index from a fixed
        # position in the page layout.
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        return urljoin(self.root, href)

    def parse_chapter_list(self, soup: BeautifulSoup) -> list:
        cont = soup.select_one(self.chapter_list_selector)
        items = cont.select("ul li a[href]") if cont else []
        chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            # Skip entries that do not point at a chapter page.
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full_url = urljoin(self.root, href)
            chapters.append({"num": idx, "title": title, "url": full_url})
            idx += 1
        return chapters
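

# ------------------------------------------------------------
# USAGE SKETCH (illustrative only, not part of the scraper pipeline)
# A minimal example of driving this scraper directly. Assumptions
# not taken from this file: SiteScraper can be instantiated with no
# arguments, `requests` is available, and the example URL is a
# hypothetical /bookinfo/<vol>/<id>.html book page.
# ------------------------------------------------------------
if __name__ == "__main__":
    import requests

    url = "https://www.ptwxz.com/bookinfo/0/123.html"  # hypothetical book page

    resp = requests.get(url, timeout=30)
    resp.encoding = PiaotianScraper.encoding  # pages are GB18030-encoded
    soup = BeautifulSoup(resp.text, "html.parser")

    scraper = PiaotianScraper()
    print("title :", scraper.parse_title(soup))
    print("author:", scraper.parse_author(soup))
    print("cover :", scraper.parse_cover(soup, url))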