# ============================================================
# File: scraper/sites/piaotian.py
#
# Purpose:
#   Concrete SiteScraper implementation for ptwxz.com (Piaotian).
#   Moves all parsing logic out of BookScraper.
# ============================================================

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from scraper.sites.base import SiteScraper


class PiaotianScraper(SiteScraper):
    root = "https://www.ptwxz.com"
    encoding = "GB18030"
    chapter_list_selector = "div.centent"

    # ------------------------------------------------------------
    # METADATA PARSING
    # ------------------------------------------------------------
    def parse_title(self, soup: BeautifulSoup) -> str:
        h1 = soup.find("h1")
        return h1.get_text(strip=True) if h1 else "UnknownBook"

    def parse_author(self, soup: BeautifulSoup) -> str:
        td = soup.find("td", string=lambda t: t and "作" in t)
        raw = td.get_text(strip=True) if td else ""
        # The author cell usually reads "作    者：<name>"; the site uses a
        # full-width colon, so accept both full-width and ASCII separators.
        for sep in ("：", ":"):
            if sep in raw:
                return raw.split(sep, 1)[1].strip()
        return "UnknownAuthor"

    def parse_description(self, soup: BeautifulSoup) -> str:
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            return ""

        parts = []
        for sib in span.next_siblings:
            # Stop when the next <span> heading appears.
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        return "\n".join(parts)

    # ------------------------------------------------------------
    # COVER PARSING
    # (mirrors the original BookScraper._parse_cover logic)
    # ------------------------------------------------------------
    def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None:
        # Extract book_id from the URL
        m = re.search(r"/(\d+)\.html$", url)
        if not m:
            return None
        book_id = m.group(1)

        # Extract vol (bookinfo/<vol>/<id>.html)
        m2 = re.search(r"/bookinfo/(\d+)/", url)
        volume = m2.group(1) if m2 else None

        imgs = soup.find_all("img", src=True)
        chosen = None

        # Priority 1: src contains "/files/article/image/{vol}/{book_id}/"
        if volume:
            target_path = f"/files/article/image/{volume}/{book_id}/"
            for img in imgs:
                src = img["src"]
                if target_path in src:
                    chosen = src
                    break

        # Priority 2: src ends with "/{book_id}s.jpg"
        if not chosen:
            target_suffix = f"/{book_id}s.jpg"
            for img in imgs:
                src = img["src"]
                if src.endswith(target_suffix):
                    chosen = src
                    break

        if not chosen:
            return None
        return urljoin(self.root, chosen)

    # ------------------------------------------------------------
    # CHAPTER EXTRACTION
    # ------------------------------------------------------------
    def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
        # The chapter-index link sits inside a fixed table on the book info
        # page; this positional selector is brittle if the layout changes.
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        link = node.select_one("a") if node else None
        if link is None or not link.get("href"):
            raise ValueError("Could not locate the chapter list link on the book page")
        return urljoin(self.root, link["href"])

    def parse_chapter_list(self, soup: BeautifulSoup) -> list[dict]:
        cont = soup.select_one(self.chapter_list_selector)
        items = cont.select("ul li a[href]") if cont else []

        chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            # Skip anchors that do not point at a chapter page.
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full_url = urljoin(self.root, href)
            chapters.append({"num": idx, "title": title, "url": full_url})
            idx += 1

        return chapters
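

# ------------------------------------------------------------
# USAGE SKETCH
# ------------------------------------------------------------
# A minimal, illustrative sketch of how a caller such as BookScraper might
# drive this class. The `requests` dependency, the placeholder URL, and the
# fetch/decode flow below are assumptions, not part of this module.
if __name__ == "__main__":
    import requests

    book_url = "https://www.ptwxz.com/bookinfo/0/1.html"  # placeholder URL
    scraper = PiaotianScraper()

    resp = requests.get(book_url)
    resp.encoding = scraper.encoding  # Piaotian pages are GB18030-encoded
    soup = BeautifulSoup(resp.text, "html.parser")

    print(scraper.parse_title(soup))
    print(scraper.parse_author(soup))
    print(scraper.parse_cover(soup, book_url))

    # Follow the link to the chapter index and list its chapters.
    index_url = scraper.extract_chapter_page_url(soup)
    index_resp = requests.get(index_url)
    index_resp.encoding = scraper.encoding
    index_soup = BeautifulSoup(index_resp.text, "html.parser")
    print(len(scraper.parse_chapter_list(index_soup)))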