# ============================================================
# File: scraper/sites/piaotian.py
# Purpose:
# Concrete SiteScraper implementation for ptwxz.com (Piaotian).
# Moves all parsing logic out of BookScraper.
# ============================================================
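# Note: the SiteScraper base class is not shown in this file; the code
# below assumes it defines the hooks overridden here (parse_title,
# parse_author, parse_description, parse_cover, extract_chapter_page_url,
# parse_chapter_list) plus the root / encoding / chapter_list_selector
# class attributes.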
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from scraper.sites.base import SiteScraper


class PiaotianScraper(SiteScraper):
    root = "https://www.ptwxz.com"
    encoding = "GB18030"  # Piaotian serves GB-encoded pages, not UTF-8
    chapter_list_selector = "div.centent"  # "centent" (sic): the site's own class name

    # ------------------------------------------------------------
    # METADATA PARSING
    # ------------------------------------------------------------
    def parse_title(self, soup: BeautifulSoup) -> str:
        h1 = soup.find("h1")
        return h1.get_text(strip=True) if h1 else "UnknownBook"

    def parse_author(self, soup: BeautifulSoup) -> str:
        # Find the info-table cell carrying the author label (e.g. "作者：名字");
        # the fullwidth colon separates label from value.
        td = soup.find("td", string=lambda t: t and "作" in t and "者" in t)
        raw = td.get_text(strip=True) if td else ""
        return raw.split("：")[1] if "：" in raw else "UnknownAuthor"

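    # Assumed page shape for parse_description below: the blurb follows a
    # "内容简介" label as bare text / <br> siblings, roughly
    #   <span>内容简介：</span> 第一段<br>第二段 ... <span>...</span>
    # so we collect sibling text until the next <span> label starts.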
    def parse_description(self, soup: BeautifulSoup) -> str:
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            return ""
        parts = []
        for sib in span.next_siblings:
            # stop when the next <span> label reappears
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)
        return "\n".join(parts)

    # ------------------------------------------------------------
    # COVER PARSING
    # (ported verbatim from BookScraper._parse_cover)
    # ------------------------------------------------------------
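    # Covers are matched purely by URL shape. For an illustrative,
    # hypothetical book at "/bookinfo/12/12345.html", priority 1 looks for
    # a src containing "/files/article/image/12/12345/" and priority 2 for
    # a src ending in "/12345s.jpg".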
    def parse_cover(self, soup: BeautifulSoup, url: str) -> str | None:
        # Extract book_id from the URL
        m = re.search(r"/(\d+)\.html$", url)
        if not m:
            return None
        book_id = m.group(1)

        # Extract the volume directory (bookinfo/<vol>/<id>.html)
        m2 = re.search(r"/bookinfo/(\d+)/", url)
        volume = m2.group(1) if m2 else None

        imgs = soup.find_all("img", src=True)
        chosen = None

        # Priority 1: match "/files/article/image/{vol}/{book_id}/"
        if volume:
            target_path = f"/files/article/image/{volume}/{book_id}/"
            for img in imgs:
                src = img["src"]
                if target_path in src:
                    chosen = src
                    break

        # Priority 2: endswith "/{book_id}s.jpg"
        if not chosen:
            target_suffix = f"/{book_id}s.jpg"
            for img in imgs:
                src = img["src"]
                if src.endswith(target_suffix):
                    chosen = src
                    break

        if not chosen:
            return None
        return urljoin(self.root, chosen)

    # ------------------------------------------------------------
    # CHAPTER EXTRACTION
    # ------------------------------------------------------------
    def extract_chapter_page_url(self, soup: BeautifulSoup) -> str:
        # Positional selector into the book info page; brittle by nature,
        # so fail loudly if the layout changes instead of raising a bare
        # AttributeError on None.
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        link = node.select_one("a") if node else None
        if link is None:
            raise ValueError("chapter-index link not found on book info page")
        return urljoin(self.root, link["href"])

    def parse_chapter_list(self, soup: BeautifulSoup) -> list:
        cont = soup.select_one(self.chapter_list_selector)
        items = cont.select("ul li a[href]") if cont else []
        chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full_url = urljoin(self.root, href)
            chapters.append({"num": idx, "title": title, "url": full_url})
            idx += 1
        return chapters
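

# ------------------------------------------------------------
# USAGE SKETCH (illustrative only)
# ------------------------------------------------------------
# A minimal sketch of how this scraper might be driven by hand, assuming
# `requests` is installed, SiteScraper allows a no-argument constructor,
# and a live book info URL; the real entry point is BookScraper, which is
# not shown in this file. The book id below is hypothetical.
if __name__ == "__main__":
    import requests

    url = "https://www.ptwxz.com/bookinfo/12/12345.html"  # hypothetical
    scraper = PiaotianScraper()

    resp = requests.get(url, timeout=30)
    html = resp.content.decode(scraper.encoding, errors="replace")
    soup = BeautifulSoup(html, "html.parser")

    print(scraper.parse_title(soup))
    print(scraper.parse_author(soup))
    print(scraper.parse_cover(soup, url))

    # The chapter index lives on a separate page.
    index_url = scraper.extract_chapter_page_url(soup)
    index_html = requests.get(index_url, timeout=30).content.decode(
        scraper.encoding, errors="replace"
    )
    chapters = scraper.parse_chapter_list(BeautifulSoup(index_html, "html.parser"))
    print(f"{len(chapters)} chapters found")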