import os
import time
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from PIL import Image

from scraper.logger import setup_logger, LOG_BUFFER
from scraper.utils import clean_text, load_replacements

load_dotenv()
logger = setup_logger()
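# The scraper is configured entirely through environment variables read below.
# A minimal .env picked up by load_dotenv() could look like this (illustrative
# values only; every key is optional and falls back to the default shown):
#   DRY_RUN=0                # "1" downloads only the first TEST_CHAPTER_LIMIT chapters
#   TEST_CHAPTER_LIMIT=10
#   MAX_VOL_SIZE=1500        # chapters per output volume
#   MAX_DL_PER_SEC=2         # request throttle used in get_document()
#   OUTPUT_DIR=./output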

class Chapter:
    """A single chapter: its index, title, source URL and (once fetched) its text."""

    def __init__(self, number, title, url):
        self.number = number
        self.title = title
        self.url = url
        self.text = ""

class BookScraper:
    def __init__(self, site, url):
        self.site = site
        self.url = url

        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""

        self.chapters = []
        self.chapter_base = None
        self.base_path = None

        # ENV settings
        self.DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
        self.TEST_CHAPTER_LIMIT = int(os.getenv("TEST_CHAPTER_LIMIT", "10"))
        self.MAX_VOL_SIZE = int(os.getenv("MAX_VOL_SIZE", "1500"))
        self.MAX_DL_PER_SEC = int(os.getenv("MAX_DL_PER_SEC", "2"))

        # Load text replacements
        self.replacements = load_replacements("replacements.txt")

    # -----------------------------------------------------
    def execute(self):
        LOG_BUFFER.seek(0)
        LOG_BUFFER.truncate(0)

        logger.debug("Starting scraper for %s", self.url)
        soup = self.get_document(self.url)

        self.parse_title(soup)
        self.parse_author(soup)
        self.parse_description(soup)
        self.parse_cover(soup)
        self.prepare_output_folder()

        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)

        if self.DRY_RUN:
            logger.debug(
                "DRY RUN → downloading only first %s chapters", self.TEST_CHAPTER_LIMIT)
            self.get_some_chapters(self.TEST_CHAPTER_LIMIT)
        else:
            self.get_all_chapters()
            self.split_into_volumes()

        return {
            "title": self.book_title,
            "debug": LOG_BUFFER.getvalue()
        }

    # -----------------------------------------------------
    # NETWORK
    # -----------------------------------------------------
    def get_document(self, url):
        logger.debug("GET %s", url)
        # Simple throttle: space requests so we stay at or below MAX_DL_PER_SEC.
        time.sleep(1 / max(1, self.MAX_DL_PER_SEC))

        resp = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.encoding = self.site.encoding

        logger.debug("HTTP %s for %s", resp.status_code, url)
        return BeautifulSoup(resp.text, "lxml")

    # -----------------------------------------------------
    # BASIC PARSERS (piaotia structure)
    # -----------------------------------------------------
    def parse_title(self, soup):
        h1 = soup.find("h1")
        if h1:
            self.book_title = h1.get_text(strip=True)
        else:
            self.book_title = "UnknownTitle"
        logger.debug("Book title: %s", self.book_title)

    def parse_author(self, soup):
        td = soup.find("td", string=lambda t: t and "作" in t and "者" in t)
        if td:
            # Normalize a possible full-width colon before splitting on ":".
            raw = td.get_text(strip=True).replace("：", ":")
            if ":" in raw:
                self.book_author = raw.split(":", 1)[1].strip()
            else:
                self.book_author = "UnknownAuthor"
        else:
            self.book_author = "UnknownAuthor"
        logger.debug("Book author: %s", self.book_author)

    def parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            self.book_description = ""
            return

        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            txt = sib.strip() if isinstance(sib, str) else sib.get_text(strip=True)
            if txt:
                parts.append(txt)

        self.book_description = "\n".join(parts)
        logger.debug("Description parsed (%s chars)",
                     len(self.book_description))

    def parse_cover(self, soup):
        selector = (
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table "
            "> tr:nth-of-type(4) > td:nth-of-type(1) > table > tr:nth-of-type(1) "
            "> td:nth-of-type(2) > a:nth-of-type(1) > img"
        )
        img = soup.select_one(selector)
        if img:
            self.cover_url = urljoin(self.site.root, img.get("src"))
        else:
            logger.debug("Cover not found!")
        logger.debug("Cover URL = %s", self.cover_url)

    # -----------------------------------------------------
    def prepare_output_folder(self):
        output_root = os.getenv("OUTPUT_DIR", "./output")
        self.base_path = Path(output_root) / self.book_title / self.site.name
        self.base_path.mkdir(parents=True, exist_ok=True)
        logger.debug("Output directory: %s", self.base_path)

        if self.cover_url:
            self.save_image(self.cover_url, self.base_path / "cover.jpg")
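    # Resulting layout under OUTPUT_DIR (default ./output):
    #   <book title>/<site name>/cover.jpg           - cover, if one was found
    #   <book title>/<site name>/v<N>/00001_<chapter title>.txt
    # Dry runs write the test chapters directly into <book title>/<site name>/.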

    def save_image(self, url, path):
        logger.debug("Downloading cover: %s", url)
        resp = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if resp.status_code == 200:
            img = Image.open(BytesIO(resp.content))
            img.save(path)
            logger.debug("Cover saved to %s", path)

    # -----------------------------------------------------
    # CHAPTER PAGE
    # -----------------------------------------------------
    def get_chapter_page(self, soup):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table")
        link = node.select_one("a")
        href = link.get("href")
        chapter_url = urljoin(self.site.root, href)

        # Chapter links on the index page are relative, so remember the
        # directory of the index URL as the base for resolving them.
        parsed = urlparse(chapter_url)
        base = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{base}"

        logger.debug("Chapter index URL = %s", chapter_url)
        logger.debug("CHAPTER_BASE = %s", self.chapter_base)

        return self.get_document(chapter_url)

    def parse_chapter_links(self, soup):
        # Note the spelling: the chapter list container uses class "centent".
        container = soup.select_one("div.centent")
        links = container.select("ul li a[href]")

        for i, a in enumerate(links, 1):
            href = a.get("href")
            if not href.endswith(".html"):
                continue

            abs_url = urljoin(self.chapter_base, href)
            title = a.get_text(strip=True)
            self.chapters.append(Chapter(i, title, abs_url))

        logger.debug("Total chapters: %s", len(self.chapters))

    # -----------------------------------------------------
    # DOWNLOAD CHAPTERS
    # -----------------------------------------------------
    def get_all_chapters(self):
        for ch in self.chapters:
            ch.text = self.fetch_chapter(ch)
            logger.debug("CH %s length = %s", ch.number, len(ch.text))

    def get_some_chapters(self, limit):
        for ch in self.chapters[:limit]:
            ch.text = self.fetch_chapter(ch)
            filename = self.base_path / f"{ch.number:05d}_{ch.title}.txt"
            filename.write_text(ch.text, encoding="utf-8")
            logger.debug("Saved test chapter: %s", filename)

    def fetch_chapter(self, ch):
        soup = self.get_document(ch.url)
        text = self.parse_chapter_text(soup)
        return clean_text(text, self.replacements)

    def parse_chapter_text(self, soup):
        body = soup.body
        h1 = body.find("h1")

        parts = []
        collecting = False

        for sib in h1.next_siblings:
            # Stop at the bottom navigation block; skip the top one and any scripts.
            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
                break
            if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
                continue
            if getattr(sib, "name", None) in ["script", "style"]:
                continue

            # The chapter body only starts after the first <br>; ignore
            # whatever comes before it.
            if not collecting:
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue

            txt = sib.strip() if isinstance(sib, str) else sib.get_text("\n", strip=True)
            if txt:
                parts.append(txt)

        return "\n".join(parts).strip()

    # -----------------------------------------------------
    # SPLIT VOLUMES
    # -----------------------------------------------------
    def split_into_volumes(self):
        logger.debug(
            "Splitting into volumes (max %s chapters per volume)", self.MAX_VOL_SIZE)

        total_chapters = len(self.chapters)
        volume = 1
        index = 0

        while index < total_chapters:
            chunk = self.chapters[index:index + self.MAX_VOL_SIZE]
            volume_dir = self.base_path / f"v{volume}"
            volume_dir.mkdir(exist_ok=True)

            for ch in chunk:
                filename = volume_dir / f"{ch.number:05d}_{ch.title}.txt"
                filename.write_text(ch.text, encoding="utf-8")

            logger.debug("Volume %s saved (%s chapters)", volume, len(chunk))
            volume += 1
            index += self.MAX_VOL_SIZE
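

# ------------------------------------------------------------------
# Minimal usage sketch. The real project presumably defines its own site
# object elsewhere; the SimpleNamespace below is only a hypothetical stand-in
# supplying the three attributes this module actually reads from `site`:
# name, root and encoding. URLs and values are placeholders.
# ------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_site = SimpleNamespace(
        name="piaotia",                    # used as a sub-folder of the output path
        root="https://www.example.com/",   # placeholder site root
        encoding="gbk",                    # assumed page encoding, adjust as needed
    )
    # With DRY_RUN=1 in the environment only the first TEST_CHAPTER_LIMIT
    # chapters are fetched; otherwise the full book is split into volumes.
    scraper = BookScraper(demo_site, "https://www.example.com/book/12345/")
    result = scraper.execute()
    print(result["title"])
    print(result["debug"])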