import os
import time
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image

from scraper.logger import log_debug
from scraper.utils import clean_text, load_replacements


class Chapter:
    def __init__(self, num, title, url):
        self.number = num
        self.title = title
        self.url = url
        self.text = ""


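# Configuration read from environment variables (defaults in parentheses):
#   DRY_RUN (1)                - when "1", fetch only the first TEST_LIMIT chapters
#   TEST_LIMIT (10)            - number of chapters fetched in a dry run
#   MAX_DOWNLOADS_PER_SEC (1)  - request rate cap enforced by throttle()
#   MAX_VOL_SIZE (200)         - chapters stored per volume folder
#   CHAPTER_DELAY (2)          - extra pause, in seconds, after each saved chapter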
class BookScraper:
    def __init__(self, site, url):
        self.site = site
        self.url = url

        self.book_title = ""
        self.book_author = ""
        self.book_description = ""
        self.cover_url = ""

        self.chapters = []
        self.base_path = None
        self.chapter_base = None

        # ENV
        self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
        self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
        self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
        self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
        self._last_download_time = 0

        # replacements.txt
        fp = os.path.join(os.getcwd(), "replacements.txt")
        extra = load_replacements(fp)
        self.site.replacements.update(extra)

        self.start_time = None
        self.total_chapters = 0
        self.volume_dirs = {}

    # ------------------------------------------------------------
    # RATE LIMITER
    # ------------------------------------------------------------

    def throttle(self):
        # Sleep just enough to respect MAX_DOWNLOADS_PER_SEC between requests.
        now = time.time()
        elapsed = now - self._last_download_time

        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)

        self._last_download_time = time.time()

    # ------------------------------------------------------------
    def execute(self):
        log_debug(f"Starting scraper for {self.url}")

        self.start_time = time.time()

        soup = self.get_doc_with_retry(self.url)
        self.parse_title(soup)
        self.parse_author(soup)
        self.parse_description(soup)
        self.parse_cover(soup)

        self.prepare_output_folder()

        chapter_page = self.get_chapter_page(soup)
        self.parse_chapter_links(chapter_page)
        self.prepare_volume_folders()

        if self.DRY_RUN:
            self.download_some(self.TEST_LIMIT)
        else:
            self.download_all()

        return {"title": self.book_title}

    # ------------------------------------------------------------
    # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        attempt = 1

        while True:
            self.throttle()
            log_debug(f"GET {url} (attempt {attempt})")

            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"Network error {e} → retry in {attempt + 1}s")
                time.sleep(attempt + 1)
                attempt += 1
                continue

            code = resp.status_code
            log_debug(f"HTTP {code} for {url}")

            # 429 → hard cooldown with countdown
            if code == 429:
                cooldown = 60
                log_debug(f"429 detected — cooldown {cooldown}s")
                for i in range(cooldown, 0, -1):
                    log_debug(f"429 cooldown… {i}s remaining")
                    time.sleep(1)
                attempt += 1
                continue

            # recoverable
            if code in (403, 500):
                wait = min(5 * attempt, 30)
                log_debug(f"HTTP {code} → retry in {wait}s")
                time.sleep(wait)
                attempt += 1
                continue

            if code == 200:
                resp.encoding = self.site.encoding
                return BeautifulSoup(resp.text, "lxml")

            # unexpected
            wait = attempt + 1
            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
            time.sleep(wait)
            attempt += 1

    # ------------------------------------------------------------
    def parse_title(self, soup):
        h1 = soup.find("h1")
        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
        log_debug(f"Book title = {self.book_title}")

    def parse_author(self, soup):
        td = soup.find("td", string=lambda t: t and "作" in t)
        text = td.get_text(strip=True) if td else ""
        # Accept both full-width and ASCII colons in the author cell.
        if "：" in text:
            self.book_author = text.split("：", 1)[1]
        elif ":" in text:
            self.book_author = text.split(":", 1)[1]
        else:
            self.book_author = "UnknownAuthor"
        log_debug(f"Book author = {self.book_author}")

    def parse_description(self, soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            log_debug("No description found")
            self.book_description = ""
            return

        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            text = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        self.book_description = "\n".join(parts)
        log_debug(f"Description length = {len(self.book_description)}")

    # ------------------------------------------------------------
    def parse_cover(self, soup):
        cover = soup.find(
            "img", src=lambda v: v and "files/article/image" in v
        )
        if not cover:
            log_debug("Cover not found")
            return

        self.cover_url = urljoin(self.site.root, cover.get("src"))
        log_debug(f"Cover URL = {self.cover_url}")

    # ------------------------------------------------------------
    def prepare_output_folder(self):
        self.base_path = Path("output") / self.book_title / self.site.name
        self.base_path.mkdir(parents=True, exist_ok=True)

        if self.cover_url:
            self.download_cover()

    def download_cover(self):
        log_debug(f"Downloading cover: {self.cover_url}")

        try:
            resp = requests.get(
                self.cover_url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=10,
            )
        except Exception as e:
            log_debug(f"Cover download failed: {e}")
            return

        if resp.status_code != 200:
            return

        if "html" in resp.headers.get("Content-Type", ""):
            return

        try:
            img = Image.open(BytesIO(resp.content))
        except Exception:
            return

        # JPEG cannot store an alpha channel, so normalise the mode first.
        img.convert("RGB").save(self.base_path / "cover.jpg")
        log_debug("Cover saved")

    # ------------------------------------------------------------
    def get_chapter_page(self, soup):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        href = node.select_one("a").get("href")
        url = urljoin(self.site.root, href)

        # Remember the directory of the chapter index so relative chapter
        # links can be resolved against it later.
        parsed = urlparse(url)
        bp = parsed.path.rsplit("/", 1)[0] + "/"
        self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"

        return self.get_doc_with_retry(url)

    # ------------------------------------------------------------
    def parse_chapter_links(self, soup):
        cont = soup.select_one(self.site.chapter_list_selector)
        items = cont.select("ul li a[href]")

        self.chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(self.chapter_base, href)
            self.chapters.append(Chapter(idx, title, full))
            idx += 1

        self.total_chapters = len(self.chapters)
        log_debug(f"Found {self.total_chapters} chapters")

    # ------------------------------------------------------------
    def prepare_volume_folders(self):
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        # Ceiling division: one folder per block of MAX_VOL_SIZE chapters.
        num_vols = (self.total_chapters + max_size - 1) // max_size

        for v in range(1, num_vols + 1):
            d = self.base_path / f"v{v}"
            d.mkdir(parents=True, exist_ok=True)
            self.volume_dirs[v] = d

    # ------------------------------------------------------------
    def download_all(self):
        for ch in self.chapters:
            self.download_chapter(ch)

    def download_some(self, limit):
        for ch in self.chapters[:limit]:
            self.download_chapter(ch)

    # ------------------------------------------------------------
    def download_chapter(self, ch):
        # Determine volume + filename
        max_size = int(os.getenv("MAX_VOL_SIZE", "200"))
        volume = ((ch.number - 1) // max_size) + 1
        vdir = self.volume_dirs.get(volume, self.base_path)

        expected_name = f"{ch.number:05d}_{ch.title}.txt"
        fname = vdir / expected_name
        expected_full_path = str(fname.resolve())

        # STRICT SKIP CHECK
        if fname.exists() and fname.is_file():
            actual_size = fname.stat().st_size

            # correct name?
            if fname.name == expected_name:
                expected_dir = str(vdir.resolve())
                actual_dir = str(fname.parent.resolve())

                if expected_dir == actual_dir:
                    if actual_size > 300:
                        log_debug(
                            f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
                            f"  Path: {expected_full_path}\n"
                            f"  Size: {actual_size} bytes"
                        )
                        return
                    else:
                        log_debug(
                            f"Existing file too small ({actual_size} bytes), "
                            f"redownloading: {expected_full_path}"
                        )
                else:
                    log_debug(
                        f"Directory mismatch for chapter {ch.number}, redownloading"
                    )
            else:
                log_debug(
                    f"Filename mismatch for chapter {ch.number}, redownloading\n"
                    f"  Expected: {expected_name}\n"
                    f"  Found: {fname.name}"
                )

        # PROGRESS INFO
        percent = (ch.number / self.total_chapters) * 100
        elapsed = time.time() - self.start_time
        avg_time = elapsed / max(ch.number - 1, 1)
        remaining = self.total_chapters - ch.number
        eta_seconds = max(0, remaining * avg_time)

        eta_min = int(eta_seconds // 60)
        eta_sec = int(eta_seconds % 60)

        log_debug(
            f"Fetching chapter {ch.number}/{self.total_chapters} "
            f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
            f"{ch.title}"
        )

        # RETRY EMPTY CONTENT
        attempt = 1
        while True:
            soup = self.get_doc_with_retry(ch.url)
            text = self.parse_chapter_text(soup)

            if text.strip():
                ch.text = text
                break

            wait = min(10 + attempt, 30)
            log_debug(f"Empty chapter → retry in {wait}s")
            time.sleep(wait)
            attempt += 1

        fname.write_text(ch.text, encoding="utf-8")
        log_debug(f"Saved chapter to v{volume}: {fname}")

        chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
        log_debug(f"Throttling {chapter_delay}s before next chapter")
        time.sleep(chapter_delay)

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        body = soup.body
        if not body:
            return ""

        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False

        for sib in h1.next_siblings:
            # CSS classes live in the tag's attrs; NavigableStrings have none.
            classes = getattr(sib, "attrs", {}).get("class") or []
            if "toplink" in classes:
                continue
            if "bottomlink" in classes:
                break
            if getattr(sib, "name", None) in ["script", "style"]:
                continue

            # The chapter body starts after the first <br> following the <h1>.
            if not collecting:
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue

            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        raw = clean_text(raw, self.site.replacements)
        return raw.strip()
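

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The real Site object is provided
# elsewhere in the scraper package; the stand-in below carries only the
# attributes this module actually reads (name, root, encoding, replacements,
# chapter_list_selector), and every value here is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_site = SimpleNamespace(  # hypothetical stand-in, not the real Site class
        name="example-site",
        root="https://www.example.com/",
        encoding="utf-8",
        replacements={},
        chapter_list_selector="div.booklist",  # guessed selector
    )
    result = BookScraper(demo_site, "https://www.example.com/book/1/").execute()
    print(result["title"])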