@@ -2,7 +2,7 @@
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin
 
 from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
@@ -11,8 +11,11 @@ from scraper.models.book_state import Chapter
 
 class BookScraper:
     """
-    Lightweight scraper: only metadata + chapter list.
-    All downloading/parsing/saving is handled by Celery tasks.
+    Minimal scraper: only metadata + chapter list.
+
+    The DownloadController handles Celery pipelines for:
+      - download
+      - parse
+      - save
     """
 
     def __init__(self, site, url):
@@ -23,17 +26,17 @@ class BookScraper:
         self.book_author = ""
         self.book_description = ""
         self.cover_url = ""
-        self.chapter_base = None
         self.chapters = []
+        self.chapter_base = None
 
         # Load custom replacements
         extra = load_replacements("replacements.txt")
         self.site.replacements.update(extra)
 
     # ------------------------------------------------------------
-    def parse_book_info(self):
-        """Parse title, author, description, cover from the main page."""
+    def execute(self):
+        """Main entry point. Returns metadata + chapter URLs."""
         soup = self._fetch(self.url)
 
         self._parse_title(soup)
@@ -41,13 +44,25 @@
         self._parse_description(soup)
         self._parse_cover(soup)
 
+        # Parse chapter list page + chapter links
+        chapter_page = self.get_chapter_page(soup)
+        self.parse_chapter_links(chapter_page)
+
         log_debug(f"[BookScraper] Completed metadata parse")
 
+        return {
+            "title": self.book_title,
+            "author": self.book_author,
+            "description": self.book_description,
+            "cover_url": self.cover_url,
+            "book_url": self.url,
+            "chapters": [
+                {"num": ch.number, "title": ch.title, "url": ch.url}
+                for ch in self.chapters
+            ],
+        }
+
     # ------------------------------------------------------------
     def _fetch(self, url):
         """Simple fetch (no retry), DownloadController handles errors."""
         log_debug(f"[BookScraper] Fetch: {url}")
         resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
         resp.encoding = self.site.encoding
@@ -74,7 +89,6 @@ class BookScraper:
 
         parts = []
         for sib in span.next_siblings:
             # Stop when next book section begins
             if getattr(sib, "name", None) == "span":
                 break
@ -83,22 +97,21 @@ class BookScraper:
|
|
|
|
|
if hasattr(sib, "get_text")
|
|
|
|
|
else str(sib).strip()
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if text:
|
|
|
|
|
parts.append(text)
|
|
|
|
|
|
|
|
|
|
self.book_description = "\n".join(parts)
|
|
|
|
|
log_debug(
|
|
|
|
|
f"[BookScraper] Description length = {len(self.book_description)} characters"
|
|
|
|
|
)
|
|
|
|
|
self.book_description = clean_text("\n".join(parts), self.site.replacements)
|
|
|
|
|
log_debug(f"[BookScraper] Description length = {len(self.book_description)}")
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
|
|
def _parse_cover(self, soup):
|
|
|
|
|
cover = soup.find("img", src=lambda v: v and "files/article/image" in v)
|
|
|
|
|
if not cover:
|
|
|
|
|
img = soup.find("img", src=lambda v: v and "files/article/image" in v)
|
|
|
|
|
if not img:
|
|
|
|
|
log_debug("[BookScraper] No cover found")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
self.cover_url = urljoin(self.site.root, cover.get("src"))
|
|
|
|
|
self.cover_url = urljoin(self.site.root, img.get("src"))
|
|
|
|
|
log_debug(f"[BookScraper] Cover URL = {self.cover_url}")
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
|
|
@ -108,13 +121,13 @@ class BookScraper:
|
|
|
|
|
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
|
|
|
|
|
)
|
|
|
|
|
href = node.select_one("a").get("href")
|
|
|
|
|
url = urljoin(self.site.root, href)
|
|
|
|
|
chapter_url = urljoin(self.site.root, href)
|
|
|
|
|
|
|
|
|
|
parsed = urlparse(url)
|
|
|
|
|
bp = parsed.path.rsplit("/", 1)[0] + "/"
|
|
|
|
|
self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}"
|
|
|
|
|
# base for chapter links
|
|
|
|
|
parts = chapter_url.rsplit("/", 1)
|
|
|
|
|
self.chapter_base = parts[0] + "/"
|
|
|
|
|
|
|
|
|
|
return self._fetch(url)
|
|
|
|
|
return self._fetch(chapter_url)
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
|
|
|
def parse_chapter_links(self, soup):
|
|
|
|
|
@@ -136,8 +149,3 @@ class BookScraper:
             idx += 1
 
         log_debug(f"[BookScraper] Found {len(self.chapters)} chapters")
-
-    # ------------------------------------------------------------
-    def get_chapter_list(self):
-        """Return the chapter list (DownloadController reads this)."""
-        return self.chapters
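
For orientation, here is a minimal sketch of how the dictionary returned by the new execute() might be consumed downstream. The import path, the build_chapter_jobs() helper, and the payload field names are assumptions for illustration only; the actual DownloadController wiring (the Celery download -> parse -> save pipeline) is not shown in this diff.

# Hypothetical consumer of BookScraper.execute(); the import path and
# build_chapter_jobs() are illustrative assumptions, not project API.
from scraper.book_scraper import BookScraper


def build_chapter_jobs(site, url):
    """Scrape metadata + chapter list, then build one job payload per chapter."""
    meta = BookScraper(site, url).execute()

    # A real controller would enqueue these payloads into the Celery
    # download -> parse -> save pipeline instead of returning them.
    return [
        {
            "book_title": meta["title"],
            "chapter_num": chapter["num"],
            "chapter_url": chapter["url"],
        }
        for chapter in meta["chapters"]
    ]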