# ============================================================
# File: scraper/services/scrape_engine.py (C&U — no circular import)
# Purpose:
#   Unified scraping engine for INIT-flow and Celery tasks.
#   ScrapeEngine does NOT determine book_idx itself.
# ============================================================

import os
import re
import time

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

from logbus.publisher import log
from scraper.logger import log_debug
from scraper.logger_decorators import logcall
from scraper.utils.utils import load_replacements


class ScrapeEngine:
    """
    Central scraping engine.

    Metadata + chapterlist scraping. All methods logged with @logcall.

    IMPORTANT:
    - ScrapeEngine NEVER decides book_idx.
    - No dependency on InitService (prevents circular import).
    """

    # ------------------------------------------------------------
    # REPLACEMENTS LOADER
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _apply_replacements(site):
        fp = os.path.join(os.getcwd(), "replacements.txt")
        extra = load_replacements(fp)
        if not hasattr(site, "replacements"):
            site.replacements = {}
        site.replacements.update(extra)
        return True

    # ------------------------------------------------------------
    # RATE LIMITER
    # ------------------------------------------------------------
    MIN_DELAY = 1.0 / float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))

    @staticmethod
    @logcall
    def _throttle(last_time=[0]):
        # The mutable default argument is intentional: it persists the
        # timestamp of the previous request across calls.
        now = time.time()
        elapsed = now - last_time[0]
        if elapsed < ScrapeEngine.MIN_DELAY:
            time.sleep(ScrapeEngine.MIN_DELAY - elapsed)
        last_time[0] = time.time()
        return True

    # ------------------------------------------------------------
    # HTTP GET
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _get_doc(url: str, site):
        attempt = 1
        while True:
            ScrapeEngine._throttle()
            log_debug(f"[SCRAPER] GET {url} (attempt {attempt})")
            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"Network error {e} → retry {attempt + 1}s")
                time.sleep(attempt + 1)
                attempt += 1
                continue

            code = resp.status_code

            if code == 200:
                resp.encoding = getattr(site, "encoding", "utf-8")
                return BeautifulSoup(resp.text, "lxml")

            if code == 429:
                cooldown = 60
                log_debug("429 detected — cooldown 60s")
                for i in range(cooldown, 0, -1):
                    log_debug(f" cooldown {i}s…")
                    time.sleep(1)
                attempt += 1
                continue

            if code in (403, 500):
                wait = min(5 * attempt, 30)
                log_debug(f"HTTP {code} → retry in {wait}s")
                time.sleep(wait)
                attempt += 1
                continue

            wait = attempt + 1
            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
            time.sleep(wait)
            attempt += 1

    # ------------------------------------------------------------
    # PARSER HELPERS
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _parse_title(soup):
        h1 = soup.find("h1")
        return h1.get_text(strip=True) if h1 else "UnknownTitle"

    @staticmethod
    @logcall
    def _parse_author(soup):
        td = soup.find("td", string=lambda t: t and "作" in t)
        if td and ":" in td.get_text():
            return td.get_text(strip=True).split(":")[1]
        return "UnknownAuthor"

    @staticmethod
    @logcall
    def _parse_description(soup):
        span = soup.find("span", string=lambda t: t and "内容简介" in t)
        if not span:
            return ""
        parts = []
        for sib in span.next_siblings:
            if getattr(sib, "name", None) == "span":
                break
            txt = (
                sib.get_text(strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if txt:
                parts.append(txt)
        return "\n".join(parts)
    # ------------------------------------------------------------
    # COVER PARSER (NO InitService dependency)
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _parse_cover(soup, site):
        """
        Extract book index from URL heuristically instead of InitService
        (prevents circular import).
        """
        # Typical Chinese novel sites embed numeric ID in URL path
        try:
            parsed = urlparse(site.url)
            digits = re.findall(r"\d+", parsed.path)
            book_idx = digits[-1] if digits else None
        except Exception:
            book_idx = None

        imgs = soup.find_all("img", src=True)
        candidates = []
        for img in imgs:
            src = img["src"].strip()
            filename = os.path.basename(src)
            if book_idx and book_idx in filename:
                candidates.append((filename, src))

        if not candidates:
            return None

        candidates.sort(key=lambda t: len(t[0]))  # smallest filename
        return urljoin(site.root, candidates[0][1])

    # ------------------------------------------------------------
    # RESOLVE CHAPTER PAGE
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _resolve_chapter_page(soup, site):
        node = soup.select_one(
            "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table"
        )
        if not node:
            raise ValueError("Could not locate chapter list base node")

        link = node.select_one("a")
        if not link:
            raise ValueError("Chapter list base node contains no link")

        href = link.get("href")
        url = urljoin(site.root, href)

        parsed = urlparse(url)
        basepath = parsed.path.rsplit("/", 1)[0] + "/"
        chapter_base = f"{parsed.scheme}://{parsed.netloc}{basepath}"
        return url, chapter_base

    # ------------------------------------------------------------
    # PARSE CHAPTER LINKS
    # ------------------------------------------------------------
    @staticmethod
    @logcall
    def _parse_chapter_links(soup, chapter_base, selector):
        cont = soup.select_one(selector)
        if not cont:
            return []

        items = cont.select("ul li a[href]")
        chapters = []
        idx = 1
        for a in items:
            href = a.get("href")
            if not href.endswith(".html"):
                continue
            title = a.get_text(strip=True)
            full = urljoin(chapter_base, href)
            chapters.append({"num": idx, "title": title, "url": full})
            idx += 1
        return chapters

    # ============================================================
    # PUBLIC APIS
    # ============================================================
    @staticmethod
    @logcall
    def fetch_metadata_only(site, url: str) -> dict:
        ScrapeEngine._apply_replacements(site)
        soup = ScrapeEngine._get_doc(url, site)
        site.url = url  # needed for cover parsing
        return {
            "title": ScrapeEngine._parse_title(soup),
            "author": ScrapeEngine._parse_author(soup),
            "description": ScrapeEngine._parse_description(soup),
            "cover_url": ScrapeEngine._parse_cover(soup, site),
            "book_url": url,
        }

    @staticmethod
    @logcall
    def fetch_metadata_and_chapters(site, url: str) -> dict:
        ScrapeEngine._apply_replacements(site)
        soup = ScrapeEngine._get_doc(url, site)
        site.url = url

        title = ScrapeEngine._parse_title(soup)
        author = ScrapeEngine._parse_author(soup)
        desc = ScrapeEngine._parse_description(soup)
        cover = ScrapeEngine._parse_cover(soup, site)

        chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
        chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
        chapters = ScrapeEngine._parse_chapter_links(
            chapter_soup, chapter_base, site.chapter_list_selector
        )

        return {
            "title": title,
            "author": author,
            "description": desc,
            "cover_url": cover,
            "chapters": chapters,
            "chapters_total": len(chapters),
            "book_url": url,
        }
    @staticmethod
    @logcall
    def fetch_chapterlist(site, url: str):
        ScrapeEngine._apply_replacements(site)
        soup = ScrapeEngine._get_doc(url, site)
        chapter_page_url, chapter_base = ScrapeEngine._resolve_chapter_page(soup, site)
        chapter_soup = ScrapeEngine._get_doc(chapter_page_url, site)
        return ScrapeEngine._parse_chapter_links(
            chapter_soup, chapter_base, site.chapter_list_selector
        )
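

# ------------------------------------------------------------
# Usage sketch (illustrative only, not part of the engine).
# Assumes a hypothetical site object exposing the attributes the
# engine reads: root, url, encoding, chapter_list_selector and
# replacements. The URL and CSS selector below are placeholders,
# not values taken from any real site configuration.
# ------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_site = SimpleNamespace(
        root="https://example.com",            # placeholder root URL
        url="",                                # set by the engine before cover parsing
        encoding="utf-8",                      # response encoding hint
        chapter_list_selector="div.listmain",  # placeholder selector
        replacements={},
    )

    meta = ScrapeEngine.fetch_metadata_only(
        demo_site, "https://example.com/book/12345/"
    )
    print(meta["title"], meta["author"])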