# worker/downloader.py

import os
import re
import time

import requests
from bs4 import BeautifulSoup, Tag

from scraper.logger import log_debug
from scraper.utils.utils import clean_text


class ChapterDownloader:
    """
    Worker-side chapter downloader.

    - No metadata scraping
    - No BookScraper dependency
    - Only: GET → parse → text → save
    """

    def __init__(self, min_delay=1.0):
        self.min_delay = min_delay
        self._last_download_time = 0

    # ------------------------------------------------------------
    def throttle(self):
        """Sleep just long enough to keep min_delay between requests."""
        now = time.time()
        elapsed = now - self._last_download_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self._last_download_time = time.time()

    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        """GET the URL, retrying indefinitely until a 200 response."""
        attempt = 1
        while True:
            self.throttle()
            log_debug(f"[DL] GET {url} (attempt {attempt})")
            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"[DL] Network error {e} → retry")
                attempt += 1
                time.sleep(2)
                continue

            code = resp.status_code
            if code == 200:
                resp.encoding = "utf-8"
                return BeautifulSoup(resp.text, "lxml")

            if code == 429:
                log_debug("[DL] 429 cooldown 60s")
                time.sleep(60)
                attempt += 1
                continue

            if code in (403, 500):
                log_debug(f"[DL] HTTP {code} → retry")
                time.sleep(5)
                attempt += 1
                continue

            log_debug(f"[DL] Unexpected HTTP {code}")
            time.sleep(3)
            attempt += 1

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        """
        Copy of BookScraper.parse_chapter_text, but WITHOUT
        dependencies on parse_title, parse_author, etc.
        """
        body = soup.body
        if not body:
            return ""

        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False
        for sib in h1.next_siblings:
            # Only Tag nodes carry a class list; NavigableStrings have no attrs.
            classes = sib.get("class") if isinstance(sib, Tag) else None
            if classes == ["toplink"]:
                continue
            if classes == ["bottomlink"]:
                break
            if getattr(sib, "name", None) in ("script", "style"):
                continue
            # The first <br> after the <h1> flips `collecting`; later <br>
            # tags yield empty text below and are dropped anyway.
            if not collecting:
                if getattr(sib, "name", None) == "br":
                    collecting = True
                    continue
            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        return clean_text(raw, {})

    # ------------------------------------------------------------
    def save_chapter(self, number, title, text, output_base):
        """
        Save a chapter using the same volume logic as BookScraper:
        200 chapters per volume directory (v1, v2, ...).
        """
        max_size = 200
        volume = ((number - 1) // max_size) + 1
        vdir = os.path.join(output_base, f"v{volume}")
        os.makedirs(vdir, exist_ok=True)

        # Strip characters that are illegal in file names.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
        fname = f"{number:05d}_{safe_title}.txt"
        full = os.path.join(vdir, fname)

        with open(full, "w", encoding="utf-8") as f:
            f.write(text)

        log_debug(f"[DL] Saved chapter {number}: {full}")
        return full

    # ------------------------------------------------------------
    def download(self, number, title, url, output_base):
        soup = self.get_doc_with_retry(url)
        text = self.parse_chapter_text(soup)
        return self.save_chapter(number, title, text, output_base)
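

# ------------------------------------------------------------
# Minimal usage sketch. The URL, title, and output path below are
# illustrative placeholders, not values from the real worker; the
# driver that feeds chapter jobs to this class lives elsewhere.
if __name__ == "__main__":
    dl = ChapterDownloader(min_delay=1.5)
    dl.download(
        number=1,
        title="chapter-one",
        url="https://example.com/book/chapter-1",  # hypothetical URL
        output_base="out/example-book",            # hypothetical path
    )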