# worker/downloader.py

import os
import time

import requests
from bs4 import BeautifulSoup, Tag

from scraper.logger import log_debug
from scraper.utils.utils import clean_text


class ChapterDownloader:
    """
    Worker-side chapter downloader.

    - No metadata scraping
    - No BookScraper dependency
    - Only: GET → parse → text → save
    """

    def __init__(self, min_delay=1.0):
        self.min_delay = min_delay  # minimum seconds between GETs
        self._last_download_time = 0

    # ------------------------------------------------------------
    def throttle(self):
        # Sleep just long enough to keep at least min_delay seconds
        # between consecutive requests.
        now = time.time()
        elapsed = now - self._last_download_time

        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)

        self._last_download_time = time.time()

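    # Retry behaviour of get_doc_with_retry below: 200 → parse,
    # 429 → 60 s cooldown, 403/500 → 5 s backoff, anything else → 3 s.
    # Note that the loop retries forever, so a URL that never recovers
    # will block the worker indefinitely.
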
    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        attempt = 1

        while True:
            self.throttle()
            log_debug(f"[DL] GET {url} (attempt {attempt})")

            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except requests.RequestException as e:
                log_debug(f"[DL] Network error {e} → retry")
                attempt += 1
                time.sleep(2)
                continue

            code = resp.status_code

            if code == 200:
                # Force UTF-8 decoding regardless of the response headers.
                resp.encoding = "utf-8"
                return BeautifulSoup(resp.text, "lxml")

            if code == 429:
                log_debug("[DL] 429 cooldown 60s")
                time.sleep(60)
                attempt += 1
                continue

            if code in (403, 500):
                log_debug(f"[DL] HTTP {code} → retry")
                time.sleep(5)
                attempt += 1
                continue

            log_debug(f"[DL] Unexpected HTTP {code}")
            time.sleep(3)
            attempt += 1

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        """
        Copy of BookScraper.parse_chapter_text,
        but without dependencies on parse_title, parse_author, etc.
        """
        body = soup.body
        if not body:
            return ""

        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False

        for sib in h1.next_siblings:
            # Class lookup only works on Tag nodes; NavigableStrings
            # have no attributes.
            classes = sib.get("class", []) if isinstance(sib, Tag) else []
            if "toplink" in classes:
                continue
            if "bottomlink" in classes:
                break
            if getattr(sib, "name", None) in ("script", "style"):
                continue

            # Skip everything until the first <br> after the <h1>;
            # the chapter text starts there.
            if not collecting:
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue

            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        return clean_text(raw, {})

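    # The parser above implicitly assumes pages shaped like this
    # (inferred from the selectors; not documented in this repo):
    #
    #   <h1>Chapter title</h1>
    #   <a class="toplink">...</a>
    #   <br/>
    #   ...chapter text...
    #   <a class="bottomlink">...</a>
    #
    # i.e. everything between the first <br> after the <h1> and the
    # "bottomlink" element is treated as chapter text.
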
    # ------------------------------------------------------------
    def save_chapter(self, number, title, text, output_base):
        """
        Save chapter using the same volume logic as BookScraper.
        """
        max_size = 200  # chapters per volume directory
        volume = ((number - 1) // max_size) + 1
        vdir = os.path.join(output_base, f"v{volume}")

        os.makedirs(vdir, exist_ok=True)

        fname = f"{number:05d}_{title}.txt"
        full = os.path.join(vdir, fname)

        with open(full, "w", encoding="utf-8") as f:
            f.write(text)

        log_debug(f"[DL] Saved chapter {number}: {full}")
        return full

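    # Worked example of the volume bucketing above, with max_size = 200:
    #   chapters 1-200   → <output_base>/v1
    #   chapters 201-400 → <output_base>/v2
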
    # ------------------------------------------------------------
    def download(self, number, title, url, output_base):
        soup = self.get_doc_with_retry(url)
        text = self.parse_chapter_text(soup)
        return self.save_chapter(number, title, text, output_base)
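

if __name__ == "__main__":
    # Minimal usage sketch. The URL, chapter metadata, and output
    # directory are made-up placeholders, not endpoints from this
    # project; get_doc_with_retry will retry forever if the URL never
    # returns HTTP 200.
    dl = ChapterDownloader(min_delay=1.5)
    saved = dl.download(
        number=1,
        title="prologue",
        url="https://example.com/book/chapter-1",
        output_base="out/example-book",
    )
    print(f"saved → {saved}")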