# worker/downloader.py
import os
import time

import requests
from bs4 import BeautifulSoup

from scraper.logger import log_debug
from scraper.utils.utils import clean_text
class ChapterDownloader:
    """
    Worker-side chapter downloader.

    - No metadata scraping
    - No BookScraper dependency
    - Only: GET → parse → text → save
    """

    def __init__(self, min_delay=1.0):
        self.min_delay = min_delay
        self._last_download_time = 0
    # ------------------------------------------------------------
    def throttle(self):
        """Sleep just long enough to keep min_delay seconds between requests."""
        now = time.time()
        elapsed = now - self._last_download_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self._last_download_time = time.time()
    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        """GET url and return a parsed BeautifulSoup document, retrying until a 200 response."""
        attempt = 1
        while True:
            self.throttle()
            log_debug(f"[DL] GET {url} (attempt {attempt})")
            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"[DL] Network error {e} → retry")
                attempt += 1
                time.sleep(2)
                continue

            code = resp.status_code
            if code == 200:
                resp.encoding = "utf-8"
                return BeautifulSoup(resp.text, "lxml")

            if code == 429:
                # Rate limited: cool down for a full minute before the next attempt.
                log_debug("[DL] 429 cooldown 60s")
                time.sleep(60)
                attempt += 1
                continue

            if code in (403, 500):
                log_debug(f"[DL] HTTP {code} → retry")
                time.sleep(5)
                attempt += 1
                continue

            log_debug(f"[DL] Unexpected HTTP {code}")
            time.sleep(3)
            attempt += 1
    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        """
        Copy of BookScraper.parse_chapter_text, but without the
        dependencies on parse_title, parse_author, etc.
        """
        body = soup.body
        if not body:
            return ""
        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False
        for sib in h1.next_siblings:
            # Element classes live in sib.get("class"); plain text nodes have none.
            classes = sib.get("class") if hasattr(sib, "get") else None
            if classes == ["toplink"]:
                continue
            if classes == ["bottomlink"]:
                break
            if getattr(sib, "name", None) in ["script", "style"]:
                continue
            if not collecting:
                # The chapter body starts after the first <br> following the <h1>.
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue
            text = (
                sib.get_text("\n", strip=True)
                if hasattr(sib, "get_text")
                else str(sib).strip()
            )
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        return clean_text(raw, {})
    # ------------------------------------------------------------
    def save_chapter(self, number, title, text, output_base):
        """
        Save a chapter using the same volume logic as BookScraper:
        200 chapters per volume directory (v1, v2, ...).
        """
        max_size = 200
        volume = ((number - 1) // max_size) + 1
        vdir = f"{output_base}/v{volume}"
        os.makedirs(vdir, exist_ok=True)
        fname = f"{number:05d}_{title}.txt"
        full = f"{vdir}/{fname}"
        with open(full, "w", encoding="utf-8") as f:
            f.write(text)
        log_debug(f"[DL] Saved chapter {number}: {full}")
        return full
    # ------------------------------------------------------------
    def download(self, number, title, url, output_base):
        """Fetch one chapter page, extract its text, and write it to disk."""
        soup = self.get_doc_with_retry(url)
        text = self.parse_chapter_text(soup)
        return self.save_chapter(number, title, text, output_base)
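

# ------------------------------------------------------------
# Minimal usage sketch (not part of the worker pipeline): shows how a
# worker is expected to drive the downloader. The chapter number, title,
# URL, and output directory below are hypothetical placeholders, not
# values taken from the real scraper configuration.
if __name__ == "__main__":
    downloader = ChapterDownloader(min_delay=1.0)
    saved_path = downloader.download(
        number=1,
        title="example-chapter",                # hypothetical title
        url="https://example.com/chapter-1",    # hypothetical chapter URL
        output_base="./output/example-book",    # hypothetical output directory
    )
    log_debug(f"[DL] Demo saved to {saved_path}")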