You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
818 B
28 lines
818 B
# ============================================================
# File: scraper/engine/fetcher.py
# Purpose:
#   Low-level HTML fetch utility shared by all site scrapers.
#   Replaces scattered _fetch() logic inside BookScraper.
# ============================================================
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Request headers passed to requests.get by fetch_html; the User-Agent string
# identifies the client as desktop Firefox 118 on macOS.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) "
        "Gecko/20100101 Firefox/118.0"
    ),
}
|
|
|
|
|
|
def fetch_html(url: str, encoding: str = "utf-8", timeout: int = 10) -> BeautifulSoup:
    """Fetch a page and return it parsed with the lxml parser.

    The request is sent with the module-level ``HEADERS`` so every caller
    uses the same user-agent, and the body is decoded with ``encoding``
    instead of whatever charset requests would otherwise pick.

    Args:
        url: Address of the page to download.
        encoding: Charset used to decode the response body.
        timeout: Passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        A ``BeautifulSoup`` tree built from the decoded response text.

    Note:
        The HTTP status code is not checked here, so an error page is parsed
        and returned like any other document. Exceptions raised by
        ``requests.get`` are not caught and propagate to the caller.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    # Must be set before reading ``resp.text`` so the body is decoded with it.
    resp.encoding = encoding
    return BeautifulSoup(resp.text, "lxml")