You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.4 KiB
# scraper/utils.py
# scraper/utils.py

import re
import os

from pathlib import Path

|
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements.

    Each line containing "=" is split on the FIRST "=" (so values may
    themselves contain "="); both halves are stripped of surrounding
    whitespace. Lines without "=" are ignored.
    Empty or missing file → return {}.

    Lines whose key strips to the empty string (e.g. "=foo") are
    skipped: an empty key would make ``str.replace("", val)`` insert
    the value between every character when applied downstream.
    """
    path = Path(filepath)

    if not path.exists():
        return {}

    repl = {}

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "=" not in line:
                continue
            key, val = line.split("=", 1)
            key = key.strip()
            if not key:
                # Bug fix: previously "=foo" stored key "", which
                # explodes the text in clean_text() via replace("", ...).
                continue
            repl[key] = val.strip()

    return repl
|
|
|
|
|
|
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Tidy up text extracted from scraped HTML.

    Pipeline, in order: drop carriage returns, squeeze runs of three
    or more newlines down to a single blank line, apply every
    substring → replacement pair from repl_dict, then trim leading and
    trailing whitespace.

    repl_dict is optional → falls back to {}.
    """
    replacements = repl_dict if repl_dict is not None else {}

    # Strip CRs first so CRLF line endings don't defeat the
    # blank-line collapsing below.
    cleaned = re.sub(r"\n{3,}", "\n\n", raw.replace("\r", ""))

    for needle, substitute in replacements.items():
        cleaned = cleaned.replace(needle, substitute)

    return cleaned.strip()
|