# scraper/utils.py
import re
import os
from pathlib import Path
from typing import Optional

# Runs of three or more newlines, i.e. two or more consecutive blank lines.
_BLANK_LINE_RUN = re.compile(r"\n{3,}")


# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load ``key=value`` style replacements from a text file.

    Each line is stripped and split on its first ``=``; the key and the
    value are stripped again before being stored. Lines without ``=`` are
    ignored, as are lines whose key is empty after stripping. When the same
    key appears more than once, the last occurrence wins.

    Args:
        filepath: Path of the replacements file.

    Returns:
        A dict mapping search strings to replacement strings.
        Empty or missing file → ``{}``.
    """
    path = Path(filepath)
    if not path.exists():
        return {}

    repl = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, sep, val = line.strip().partition("=")
            if not sep:
                # No "=" on this line: not a replacement entry.
                continue
            key = key.strip()
            if not key:
                # An empty key would make str.replace insert the value
                # between every character of the text being cleaned.
                continue
            repl[key] = val.strip()
    return repl


# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: Optional[dict] = None) -> str:
    """
    Normalize line endings and blank lines, then apply replacements.

    Steps, in order:
      1. Remove every carriage return, so CRLF becomes LF.
      2. Collapse runs of three or more newlines into exactly two.
      3. Apply each ``key -> value`` replacement in dict order.
      4. Strip leading and trailing whitespace.

    Args:
        raw: The text to clean.
        repl_dict: Optional mapping of search string to replacement string;
            ``None`` falls back to ``{}``. Entries with an empty key are
            skipped.

    Returns:
        The cleaned text.
    """
    if repl_dict is None:
        repl_dict = {}

    # Normalize CRLF
    txt = raw.replace("\r", "")

    # Collapse multiple blank lines
    txt = _BLANK_LINE_RUN.sub("\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        if not key:
            # str.replace("", val) would insert val between every character.
            continue
        txt = txt.replace(key, val)

    # Strip excessive whitespace at edges
    return txt.strip()