# ============================================================
# File: scraper/utils.py
# Purpose:
#   Centralised replacement loader + text cleaner
#   using 3 replacement categories:
#     1) HTML replacements
#     2) Encoding replacements
#     3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================

import os
import re
from pathlib import Path


# ------------------------------------------------------------
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacement_file(path: Path) -> dict:
    """
    Load ``key=value`` replacement pairs from a text file.

    Parsing rules:
      - Each line is stripped of surrounding whitespace.
      - Empty lines and lines starting with '#' are ignored.
      - Lines without '=' are ignored.
      - The line is split on the FIRST '=' only, so values may contain '='.
      - Key and value are stripped; an empty value is allowed
        (it means "delete the key wherever it occurs").
      - Lines whose key is empty (e.g. "=foo") are ignored, because an
        empty search string would match between every character.

    Args:
        path: Location of the replacement file (a Path; anything accepted
            by ``open()`` works as well).

    Returns:
        dict mapping search string → replacement string, in file order.
        A missing file yields {}.
    """
    repl = {}
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise glue itself onto the first line and defeat
        # the '#' comment check / corrupt the first key.
        with open(path, "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if "=" not in line:
                    continue
                key, val = line.split("=", 1)
                key = key.strip()
                if not key:
                    # An empty key can never be a meaningful replacement.
                    continue
                repl[key] = val.strip()
    except (FileNotFoundError, NotADirectoryError):
        # Missing file → no replacements (documented behaviour).
        return {}
    return repl


# ------------------------------------------------------------
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
    """
    Load and merge the three replacement categories.

    Files are read from the "replacements" directory located next to this
    module, in the order html.txt → encoding.txt → junk.txt. When the same
    key appears in more than one file, the value from the later file wins.
    Files that do not exist simply contribute nothing.

    Returns:
        dict with the merged replacements.
    """
    root = Path(__file__).parent / "replacements"

    repl = {}
    for filename in ("html.txt", "encoding.txt", "junk.txt"):
        repl.update(load_replacement_file(root / filename))
    return repl


# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def load_replacements(filepath=None) -> dict:
    """
    Backward-compatible alias.

    - If called with no filepath → return merged replacements.
    - If called with a filepath → load that one file only.

    Args:
        filepath: Optional path (str or Path) of a single replacement file.

    Returns:
        dict of replacements.
    """
    if filepath is None:
        return load_all_replacements()

    # Allow explicit loading of a single file
    return load_replacement_file(Path(filepath))


# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
    """
    Apply replacements and basic whitespace normalisation.
    No hardcoded rules live here.

    Steps, in order:
      1. Remove every carriage return ("\\r").
      2. Apply each replacement as a literal (non-regex) substitution,
         in the dict's iteration order.
      3. Collapse runs of 3 or more newlines into exactly 2
         (i.e. at most one blank line between paragraphs).
      4. Strip leading/trailing whitespace.

    Args:
        raw: Text to clean. Falsy input (None, "") yields "".
        repl: Mapping of search string → replacement string.

    Returns:
        The cleaned text.
    """
    if not raw:
        return ""

    txt = raw.replace("\r", "")

    # Apply loaded replacements
    for key, val in repl.items():
        if not key:
            # str.replace("", val) would insert val between every
            # character, so an empty key is skipped.
            continue
        txt = txt.replace(key, val)

    # Collapse 3+ consecutive newlines → a single blank line
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    return txt.strip()


# ------------------------------------------------------------
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Build the output path for a chapter file.

    The file name is the chapter number zero-padded to at least four
    digits followed by ".txt" (e.g. 7 → "0007.txt").

    Args:
        chapter_num: Chapter number.
        base_path: Directory the file name is joined onto.

    Returns:
        base_path joined with the generated file name.
    """
    filename = f"{chapter_num:04d}.txt"
    return os.path.join(base_path, filename)