You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.4 KiB
# scraper/utils.py
# scraper/utils.py

import re
import os

from pathlib import Path

|
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements.

    Each line containing "=" is split on the FIRST "=" (so values may
    themselves contain "="); both halves are stripped of surrounding
    whitespace. Lines without "=" are ignored.
    Empty or missing file → return {}.

    Lines whose key strips to the empty string (e.g. "=foo") are
    skipped: an empty key would make ``str.replace("", val)`` insert
    the value between every character when applied downstream.
    """
    path = Path(filepath)

    if not path.exists():
        return {}

    repl = {}

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "=" not in line:
                continue
            key, val = line.split("=", 1)
            key = key.strip()
            if not key:
                # Bug fix: previously "=foo" stored key "", which
                # explodes the text in clean_text() via replace("", ...).
                continue
            repl[key] = val.strip()

    return repl
|
|
|
|
|
|
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Tidy up text extracted from scraped HTML.

    Pipeline, in order: drop carriage returns, squeeze runs of three
    or more newlines down to a single blank line, apply every
    substring → replacement pair from repl_dict, then trim leading and
    trailing whitespace.

    repl_dict is optional → falls back to {}.
    """
    replacements = repl_dict if repl_dict is not None else {}

    # Strip CRs first so CRLF line endings don't defeat the
    # blank-line collapsing below.
    cleaned = re.sub(r"\n{3,}", "\n\n", raw.replace("\r", ""))

    for needle, substitute in replacements.items():
        cleaned = cleaned.replace(needle, substitute)

    return cleaned.strip()
|