# ============================================================
# File: scraper/utils.py
# Purpose:
#   Centralised replacement loader + text cleaner
#   using 3 replacement categories:
#     1) HTML replacements
#     2) Encoding replacements
#     3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================

import os
import re
from pathlib import Path


# ------------------------------------------------------------
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacement_file(path: Path) -> dict:
    """Parse ``key=value`` pairs from *path* into a dict.

    A missing file yields ``{}``.  Blank lines and lines beginning
    with ``#`` are skipped; only the first ``=`` splits a line, so
    values may themselves contain ``=``.  Keys and values are
    whitespace-stripped.
    """
    if not path.exists():
        return {}

    mapping: dict = {}
    with open(path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            entry = raw_line.strip()
            # Skip blanks and comment lines.
            if not entry or entry.startswith("#"):
                continue
            key, sep, val = entry.partition("=")
            if sep:  # only lines that actually contain '='
                mapping[key.strip()] = val.strip()
    return mapping


# ------------------------------------------------------------
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
    """Merge every replacement category into one dict.

    Files are read from the ``replacements`` directory next to this
    module, in the order html → encoding → junk, so a key present in
    a later file overrides the same key from an earlier one.
    """
    base = Path(__file__).parent / "replacements"

    merged: dict = {}
    for fname in ("html.txt", "encoding.txt", "junk.txt"):
        merged.update(load_replacement_file(base / fname))
    return merged


# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def load_replacements(filepath=None) -> dict:
    """Backward-compatible alias for the replacement loaders.

    With no *filepath* the merged table from every category is
    returned; with a *filepath* only that one file is loaded.
    """
    # Explicit single-file loading takes precedence.
    if filepath is not None:
        return load_replacement_file(Path(filepath))
    return load_all_replacements()


# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
    """Apply the loaded replacement table to *raw* and tidy whitespace.

    Every substitution rule comes from *repl*; no cleanup rule is
    hardcoded here.  Carriage returns are dropped, runs of three or
    more newlines collapse to a single blank line, and the result is
    stripped of leading/trailing whitespace.
    """
    if not raw:
        return ""

    text = raw.replace("\r", "")

    # Substitutions run in the table's insertion order.
    for needle, substitute in repl.items():
        text = text.replace(needle, substitute)

    # 3+ consecutive newlines → exactly one blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ------------------------------------------------------------
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """Return the save path for a chapter: ``<base_path>/NNNN.txt``.

    The chapter number is zero-padded to four digits so files sort
    lexicographically (e.g. ``0007.txt``).
    """
    return os.path.join(base_path, f"{chapter_num:04d}.txt")
|