You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/utils.py

115 lines
3.4 KiB

# ============================================================
# File: scraper/utils.py
# Purpose:
# Centralised replacement loader + text cleaner
# using 3 replacement categories:
# 1) HTML replacements
# 2) Encoding replacements
# 3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================
import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacement_file(path: Path) -> dict:
    """
    Load key=value replacement pairs from a text file.

    Each line is split on the first '='; key and value are stripped of
    surrounding whitespace. Blank lines, lines starting with '#', lines
    without '=' and lines whose key is empty are ignored. When a key
    occurs more than once, the last occurrence wins.

    Args:
        path: Location of the replacement file (read as UTF-8).

    Returns:
        Mapping of search string → replacement string.
        Missing file → {}.
    """
    repl = {}
    try:
        # Open directly instead of exists()-then-open, so a file removed
        # between the check and the open cannot raise unexpectedly.
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                key, sep, val = line.partition("=")
                key = key.strip()
                # An empty key must never be stored: str.replace("", val)
                # would insert val between every character of the text.
                if not sep or not key:
                    continue
                repl[key] = val.strip()
    except (FileNotFoundError, NotADirectoryError):
        # Missing file (or missing parent directory) → no replacements.
        return {}
    return repl
# ------------------------------------------------------------
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
    """
    Merge the three replacement categories into a single dict.

    The files are read from the "replacements" directory next to this
    module, in the order html.txt → encoding.txt → junk.txt. A key
    defined in a later file overrides the same key from an earlier one.
    """
    base_dir = Path(__file__).parent / "replacements"
    merged = {}
    # Tuple order is the override order: the last file wins on clashes.
    for filename in ("html.txt", "encoding.txt", "junk.txt"):
        merged.update(load_replacement_file(base_dir / filename))
    return merged
# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def load_replacements(filepath=None) -> dict:
    """
    Backward-compatible entry point for loading replacements.

    - filepath given → load only that single file.
    - filepath omitted (None) → return the merged replacements of all
      categories.
    """
    if filepath is not None:
        # Explicit single-file load; accept anything Path() accepts.
        return load_replacement_file(Path(filepath))
    return load_all_replacements()
# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
    """
    Apply replacements and basic whitespace normalisation.

    Steps, in order:
      1. Remove every carriage return.
      2. Apply each search → replacement pair from repl, in the dict's
         iteration order (a later pair sees the output of earlier ones).
      3. Collapse runs of three or more newlines into exactly two,
         i.e. at most one blank line in a row.
      4. Strip leading and trailing whitespace.

    No replacement rules are hardcoded here; they all come from repl.

    Args:
        raw: Text to clean. Any falsy value (None, "") yields "".
        repl: Mapping of search string → replacement string.

    Returns:
        The cleaned text.
    """
    if not raw:
        return ""
    txt = raw.replace("\r", "")
    # Apply loaded replacements
    for key, val in repl.items():
        # Skip empty search strings: str.replace("", val) would insert
        # val between every character and corrupt the whole text.
        if not key:
            continue
        txt = txt.replace(key, val)
    # Collapse 3+ consecutive newlines → a single blank line
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    return txt.strip()
# ------------------------------------------------------------
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Build the file path under which a chapter is saved.

    The file name is the chapter number zero-padded to four digits
    plus ".txt" (e.g. 7 → "0007.txt"), joined onto base_path.
    """
    chapter_file = "{:04d}.txt".format(chapter_num)
    save_path = os.path.join(base_path, chapter_file)
    return save_path