# ============================================================
# File: scraper/utils.py
# Purpose:
#   Centralised replacement loader + text cleaner
#   using 3 replacement categories:
#     1) HTML replacements
#     2) Encoding replacements
#     3) Junk-term replacements (generic "noise" phrases)
#
# Nothing in this file contains hardcoded cleanup rules.
# EVERYTHING comes from replacement files ONLY.
# ============================================================

import os
import re
from pathlib import Path


# ------------------------------------------------------------
# Generic key=value replacement loader
# ------------------------------------------------------------
def load_replacement_file(path: Path) -> dict:
    """Parse ``key=value`` pairs from *path* into a dict.

    A missing file yields ``{}``.  Blank lines and lines beginning
    with ``#`` are skipped; only the first ``=`` splits a line, so
    values may themselves contain ``=``.  Keys and values are
    whitespace-stripped.
    """
    if not path.exists():
        return {}

    mapping: dict = {}
    with open(path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            entry = raw_line.strip()
            # Skip blanks and comment lines.
            if not entry or entry.startswith("#"):
                continue
            key, sep, val = entry.partition("=")
            if sep:  # only lines that actually contain '='
                mapping[key.strip()] = val.strip()
    return mapping


# ------------------------------------------------------------
# Load all categories (HTML → encoding → junk)
# Order matters: later overrides earlier.
# ------------------------------------------------------------
def load_all_replacements() -> dict:
    """Merge every replacement category into one dict.

    Files are read from the ``replacements`` directory next to this
    module, in the order html → encoding → junk, so a key present in
    a later file overrides the same key from an earlier one.
    """
    base = Path(__file__).parent / "replacements"

    merged: dict = {}
    for fname in ("html.txt", "encoding.txt", "junk.txt"):
        merged.update(load_replacement_file(base / fname))
    return merged


# ------------------------------------------------------------
# Legacy compatibility wrapper
# Many modules still import: from scraper.utils import load_replacements
# This wrapper keeps everything working.
# ------------------------------------------------------------
def load_replacements(filepath=None) -> dict:
    """Backward-compatible alias for the replacement loaders.

    With no *filepath* the merged table from every category is
    returned; with a *filepath* only that one file is loaded.
    """
    # Explicit single-file loading takes precedence.
    if filepath is not None:
        return load_replacement_file(Path(filepath))
    return load_all_replacements()


# ------------------------------------------------------------
# Clean text using loaded replacements
# ------------------------------------------------------------
def clean_text(raw: str, repl: dict) -> str:
    """Apply the loaded replacement table to *raw* and tidy whitespace.

    Every substitution rule comes from *repl*; no cleanup rule is
    hardcoded here.  Carriage returns are dropped, runs of three or
    more newlines collapse to a single blank line, and the result is
    stripped of leading/trailing whitespace.
    """
    if not raw:
        return ""

    text = raw.replace("\r", "")

    # Substitutions run in the table's insertion order.
    for needle, substitute in repl.items():
        text = text.replace(needle, substitute)

    # 3+ consecutive newlines → exactly one blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ------------------------------------------------------------
# Determine chapter save path
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """Return the save path for a chapter: ``<base_path>/NNNN.txt``.

    The chapter number is zero-padded to four digits so files sort
    lexicographically (e.g. ``0007.txt``).
    """
    return os.path.join(base_path, f"{chapter_num:04d}.txt")
|