You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/utils.py

58 lines
1.4 KiB

# scraper/utils.py
import re
import os
from pathlib import Path
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load ``key=value`` style text replacements from a UTF-8 file.

    Each line is split on its first ``=``; surrounding whitespace is
    stripped from both the key and the value. Lines without ``=`` are
    ignored. Lines whose key is empty (e.g. ``=foo``) are ignored as well,
    because an empty search string makes ``str.replace`` insert the value
    between every character of the text.

    Args:
        filepath: Path of the replacements file.

    Returns:
        Mapping of search text to replacement text. When a key occurs more
        than once, the last occurrence wins.
        Empty or missing file → ``{}``.
    """
    try:
        # Open directly instead of checking exists() first, so a file that
        # disappears between the check and the open cannot raise.
        handle = open(Path(filepath), "r", encoding="utf-8")
    except FileNotFoundError:
        return {}

    repl = {}
    with handle:
        for line in handle:
            key, sep, val = line.partition("=")
            key = key.strip()
            # `sep` is empty when the line has no "="; an empty key is unusable.
            if sep and key:
                repl[key] = val.strip()
    return repl
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize extracted text and apply literal replacements.

    Steps, in order:
      1. Remove every carriage return, so CRLF line endings become LF.
      2. Collapse runs of three or more newlines into exactly two.
      3. Replace each key of ``repl_dict`` with its value, in the dict's
         iteration order.
      4. Strip leading and trailing whitespace.

    Args:
        raw: Text to clean.
        repl_dict: Optional mapping of search text to replacement text;
            ``None`` means no replacements. Empty-string keys are skipped,
            because ``str.replace("", val)`` would insert ``val`` between
            every character.

    Returns:
        The cleaned text.
    """
    if repl_dict is None:
        repl_dict = {}

    # Normalize CRLF
    txt = raw.replace("\r", "")

    # Collapse multiple blank lines.
    # NOTE: this runs before the replacements, so a replacement that removes
    # text can leave a longer run of newlines behind.
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        if key == "":
            continue
        txt = txt.replace(key, val)

    # Strip excessive whitespace at edges
    return txt.strip()