You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/utils.py

68 lines
1.9 KiB

import os
import re
from pathlib import Path
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style text replacements from an optional file.

    Each usable line is split on its first '='; the key and the value are
    both stripped of surrounding whitespace. Blank lines, lines starting
    with '#', and lines without '=' are ignored.

    Lines whose key is empty (e.g. "=value") are skipped as well: an empty
    search string passed to str.replace() matches between every character,
    so such an entry would corrupt any text it is applied to.

    Args:
        filepath: Path of the replacements file (read as UTF-8).

    Returns:
        Dict mapping search text to replacement text; {} when the file
        does not exist or contains no usable entries.
    """
    path = Path(filepath)
    if not path.exists():
        # The replacements file is optional.
        return {}

    repl = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first '=' only, so values may contain '='.
            key, sep, val = line.partition("=")
            key = key.strip()
            if not sep or not key:
                # No '=' at all, or an empty key that would match everywhere.
                continue
            repl[key] = val.strip()
    return repl
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Clean up text extracted from HTML.

    Steps, in order:
      1. Remove every carriage return, so CRLF line endings become LF.
      2. Collapse each run of three or more newlines into two, leaving at
         most one empty line between blocks of text.
      3. Apply the literal (non-regex) replacements from repl_dict, in the
         dict's iteration order.
      4. Strip leading and trailing whitespace.

    Args:
        raw: Text to clean.
        repl_dict: Optional mapping of search text to replacement text.
            None means no replacements. Entries with an empty key are
            ignored, because str.replace("", val) would insert val
            between every character of the text.

    Returns:
        The cleaned text.
    """
    if repl_dict is None:
        repl_dict = {}

    txt = raw.replace("\r", "")  # drop carriage returns (CRLF -> LF)

    # Collapse 3+ consecutive newlines → at most one empty line
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        if not key:
            # An empty search string matches everywhere; skip it.
            continue
        txt = txt.replace(key, val)

    return txt.strip()
# ------------------------------------------------------------
# Determine save path for a chapter (shared by download & save)
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Build the path of the text file that stores one chapter.

    The file name is the chapter number zero-padded to at least four
    digits, followed by ".txt" (1 -> 0001.txt, 2 -> 0002.txt, ...), and it
    is joined onto base_path with os.path.join.
    """
    return os.path.join(base_path, "{:04d}.txt".format(chapter_num))