import os
import re
from pathlib import Path
from typing import Optional


# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """Load ``key=value`` style replacements from a text file.

    Each non-blank line that does not start with ``#`` and contains ``=``
    is split at the first ``=``; both sides are stripped of surrounding
    whitespace. Later duplicates of a key overwrite earlier ones.

    Args:
        filepath: Path of the replacements file.

    Returns:
        A dict mapping search text to replacement text. A missing or
        empty file yields ``{}``.
    """
    repl = {}
    try:
        # "utf-8-sig" transparently drops a leading BOM (common in files
        # saved by Windows editors); otherwise the BOM would stick to the
        # first line and defeat the '#' check or corrupt the first key.
        with open(Path(filepath), "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                key, sep, val = line.partition("=")
                if not sep:
                    continue  # not a key=value line
                key = key.strip()
                if not key:
                    # An empty key would make str.replace() insert the
                    # value between every character of the target text.
                    continue
                repl[key] = val.strip()
    except FileNotFoundError:
        return {}
    return repl


# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: Optional[dict] = None) -> str:
    """Normalize line breaks in *raw* and apply literal replacements.

    Steps, in order: remove every ``\\r`` character, collapse runs of
    three or more newlines down to two (at most one empty line), apply
    each ``key -> value`` replacement in the dict's iteration order, and
    strip leading/trailing whitespace from the result.

    Args:
        raw: Text to clean.
        repl_dict: Optional mapping of literal search strings to their
            replacements. ``None`` means no replacements.

    Returns:
        The cleaned text.
    """
    if repl_dict is None:
        repl_dict = {}

    txt = raw.replace("\r", "")  # normalize CRLF

    # Collapse 3+ consecutive newlines -> max 1 empty line
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        if not key:
            # Replacing "" would inject val between every character.
            continue
        txt = txt.replace(key, val)

    return txt.strip()


# ------------------------------------------------------------
# Determine save path for a chapter (shared by download & save)
# ------------------------------------------------------------
def get_save_path(chapter_num: int, base_path: str) -> str:
    """Return the filesystem path where a chapter should be saved.

    The filename is the chapter number zero-padded to at least four
    digits with a ``.txt`` suffix (``0001.txt``, ``0002.txt``, ...),
    joined onto *base_path*.

    Args:
        chapter_num: Chapter number used to build the filename.
        base_path: Directory the file belongs in.

    Returns:
        ``base_path`` joined with the formatted filename.
    """
    filename = f"{chapter_num:04d}.txt"
    return os.path.join(base_path, filename)