You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
1.9 KiB
68 lines
1.9 KiB
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
# Load replacements from text_replacements.txt (optional file)
|
|
# ------------------------------------------------------------
|
|
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load ``key=value`` style replacements from a text file.

    Each non-blank line that does not start with '#' is split on its
    first '='; surrounding whitespace is stripped from both the key and
    the value. Lines without '=' and lines whose key is empty are skipped.

    Args:
        filepath: Path of the replacements file.

    Returns:
        A dict mapping each key to its replacement value; ``{}`` when the
        file is missing or contains no usable entries.
    """
    replacements = {}

    try:
        # "utf-8-sig" drops a leading BOM so it cannot end up inside the
        # first key; files without a BOM are decoded as plain UTF-8.
        with Path(filepath).open("r", encoding="utf-8-sig") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue

                key, sep, val = line.partition("=")
                if not sep:
                    # No '=' on this line: not a replacement entry.
                    continue

                key = key.strip()
                if not key:
                    # An empty key would make str.replace("", val) insert
                    # val between every character of the text.
                    continue

                replacements[key] = val.strip()
    except FileNotFoundError:
        # The replacements file is optional.
        return {}

    return replacements
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
# Clean extracted HTML text
|
|
# ------------------------------------------------------------
|
|
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize line endings and blank lines, then apply replacements.

    Args:
        raw: The text to clean.
        repl_dict: Optional mapping of substring -> replacement. ``None``
            is treated as an empty mapping. Entries with an empty key are
            ignored.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    if repl_dict is None:
        repl_dict = {}

    # Drop carriage returns so CRLF line endings become plain LF.
    txt = raw.replace("\r", "")

    # Collapse runs of 3+ newlines so at most one empty line remains.
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements in the mapping's iteration order.
    for key, val in repl_dict.items():
        if not key:
            # str.replace("", val) would insert val between every
            # character, so an empty key is never a valid replacement.
            continue
        txt = txt.replace(key, val)

    return txt.strip()
|
|
|
|
|
|
# ------------------------------------------------------------
|
|
# Determine save path for a chapter (shared by download & save)
|
|
# ------------------------------------------------------------
|
|
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Build the path of the text file for a given chapter.

    The chapter number is zero-padded to at least four digits and given
    a ``.txt`` suffix (0001.txt, 0002.txt, ...), then joined onto
    ``base_path``.
    """
    padded_name = "{:04d}.txt".format(chapter_num)
    target = os.path.join(base_path, padded_name)
    return target
|