feat/download-progress-abort
parent
f27b33a882
commit
788572e1fa
@ -1,57 +0,0 @@
|
||||
# scraper/utils.py
|
||||
|
||||
import re
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Load replacements from text_replacements.txt (optional file)
|
||||
# ------------------------------------------------------------
|
||||
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements from a UTF-8 text file.

    Each usable line is split on its first "="; the key and the value are
    stripped of surrounding whitespace. Blank lines, lines starting with
    '#', lines without "=", and lines whose key is empty are ignored.

    Args:
        filepath: Location of the replacements file.

    Returns:
        Dict mapping search text to replacement text. Empty or missing
        file (or a path that is not a regular file) → {}.
    """
    path = Path(filepath)

    # is_file() also rejects directories, which exists() would let through
    # to open() and turn into an exception instead of the documented {}.
    if not path.is_file():
        return {}

    repl = {}

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            key, sep, val = line.partition("=")
            key = key.strip()
            # An empty key is never useful: str.replace("", val) would
            # insert val between every character of the cleaned text.
            if not sep or not key:
                continue

            repl[key] = val.strip()

    return repl
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Clean extracted HTML text
|
||||
# ------------------------------------------------------------
|
||||
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize line endings, collapse blank-line runs, apply replacements.

    Steps, in order:
      1. Remove every carriage-return character.
      2. Collapse runs of three or more newlines into exactly two.
      3. Replace each key of repl_dict with its value (plain substring
         replacement, in the dict's iteration order).
      4. Strip leading and trailing whitespace.

    Args:
        raw: Text to clean. None or an empty string yields "".
        repl_dict: Optional mapping of search text → replacement text;
            falls back to {} when omitted.

    Returns:
        The cleaned text.
    """
    if not raw:
        return ""

    replacements = repl_dict if repl_dict is not None else {}

    # Normalize CRLF
    txt = raw.replace("\r", "")

    # Collapse multiple blank lines
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in replacements.items():
        # Skip empty keys: str.replace("", val) would insert val between
        # every character instead of replacing anything.
        if key:
            txt = txt.replace(key, val)

    # Strip excessive whitespace at edges
    return txt.strip()
|
||||
@ -1,36 +1,67 @@
|
||||
# scraper/utils.py
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements from a UTF-8 text file.

    Each usable line is split on its first "="; the key and the value are
    stripped of surrounding whitespace.
    Empty or missing file → return {}.
    Lines starting with '#' are ignored, as are blank lines, lines without
    "=", and lines whose key is empty.

    Args:
        filepath: Location of the replacements file.

    Returns:
        Dict mapping search text to replacement text.
    """
    path = Path(filepath)

    # is_file() also rejects directories, which exists() would let through
    # to open() and turn into an exception instead of the documented {}.
    if not path.is_file():
        return {}

    repl = {}

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            key, sep, val = line.partition("=")
            key = key.strip()
            # An empty key is never useful: str.replace("", val) would
            # insert val between every character of the cleaned text.
            if not sep or not key:
                continue

            repl[key] = val.strip()

    return repl
|
||||
|
||||
|
||||
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize whitespace, remove junk, apply replacements.

    Steps, in order:
      1. Remove every carriage-return character.
      2. Collapse runs of three or more newlines into exactly two.
      3. Replace each key of repl_dict with its value (plain substring
         replacement, in the dict's iteration order).
      4. Strip leading and trailing whitespace.

    Args:
        raw: Text to clean. None or an empty string yields "".
        repl_dict: Optional mapping of search text → replacement text;
            {} if none provided.

    Returns:
        The cleaned text.
    """
    if not raw:
        return ""

    replacements = repl_dict if repl_dict is not None else {}

    txt = raw.replace("\r", "")  # normalize CRLF

    # Collapse 3+ blank lines → max 1 empty line
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in replacements.items():
        # Skip empty keys: str.replace("", val) would insert val between
        # every character instead of replacing anything.
        if key:
            txt = txt.replace(key, val)

    return txt.strip()
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Determine save path for a chapter (shared by download & save)
|
||||
# ------------------------------------------------------------
|
||||
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Return the filesystem path where a chapter should be saved.

    The filename is the chapter number zero-padded to at least four
    digits followed by ".txt" (0001.txt, 0002.txt, ...); numbers with more
    than four digits are kept in full.

    Args:
        chapter_num: Chapter number used to build the filename.
        base_path: Directory the filename is joined onto.

    Returns:
        base_path joined with the generated filename.
    """
    filename = f"{chapter_num:04d}.txt"
    return os.path.join(base_path, filename)
|
||||
|
||||
Loading…
Reference in new issue