feat/download-progress-abort
parent
f27b33a882
commit
788572e1fa
@ -1,57 +0,0 @@
|
|||||||
# scraper/utils.py
|
|
||||||
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Load replacements from text_replacements.txt (optional file)
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements from a text file.

    Each usable line has the form ``key=value``; only the first '=' splits
    the line, so values may themselves contain '='. Keys and values are
    stripped of surrounding whitespace.

    Skipped lines: blank lines, lines starting with '#', lines without '=',
    and lines whose key is empty.

    Missing file → return {}.
    """
    repl = {}

    try:
        # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM so it
        # cannot end up glued to the first key.
        with open(Path(filepath), "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                key, sep, val = line.partition("=")
                key = key.strip()

                # An empty key must never be stored: str.replace("", val)
                # inserts val between every character of the text.
                if not sep or not key:
                    continue

                repl[key] = val.strip()
    except FileNotFoundError:
        # The replacements file is optional.
        return {}

    return repl
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Clean extracted HTML text
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize line endings and blank lines, then apply replacements.

    Steps, in order:
      1. remove every carriage return,
      2. collapse runs of 3+ newlines into exactly two,
      3. replace each key of repl_dict with its value (dict order),
      4. strip leading/trailing whitespace.

    repl_dict is optional → falls back to {}. Empty keys are ignored.
    raw=None is treated as empty text and returns "".
    """
    if raw is None:
        return ""
    if repl_dict is None:
        repl_dict = {}

    # Normalize CRLF
    txt = raw.replace("\r", "")

    # Collapse multiple blank lines
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        # str.replace("", val) would insert val between every character,
        # so an empty key is skipped instead of corrupting the text.
        if key:
            txt = txt.replace(key, val)

    # Strip excessive whitespace at edges
    return txt.strip()
|
|
||||||
@ -1,36 +1,67 @@
|
|||||||
# scraper/utils.py
|
import os
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
# Load replacements from text_replacements.txt (optional file)
# ------------------------------------------------------------
def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements from a simple text file.

    Each usable line has the form ``key=value``; only the first '=' splits
    the line, so values may themselves contain '='. Keys and values are
    stripped of surrounding whitespace.

    Empty or missing file → return {}.
    Lines starting with '#' are ignored, as are blank lines, lines without
    '=', and lines whose key is empty.
    """
    repl = {}

    try:
        # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM so it
        # cannot end up glued to the first key or hide a leading '#'.
        with open(Path(filepath), "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                key, sep, val = line.partition("=")
                key = key.strip()

                # An empty key must never be stored: str.replace("", val)
                # inserts val between every character of the text.
                if not sep or not key:
                    continue

                repl[key] = val.strip()
    except FileNotFoundError:
        # The replacements file is optional.
        return {}

    return repl
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
# Clean extracted HTML text
# ------------------------------------------------------------
def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalize line endings and blank lines, then apply replacements.

    Steps, in order:
      1. remove every carriage return,
      2. collapse runs of 3+ newlines into exactly two,
      3. replace each key of repl_dict with its value (dict order),
      4. strip leading/trailing whitespace.

    repl_dict is optional → {} if none provided. Empty keys are ignored.
    raw=None is treated as empty text and returns "".
    """
    if raw is None:
        return ""
    if repl_dict is None:
        repl_dict = {}

    txt = raw.replace("\r", "")  # normalize CRLF

    # Collapse 3+ newlines → max 1 empty line
    txt = re.sub(r"\n{3,}", "\n\n", txt)

    # Apply replacements
    for key, val in repl_dict.items():
        # str.replace("", val) would insert val between every character,
        # so an empty key is skipped instead of corrupting the text.
        if key:
            txt = txt.replace(key, val)

    return txt.strip()
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Determine save path for a chapter (shared by download & save)
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Build the path of the text file for one chapter.

    The file name is chapter_num zero-padded to at least four digits
    followed by ".txt" (0001.txt, 0002.txt, ...), joined onto base_path.
    """
    padded_number = format(chapter_num, "04d")
    return os.path.join(base_path, padded_number + ".txt")
|
||||||
|
|||||||
Loading…
Reference in new issue