diff --git a/bookscraper/scraper/download_controller.py b/bookscraper/scraper/download_controller.py index f93bb33..aed1b2e 100644 --- a/bookscraper/scraper/download_controller.py +++ b/bookscraper/scraper/download_controller.py @@ -4,10 +4,12 @@ # Build Celery pipelines for all chapters # and pass book_id for abort/progress/log functionality. # + Download and replicate cover image to all volume folders +# + Generate scripts (allinone.txt, makebook.txt, say.txt) # ========================================================= from celery import group from scraper.tasks.pipeline import build_chapter_pipeline +from scraper.scriptgen import generate_all_scripts # <-- ADDED from logbus.publisher import log import os import requests @@ -22,6 +24,7 @@ class DownloadController: - consistent meta propagation - book_id-based abort + progress tracking - cover download + volume replication + - script generation (allinone.txt, makebook.txt, say.txt) """ def __init__(self, book_id: str, scrape_result: dict): @@ -62,7 +65,6 @@ class DownloadController: cover_path = os.path.join(self.book_base, "cover.jpg") - # HEADERS that bypass 403 hotlink protection headers = { "User-Agent": ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) " @@ -145,7 +147,7 @@ class DownloadController: tasks.append( build_chapter_pipeline( - self.book_id, # ← UUID from scraping.py + self.book_id, # UUID chapter_num, chapter_url, volume_path, @@ -165,4 +167,17 @@ class DownloadController: # ------------------------------------------------------- self.replicate_cover_to_volumes() + # ------------------------------------------------------- + # 3) Generate scripts (allinone, makebook, say) + # ------------------------------------------------------- + try: + generate_all_scripts( + self.book_base, + self.title, + self.meta.get("author"), + ) + log(f"[CTRL] Scripts generated for '{self.title}'") + except Exception as e: + log(f"[CTRL] Script generation failed: {e}") + return async_result diff --git 
a/bookscraper/scraper/scriptgen.py b/bookscraper/scraper/scriptgen.py new file mode 100644 index 0000000..4b714cf --- /dev/null +++ b/bookscraper/scraper/scriptgen.py @@ -0,0 +1,112 @@ +# scraper/scriptgen.py +# Generates scripts (allinone.txt, makebook.txt, say.txt) +# using external templates + dynamic merge generation. + +import os +import stat +from logbus.publisher import log + +TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates") + + +# ------------------------------------------------------------ +# Load a template file from scraper/templates/ +# ------------------------------------------------------------ +def load_template(name: str) -> str: + path = os.path.join(TEMPLATE_DIR, name) + if not os.path.exists(path): + log(f"[SCRIPTGEN] Template missing: {path}") + return "" + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +# ------------------------------------------------------------ +# Detect volumes (Volume_001, Volume_002, ...) +# ------------------------------------------------------------ +def detect_volumes(book_base: str): + vols = [] + for name in os.listdir(book_base): + p = os.path.join(book_base, name) + if os.path.isdir(p) and name.lower().startswith("volume_"): + try: + num = int(name.split("_")[1]) + vols.append((num, name)) + except Exception: + continue + vols.sort() + return [v[1] for v in vols] + + +# ------------------------------------------------------------ +# Build the dynamic merge block +# ------------------------------------------------------------ +def build_merge_block(title: str, author: str, volumes): + lines = [] + for vol in volumes: + line = ( + f'm4b-tool merge --jobs=4 --writer="{author}" ' + f'--albumartist="{author}" --album="{title}" ' + f'--name="{title}" --output-file="{title}-{vol}.m4b" ' + f'"{vol}" -vvv' + ) + lines.append(line) + + if not lines: + return "" + + return " \\\n&& ".join(lines) + "\n" + + +# ------------------------------------------------------------ +# Main generator
+# ------------------------------------------------------------ +def generate_all_scripts(book_base: str, title: str, author: str): + log(f"[SCRIPTGEN] Generating scripts in {book_base}") + + # Load templates + say_template = load_template("say.template") + cleanup_template = load_template("cleanup.template") + + volumes = detect_volumes(book_base) + log(f"[SCRIPTGEN] Volumes detected: {volumes}") + + merge_block = build_merge_block(title, author, volumes) + + # -------------------------------------------------------- + # allinone.txt = say + cleanup + merge + # -------------------------------------------------------- + outfile = os.path.join(book_base, "allinone.txt") + with open(outfile, "w", encoding="utf-8") as f: + f.write(say_template) + f.write("\n") + f.write(cleanup_template) + f.write("\n") + f.write(merge_block) + os.chmod(outfile, os.stat(outfile).st_mode | stat.S_IEXEC) + log(f"[SCRIPTGEN] Created {outfile}") + + # -------------------------------------------------------- + # makebook.txt = merge only + # -------------------------------------------------------- + outfile2 = os.path.join(book_base, "makebook.txt") + with open(outfile2, "w", encoding="utf-8") as f: + f.write(merge_block) + os.chmod(outfile2, os.stat(outfile2).st_mode | stat.S_IEXEC) + log(f"[SCRIPTGEN] Created {outfile2}") + + # -------------------------------------------------------- + # say.txt = say + cleanup + # -------------------------------------------------------- + outfile3 = os.path.join(book_base, "say.txt") + with open(outfile3, "w", encoding="utf-8") as f: + f.write(say_template) + f.write("\n") + f.write(cleanup_template) + os.chmod(outfile3, os.stat(outfile3).st_mode | stat.S_IEXEC) + log(f"[SCRIPTGEN] Created {outfile3}") + + log(f"[SCRIPTGEN] All scripts generated successfully for '{title}'") + + +__all__ = ["generate_all_scripts"] diff --git a/bookscraper/scraper/tasks/download_tasks.py b/bookscraper/scraper/tasks/download_tasks.py index 59c9430..71a6da4 100644 --- 
a/bookscraper/scraper/tasks/download_tasks.py +++ b/bookscraper/scraper/tasks/download_tasks.py @@ -32,20 +32,10 @@ print(">>> [IMPORT] download_tasks.py loaded") # TIMESTAMPED LOG WRAPPER # ----------------------------------------------------------- def log_msg(book_id: str, message: str): - """ - Log with compact timestamp + book_id. - Pushes to: - - console (publisher.log) - - GUI Redis (push_ui) - """ ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") full = f"{ts} [{book_id}] {message}" - - # console log(full) - - # GUI (Redis rolling list) - push_ui(full) # NO book_id param — ui_log is DOM + push_ui(full) # ----------------------------------------------------------- @@ -68,12 +58,12 @@ GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1")) DELAY_KEY = "download:delay_lock" # ----------------------------------------------------------- -# Redis connection +# Redis # ----------------------------------------------------------- REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0") redis_client = redis.Redis.from_url(REDIS_URL) -SEM_KEY = "download:active" # semaphore counter +SEM_KEY = "download:active" # ============================================================ @@ -145,21 +135,15 @@ def download_chapter( "abort": True, } - # Mark started — ensures parse/save must run + # Mark started mark_chapter_started(book_id, chapter_num) - # Hard delay - if GLOBAL_DELAY > 0: - time.sleep(GLOBAL_DELAY) - - save_path = get_save_path(chapter_num, base_path) - # ----------------------------------------------------------- - # SKIP existing + # NEW POSITION FOR SKIP BLOCK (before any delay logic) # ----------------------------------------------------------- + save_path = get_save_path(chapter_num, base_path) + if os.path.exists(save_path): - wait_for_global_delay() - set_global_delay() log_msg(book_id, f"[DL] SKIP {chapter_num} (exists) → {save_path}") return { "chapter": chapter_num, @@ -169,6 +153,12 @@ def download_chapter( "path": save_path, } + # 
----------------------------------------------------------- + # Hard delay (only for real downloads) + # ----------------------------------------------------------- + if GLOBAL_DELAY > 0: + time.sleep(GLOBAL_DELAY) + # Sync delay wait_for_global_delay() @@ -207,10 +197,7 @@ def download_chapter( delay = BASE_DELAY * (BACKOFF**attempt) # 429 hard block - if ( - hasattr(exc, "response") - and getattr(exc.response, "status_code", None) == 429 - ): + if getattr(getattr(exc, "response", None), "status_code", None) == 429: log_msg( book_id, f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s " diff --git a/bookscraper/scraper/templates/allinone.template b/bookscraper/scraper/templates/allinone.template new file mode 100644 index 0000000..fcb6bec --- /dev/null +++ b/bookscraper/scraper/templates/allinone.template @@ -0,0 +1,44 @@ +#!/bin/bash + +main_dir="$( cd "$( dirname "$0" )" && pwd )" + +shopt -s nocasematch # For case-insensitive regex matching + +for subfolder in "$main_dir"/*; do + + if [ -d "$subfolder" ]; then + audiofolder="$subfolder/Audio" + mkdir -p "$audiofolder" + + for entry in "$subfolder"/*.txt; do + fn=$(basename "$entry") + [[ "${entry##*.}" =~ txt ]] + + echo "$fn" + inputfile="$subfolder/$fn" + outputfile="$audiofolder/${fn%.*}.m4b" + + now=$(date +"%T") + echo "Current time : $now" + echo "$inputfile ->" + echo "$outputfile" && \ + + if [ -f "$outputfile" ]; then + echo "$outputfile" + echo "exists: skipping" + else + say --voice=Sinji \ + --output-file="$outputfile" \ + --input-file="$inputfile" \ + --file-format=m4bf \ + --quality=127 \ + -r 200 \ + --data-format=aac + fi + + done + + fi + +done + +# CLEANUP WILL BE APPENDED BY scriptgen.py diff --git a/bookscraper/scraper/templates/cleanup.template b/bookscraper/scraper/templates/cleanup.template new file mode 100644 index 0000000..26380d5 --- /dev/null +++ b/bookscraper/scraper/templates/cleanup.template @@ -0,0 +1,4 @@ +find .
-name "*.m4b" -size -580c | while IFS= read -r fname; do + echo "deleting $(ls -lah "$fname")" + rm "$fname" +done diff --git a/bookscraper/scraper/templates/say.template b/bookscraper/scraper/templates/say.template new file mode 100644 index 0000000..e69de29