From c711c5356de911b0b8a6730233c99072c7069f56 Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Sat, 29 Nov 2025 21:13:12 +0000 Subject: [PATCH 1/3] make_scripts toegevoegd --- bookscraper/scraper/book_scraper.py | 137 +++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py index 83b0348..283b556 100644 --- a/bookscraper/scraper/book_scraper.py +++ b/bookscraper/scraper/book_scraper.py @@ -85,6 +85,8 @@ class BookScraper: else: self.download_all() + self.prepare_scripts(self.base_path) + return {"title": self.book_title} # ------------------------------------------------------------ @@ -345,7 +347,17 @@ class BookScraper: time.sleep(wait) attempt += 1 - fname.write_text(ch.text, encoding="utf-8") + if ch.number == 1: + header = ( + f"Description:\n{self.book_description}\n" + f" {ch.url}\n" + "----------------------------------------\n\n" + ) + content = header + ch.text + else: + content = ch.text + + fname.write_text(content, encoding="utf-8") log_debug(f"Saved chapter to v{volume}: {fname}") chapter_delay = float(os.getenv("CHAPTER_DELAY", "2")) log_debug(f"Throttling {chapter_delay}s before next chapter") @@ -389,3 +401,126 @@ class BookScraper: raw = "\n".join(parts) raw = clean_text(raw, self.site.replacements) return raw.strip() + # ------------------------------------------------------------ + # SCRIPT GENERATOR (C# prepareScripts equivalent) + # ------------------------------------------------------------ + + def prepare_scripts(self, path: Path): + log_debug("Preparing scripts...") + + # Alleen volume folders (v1, v2, v3, ...) 
+ dirs = sorted([ + d for d in path.iterdir() + if d.is_dir() and d.name.startswith("v") + ]) + + # -------------------------------------------------------- + # M4B MERGE COMMANDS + # -------------------------------------------------------- + m4b_commands = [] + for d in dirs: + volname = d.name + cmd = ( + 'm4b-tool merge --jobs=4 ' + f'--writer="{self.book_author}" ' + f'--albumartist="{self.book_author}" ' + f'--album="{self.book_title}" ' + f'--name="{self.book_title}" ' + f'--output-file="{self.book_title}-{volname}.m4b" "./{volname}" -vvv' + ) + m4b_commands.append(cmd) + + # Mooie formatting + m4b_joined = " \\\n && ".join(m4b_commands) + "\n\n" + + move_script = r''' + find ./ -maxdepth 1 -name "*.m4b" | while read fname; do + echo "moving $(ls -lah "$fname")" + mv "$fname" ../ + done + '''.strip() + "\n" + + m4b_joined += move_script + + # -------------------------------------------------------- + # SAY TTS SCRIPT (jouw verbeterde versie – identiek voor say.txt en allinone.txt) + # -------------------------------------------------------- + say_script = r'''#!/usr/bin/env bash + + set -euo pipefail + + main_dir="$( cd "$( dirname "$0" )" && pwd )" + + shopt -s nocasematch # case-insensitive matching + + echo "=== TTS START ===" + echo "Main directory: $main_dir" + echo + + for subfolder in "$main_dir"/*; do + if [[ -d "$subfolder" ]]; then + + audiofolder="$subfolder/Audio" + mkdir -p "$audiofolder" + + for entry in "$subfolder"/*.txt; do + [[ -f "$entry" ]] || continue + + fn=$(basename "$entry") + inputfile="$entry" + outputfile="$audiofolder/${fn%.*}.m4b" + + now=$(date +"%T") + + echo "[$now] Processing $fn" + echo "Input : $inputfile" + echo "Output: $outputfile" + + if [[ -f "$outputfile" ]]; then + echo "[$now] EXISTS — skipping" + else + say --voice=Sinji \ + --output-file="$outputfile" \ + --input-file="$inputfile" \ + --file-format=m4bf \ + --quality=127 \ + -r 200 \ + --data-format=aac + fi + + echo + done + fi + done + + echo "=== CLEANUP TINY 
FILES ===" + find "$main_dir" -name "*.m4b" -size -580c | while read -r fname; do + echo "Deleting: $(ls -lah "$fname")" + rm "$fname" + done + echo + ''' + + # -------------------------------------------------------- + # SCHRIJF BESTANDEN WEG + # -------------------------------------------------------- + + # allinone.txt = TTS + merge + (path / "allinone.txt").write_text( + say_script + "\n" + m4b_joined, + encoding="utf-8" + ) + + # makebook.txt = alleen merge + (path / "makebook.txt").write_text( + m4b_joined, + encoding="utf-8" + ) + + # say.txt = alleen TTS (jouw vraag!) + (path / "say.txt").write_text( + say_script, + encoding="utf-8" + ) + + log_debug("Script files written: allinone.txt, makebook.txt, say.txt") From f5fdcbebf9734c02cb0d9c314cebba1358d1b4f6 Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Sat, 29 Nov 2025 21:19:45 +0000 Subject: [PATCH 2/3] chatgpt context meegegeven. --- bookscraper/CHATGPT_CONTEXT.md | 87 ++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 bookscraper/CHATGPT_CONTEXT.md diff --git a/bookscraper/CHATGPT_CONTEXT.md b/bookscraper/CHATGPT_CONTEXT.md new file mode 100644 index 0000000..4aeb9d6 --- /dev/null +++ b/bookscraper/CHATGPT_CONTEXT.md @@ -0,0 +1,87 @@ +ChatGPT Project Context – Bookscraper / Celery Branch + +(Plaatsen in /docs/CHATGPT_CONTEXT.md of in de repo root) + +1. Scraper Status (NIET AANPASSEN ZONDER TOESTEMMING) + +De Python-based bookscraper is volledig functioneel. 
+De volgende onderdelen zijn stabiel en mogen niet worden overschreven, herschreven of opgeschoond zonder expliciete toestemming: + +prepare_scripts() genereert drie scripts: + +say.txt: alleen het TTS-script (bash, timestamps, Sinji voice, safe) + +makebook.txt: alleen m4b merge + move + +allinone.txt: TTS + merge + move + +Volume-structuur: v1, v2, v3, … + +Chapter-output: + +Chapter 1 bevat een header: + +Title: [boektitel] +Description: +[beschrijving] +[chapter-URL] +---------------------------------------- + + +Overige chapters hebben alleen de tekst + +Rate limiter werkt + +Chapter parsing werkt + +Description parsing werkt + +Cover download werkt + +Skiplogica werkt correct + +2. Ontwikkelregels voor ChatGPT + +Nooit bestaande werkende code verwijderen + +Geen stille rewrites + +Geen herstructurering zonder toestemming + +Wijzigingen worden minimalistisch en doelgericht toegepast + +Bij voorkeur veranderingen in diff/patch-stijl + +Altijd aangeven welke bestanden worden geraakt + +Directorystructuur behouden: +output/[map]/[submap]/v1 etc. + +3. Huidige Focus: celery_branch + +ChatGPT moet zich richten op: + +Celery worker architectuur verbeteren + +Queueing & retry policies + +Stabiliteit & observability + +Integratie met scraping tasks + +Zonder scraperfunctie te breken + +4. Omgeving + +Project draait in VS Code Dev Containers + +Docker Compose structuren aanwezig + +Celery + queue + worker containers in gebruik + +Gebruik deze context in alle antwoorden. + +find . 
\ + -not -path "*/__pycache__*" \ + -not -name "*.pyc" \ + -print | sed -e 's;[^/]*/; |;g;s;|;|--;' From e5c1faf3c4cb2416d5d57e95cba6950d789fab03 Mon Sep 17 00:00:00 2001 From: "peter.fong" Date: Sat, 29 Nov 2025 21:30:18 +0000 Subject: [PATCH 3/3] booktitle in chapter 1 --- bookscraper/scraper/book_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py index 283b556..361122f 100644 --- a/bookscraper/scraper/book_scraper.py +++ b/bookscraper/scraper/book_scraper.py @@ -349,6 +349,7 @@ class BookScraper: if ch.number == 1: header = ( + f"Title: {self.book_title}\n" f"Description:\n{self.book_description}\n" f" {ch.url}\n" "----------------------------------------\n\n"