parent fe4ed78802
commit 8e2d3cec49
@@ -0,0 +1 @@
output/
@@ -1,64 +1,52 @@
-# app.py
-from flask import Flask, request, Response, render_template
-import time
-import queue
-import redis
-import os
-
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-from scraper.logger import add_listener, remove_listener, LOG_BUFFER
-
-app = Flask(__name__)
-
-r = redis.Redis.from_url("redis://redis:6379/0")
+# ============================================
+# File: bookscraper/app.py
+# Ensure project directory is on PYTHONPATH
+# ============================================
+
+from scraper.logger import log_debug
+from scraper.download_controller import DownloadController
+from flask import Flask, render_template, request
+from dotenv import load_dotenv
+import sys
+from pathlib import Path
+
+# Add this directory (bookscraper/) to Python import path
+PROJECT_ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+# Load .env BEFORE any Celery app is imported
+load_dotenv()
+
+app = Flask(__name__)

-@app.route("/")
-def index():
-    return render_template("index.html")
-
-# ----------------------------------------------------------
-# RUN SCRAPER
-# ----------------------------------------------------------
-@app.route("/run", methods=["POST"])
-def run_scraper():
-    data = request.json
-    site = BookSite()
-    scraper = BookScraper(site, data["url"])
-    result = scraper.execute()
-
-    return {
-        "title": result["title"],
-        "buffer": LOG_BUFFER.getvalue()
-    }
+@app.route("/", methods=["GET"])
+def index():
+    return render_template("index.html")

-# ----------------------------------------------------------
-# REALTIME LOG STREAM (SSE)
-# ----------------------------------------------------------
-@app.route("/stream")
-def stream():
-    def event_stream():
-        pub = r.pubsub()
-        pub.subscribe("logs")
-        for msg in pub.listen():
-            if msg["type"] == "message":
-                yield f"data: {msg['data'].decode()}\n\n"
-
-    return Response(event_stream(), mimetype="text/event-stream")
+@app.route("/start", methods=["POST"])
+def start_scraping():
+    url = request.form.get("url", "").strip()
+
+    if not url:
+        return render_template("result.html", error="Geen URL opgegeven.")
+
+    try:
+        log_debug(f"[WEB] Start scraping: {url}")
+
+        ctl = DownloadController(url)
+        result = ctl.start()
+
+        return render_template("result.html", result=result, url=url)
+
+    except Exception as e:
+        log_debug(f"[WEB] ERROR: {e}")
+        return render_template("result.html", error=str(e), url=url)

 # ----------------------------------------------------------

 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=5000, debug=debug)
+    import os
+    debug = os.getenv("FLASK_DEBUG", "0") == "1"
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", "5000"))
+
+    app.run(debug=debug, host=host, port=port)
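The new /start endpoint takes a form-encoded URL, as the HTML form below submits it. A quick request sketch against a locally running app, using the default host/port above and a placeholder URL:

# Editor's sketch, not part of the commit: exercise /start the way the form does.
import requests

resp = requests.post(
    "http://localhost:5000/start",
    data={"url": "https://example.com/book/12345"},  # placeholder URL
)
print(resp.status_code)  # 200 renders result.html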
@@ -0,0 +1,18 @@
from flask import Blueprint, request, jsonify
from scraper.download_controller import DownloadController

bp_download = Blueprint("download", __name__)


@bp_download.post("/api/download")
def api_download():
    data = request.get_json() or {}
    url = data.get("url")

    if not url:
        return jsonify({"error": "Missing URL"}), 400

    ctl = DownloadController(url)
    result = ctl.start()

    return jsonify(result), 200
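Nothing in this commit shows the blueprint being registered. A minimal wiring sketch using Flask's test client; the import path for bp_download is a placeholder, since the diff does not show the new file's name:

# Editor's sketch: register the blueprint and hit /api/download in-process.
from flask import Flask
from routes.download import bp_download  # hypothetical module path

app = Flask(__name__)
app.register_blueprint(bp_download)

with app.test_client() as client:
    resp = client.post("/api/download", json={"url": "https://example.com/book/1"})
    print(resp.status_code, resp.get_json())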
@@ -0,0 +1,39 @@
# scraper/download_controller.py

from logbus.publisher import log
from scraper.tasks.pipeline import build_chapter_pipeline


class DownloadController:

    def __init__(self, url: str):
        self.url = url
        self.scraper = None  # filled in by BookScraper
        self.base_path = None

    def start(self):
        log(f"[DL-CONTROLLER] Parsing metadata for {self.url}")

        # 1) Collect book info
        scraper = self.scraper = self._init_scraper()
        scraper.parse_book_info()

        # Determine base_path
        self.base_path = scraper.get_base_path()

        # 2) Fetch the chapter list
        chapters = scraper.get_chapter_list()

        # 3) Start a Celery pipeline per chapter
        for ch in chapters:
            log(f"[DL-CONTROLLER] Queue pipeline for chapter {ch.number}")

            workflow = build_chapter_pipeline(
                chapter_number=ch.number,
                chapter_url=ch.url,
                base_path=self.base_path
            )

            workflow.delay()  # 🔥 this starts the chain

        return {"status": "queued", "chapters": len(chapters)}
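start() calls self._init_scraper(), which is never defined in this diff. A plausible sketch of the missing helper, assuming it mirrors how BookScraper is constructed in tasks/scraping.py later in this commit:

# Editor's sketch of the unshown helper; an assumption, not commit code.
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite

class DownloadController:
    ...
    def _init_scraper(self):
        # Assumption: same constructor shape as scrape_book() uses below.
        return BookScraper(BookSite(), self.url)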
@@ -0,0 +1,33 @@
# scraper/tasks/download_tasks.py

from celery import shared_task
from logbus.publisher import log
import requests


@shared_task(bind=True, queue="download", ignore_result=False)
def download_chapter(self, chapter_number: int, chapter_url: str):
    """
    Download a chapter page and return raw HTML for parsing.
    Does NOT save anything; that is done by save_tasks.py
    """

    log(f"[DL] Downloading chapter {chapter_number}: {chapter_url}")

    try:
        resp = requests.get(chapter_url, timeout=15)
        resp.raise_for_status()
        html = resp.text

        log(f"[DL] OK {chapter_number}: {len(html)} bytes")

        # This result is passed on to parse_chapter
        return {
            "chapter": chapter_number,
            "url": chapter_url,
            "html": html,
        }

    except Exception as exc:
        log(f"[DL] ERROR downloading {chapter_url}: {exc}")
        raise
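Celery tasks remain plain callables, so the body can be smoke-tested without a broker; with bind=True, Celery supplies self automatically on a direct call. A sketch with a placeholder URL (logbus must be importable for log() to work):

# Editor's sketch: direct call runs the task body synchronously.
result = download_chapter(1, "https://example.com/0001.html")
print(result["chapter"], len(result["html"]))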
@@ -0,0 +1,57 @@
# scraper/tasks/parse_tasks.py

from celery import shared_task
from logbus.publisher import log
from scraper.utils import clean_text
from bs4 import BeautifulSoup


@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, result: dict):
    """
    Parse downloaded chapter HTML into clean text.

    In the chain, `result` is download_chapter's return value:
        {"chapter": ..., "url": ..., "html": ...}

    Returns a dict:
        {
            "chapter": chapter_number,
            "url": chapter_url,
            "text": "...parsed text..."
        }
    """
    html = result["html"]
    chapter_url = result["url"]

    try:
        log(f"[PARSE] Start parsing: {chapter_url}")

        soup = BeautifulSoup(html, "html.parser")

        # Many Chinese sites use content containers like these:
        possible_blocks = [
            "#content",
            ".content",
            "div#content",
            "div.content",
            "div#chaptercontent",
            "#chapterContent"
        ]

        node = None
        for sel in possible_blocks:
            r = soup.select_one(sel)
            if r:
                node = r
                break

        if not node:
            log(f"[PARSE] WARNING: no known content block found in {chapter_url}")
            text = clean_text(soup.get_text())
        else:
            text = clean_text(node.get_text())

        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")

        return {
            "chapter": result.get("chapter"),
            "url": chapter_url,
            "text": text,
        }

    except Exception as exc:
        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
        raise
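The selector fallback can be seen in isolation on synthetic HTML (walrus operator, Python 3.8+):

# Editor's sketch: try selectors in order, fall back to the full document.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="content">Hello world</div>', "html.parser")
node = next((n for sel in ("#content", ".content") if (n := soup.select_one(sel))), None)
print((node or soup).get_text())  # -> Hello world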
@@ -0,0 +1,28 @@
# scraper/tasks/pipeline.py

from celery import chain
from logbus.publisher import log

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter


def build_chapter_pipeline(chapter_number: int, chapter_url: str, base_path: str):
    """
    Build a Celery pipeline for a single chapter:
    download -> parse -> save
    """

    log(f"[PIPELINE] Building chain for chapter {chapter_number}")

    # Important: download returns dict {chapter, url, html};
    # parse receives that dict via its empty signature s();
    # save receives parse's result dict plus base_path.
    workflow = chain(
        download_chapter.s(chapter_number, chapter_url),
        parse_chapter.s(),  # takes previous result dict
        save_chapter.s(base_path=base_path)
    )

    return workflow
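How the chain threads arguments: each task's return value is prepended to the next signature's arguments. A broker-free illustration in eager mode; the toy tasks and app here are stand-ins, not commit code:

# Editor's sketch: eager-mode demo of chain() argument threading.
from celery import Celery, chain

app = Celery("demo")
app.conf.task_always_eager = True

@app.task
def download(n, url):
    return {"chapter": n, "url": url, "html": "<p>hi</p>"}

@app.task
def parse(result):
    return {**result, "text": "hi"}

@app.task
def save(result, base_path):
    return f"{base_path}/{result['chapter']:05d}.txt"

res = chain(download.s(1, "https://example.com/1.html"), parse.s(), save.s(base_path="/tmp"))()
print(res.get())  # -> /tmp/00001.txt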
@@ -0,0 +1,57 @@
# scraper/tasks/save_tasks.py

from celery import shared_task
from logbus.publisher import log
import os
import re


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, result: dict, base_path: str):
    """
    Save parsed chapter text to disk.
    result = {
        "chapter": ...,
        "url": ...,
        "text": ...
    }
    """
    text = result.get("text", "")
    url = result.get("url")

    try:
        # Prefer the chapter number carried through the pipeline;
        # fall back to extracting it from the URL,
        # e.g. .../12345.html ⇒ 12345
        chapter_number = result.get("chapter") or extract_chapter_number(url)

        os.makedirs(base_path, exist_ok=True)

        filename = f"{chapter_number:05d}.txt"
        path = os.path.join(base_path, filename)

        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log(f"[SAVE] Saved chapter {chapter_number} → {path}")

        return {"chapter": chapter_number, "path": path}

    except Exception as exc:
        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
        raise


def extract_chapter_number(url: str) -> int:
    """
    Utility extractor for chapter numbers from a URL.
    Example: https://site.com/1234.html → 1234
    """
    try:
        m = re.search(r'(\d+)\.html?', url)
        if m:
            return int(m.group(1))
    except Exception:
        pass
    return 0
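The fallback extractor is easy to sanity-check:

# Editor's sketch: expected behaviour of the URL fallback.
print(extract_chapter_number("https://site.com/1234.html"))  # 1234
print(extract_chapter_number("https://site.com/about"))      # 0 (no match)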
@@ -0,0 +1,35 @@
from celery import shared_task
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from logbus.publisher import log


@shared_task(bind=True, queue="scraping")
def scrape_book(self, url):
    """
    HIGH-LEVEL SCRAPER TASK
    Calls the synchronous BookScraper for a full scrape.
    """
    log(f"[SCRAPER] Start scrape: {url}")

    scraper = BookScraper(BookSite(), url)
    result = scraper.execute()

    log(f"[SCRAPER] Finished scrape: {url}")
    return {"title": result["title"]}


@shared_task(bind=True, queue="download", max_retries=5)
def download_chapter_task(self, number, title, url, output_base):
    """
    Downloads a single chapter only.
    download_worker.py ultimately executes this.
    """
    from worker.download_worker import download_single_chapter

    try:
        return download_single_chapter(number, title, url, output_base)

    except Exception as e:
        log(f"[DOWNLOAD] Error while downloading chapter {number}: {e}")
        raise self.retry(countdown=3)
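self.retry(countdown=3) re-queues the task after three seconds, up to max_retries=5 attempts, before the failure is surfaced. Dispatching the high-level task is then a one-liner; a sketch assuming a running broker and worker, with a placeholder URL:

# Editor's sketch: asynchronous dispatch of scrape_book.
async_result = scrape_book.delay("https://example.com/book/12345")
print(async_result.id)  # poll or track this id as needed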
@@ -1,10 +0,0 @@
# tasks/pipeline.py
from celery import shared_task
from tasks.scraping import scrape_book
from tasks.audio import text_to_audio


@shared_task(bind=True)
def scrape_and_convert(self, url):
    result = scrape_book.delay(url)
    return result.id
@@ -1,17 +0,0 @@
# tasks/scraping.py
from celery import shared_task
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from logbus.publisher import log


@shared_task(bind=True, queue="scraping")
def scrape_book(self, url):
    log(f"START scraping: {url}")

    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()

    log(f"FINISHED scraping: {url}")
    return {"title": result["title"]}
@@ -1,40 +1,34 @@
 <!DOCTYPE html>
-<html>
+<html lang="nl">
 <head>
+    <meta charset="UTF-8">
     <title>BookScraper</title>
     <style>
-        body { font-family: Arial; padding:20px; }
-        #log { background:#000; color:#0f0; padding:10px; height:400px; overflow:auto; white-space:pre-wrap; }
+        body { font-family: Arial, sans-serif; padding: 40px; max-width: 600px; margin: auto; }
+        h1 { margin-bottom: 20px; }
+        input[type="text"] {
+            width: 100%; padding: 12px; font-size: 16px;
+            border: 1px solid #ccc; border-radius: 6px;
+        }
+        button {
+            margin-top: 20px;
+            padding: 12px 20px;
+            background: #007bff; color: white;
+            border: none; border-radius: 6px;
+            font-size: 16px; cursor: pointer;
+        }
+        button:hover { background: #0056b3; }
     </style>
 </head>
 <body>

-<h1>BookScraper</h1>
+<h1>BookScraper WebGUI</h1>

-<input id="url" type="text" placeholder="Book URL" style="width:400px">
-<button onclick="startScrape()">Start</button>
-
-<h2>Realtime log:</h2>
-<div id="log"></div>
-
-<script>
-function startScrape() {
-    document.getElementById("log").innerHTML = "";
-
-    const evtSource = new EventSource("/stream");
-    evtSource.onmessage = function(e) {
-        const logDiv = document.getElementById("log");
-        logDiv.innerText += e.data + "\n";
-        logDiv.scrollTop = logDiv.scrollHeight;
-    };
-
-    fetch("/run", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ url: document.getElementById("url").value })
-    });
-}
-</script>
+<form action="/start" method="POST">
+    <label for="url">Geef een boek-URL op:</label><br><br>
+    <input type="text" id="url" name="url" placeholder="https://example.com/book/12345" required>
+    <button type="submit">Start Scraping</button>
+</form>

 </body>
 </html>
@@ -1,18 +1,63 @@
 <!DOCTYPE html>
-<html>
+<html lang="nl">
 <head>
-    <title>Scrape Done</title>
+    <meta charset="UTF-8">
+    <title>Scrape Resultaat</title>
+    <style>
+        body { font-family: Arial, sans-serif; padding: 40px; max-width: 900px; margin: auto; }
+        h1 { margin-bottom: 10px; }
+        .error { padding: 15px; background: #ffdddd; border-left: 5px solid #ff4444; margin-bottom: 20px; }
+        .box { padding: 15px; background: #f7f7f7; border: 1px solid #ddd; margin-bottom: 20px; border-radius: 6px; }
+        a { color: #007bff; text-decoration: none; }
+        a:hover { text-decoration: underline; }
+        pre { background: #222; color: #eee; padding: 10px; border-radius: 6px; overflow-x: auto; }
+        small { color: #555; }
+    </style>
 </head>
 <body>

-<h1>Scrape Complete</h1>
-
-<p><strong>Book title:</strong> {{ title }}</p>
-
-<p><strong>Output folder:</strong></p>
-<pre>{{ basepath }}</pre>
-
-<a href="/">Scrape another book</a>
+<a href="/">← Terug</a>
+
+{% if error %}
+<div class="error">
+    <strong>Fout:</strong><br>{{ error }}
+</div>
+{% endif %}
+
+<h1>Scrape resultaat</h1>
+
+{% if result %}
+<div class="box">
+    <strong>Titel:</strong> {{ result.title }}<br>
+    <strong>Auteur:</strong> {{ result.author }}<br>
+</div>
+
+{% if result.description %}
+<div class="box">
+    <strong>Beschrijving:</strong><br>
+    <p>{{ result.description }}</p>
+</div>
+{% endif %}
+
+<div class="box">
+    <strong>Aantal chapters:</strong> {{ result.chapters|length }}
+</div>
+
+{% if result.chapters %}
+<div class="box">
+    <strong>Chapters:</strong><br><br>
+    <ul>
+    {% for ch in result.chapters %}
+        <li>
+            <a href="{{ ch.url }}" target="_blank">
+                Chapter {{ ch.number }} — {{ ch.title }}
+            </a>
+        </li>
+    {% endfor %}
+    </ul>
+</div>
+{% endif %}
+{% endif %}

 </body>
 </html>
@@ -0,0 +1,11 @@
# worker/download_worker.py

from worker.downloader import ChapterDownloader


def download_single_chapter(number, title, url, output_base):
    """
    Called by Celery task.
    """
    dl = ChapterDownloader()
    return dl.download(number, title, url, output_base)
@@ -0,0 +1,139 @@
# worker/downloader.py

import os
import time
import requests
from bs4 import BeautifulSoup, Tag
from scraper.logger import log_debug
from scraper.utils import clean_text
from urllib.parse import urljoin


class ChapterDownloader:
    """
    Worker-side chapter downloader.
    - No metadata scraping
    - No BookScraper dependency
    - Just: GET → parse → text → save
    """

    def __init__(self, min_delay=1.0):
        self.min_delay = min_delay
        self._last_download_time = 0

    # ------------------------------------------------------------
    def throttle(self):
        now = time.time()
        elapsed = now - self._last_download_time

        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)

        self._last_download_time = time.time()

    # ------------------------------------------------------------
    def get_doc_with_retry(self, url):
        attempt = 1

        while True:
            self.throttle()
            log_debug(f"[DL] GET {url} (attempt {attempt})")

            try:
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except Exception as e:
                log_debug(f"[DL] Network error {e} → retry")
                attempt += 1
                time.sleep(2)
                continue

            code = resp.status_code

            if code == 200:
                resp.encoding = "utf-8"
                return BeautifulSoup(resp.text, "lxml")

            if code == 429:
                log_debug("[DL] 429 cooldown 60s")
                time.sleep(60)
                attempt += 1
                continue

            if code in (403, 500):
                log_debug(f"[DL] HTTP {code} → retry")
                time.sleep(5)
                attempt += 1
                continue

            log_debug(f"[DL] Unexpected HTTP {code}")
            time.sleep(3)
            attempt += 1

    # ------------------------------------------------------------
    def parse_chapter_text(self, soup):
        """
        Copy of BookScraper.parse_chapter_text,
        BUT without dependencies on parse_title, parse_author, etc.
        """
        body = soup.body
        if not body:
            return ""

        h1 = body.find("h1")
        if not h1:
            return ""

        parts = []
        collecting = False

        for sib in h1.next_siblings:
            # class lookups must go through Tag.get(); plain strings in
            # next_siblings carry no attributes at all
            if isinstance(sib, Tag) and sib.get("class") == ["toplink"]:
                continue
            if isinstance(sib, Tag) and sib.get("class") == ["bottomlink"]:
                break
            if getattr(sib, "name", None) in ["script", "style"]:
                continue

            if not collecting:
                if getattr(sib, "name", None) == "br":
                    collecting = True
                continue

            text = sib.get_text("\n", strip=True) if hasattr(
                sib, "get_text") else str(sib).strip()
            if text:
                parts.append(text)

        raw = "\n".join(parts)
        return clean_text(raw, {})

    # ------------------------------------------------------------
    def save_chapter(self, number, title, text, output_base):
        """
        Save chapter using same volume logic as BookScraper.
        """
        max_size = 200
        volume = ((number - 1) // max_size) + 1
        vdir = f"{output_base}/v{volume}"

        os.makedirs(vdir, exist_ok=True)

        fname = f"{number:05d}_{title}.txt"
        full = f"{vdir}/{fname}"

        with open(full, "w", encoding="utf-8") as f:
            f.write(text)

        log_debug(f"[DL] Saved chapter {number}: {full}")
        return full

    # ------------------------------------------------------------
    def download(self, number, title, url, output_base):
        soup = self.get_doc_with_retry(url)
        text = self.parse_chapter_text(soup)
        return self.save_chapter(number, title, text, output_base)
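Usage sketch with placeholder arguments: consecutive download() calls are spaced at least min_delay seconds apart by throttle(), and chapters land in 200-chapter volume directories:

# Editor's sketch: URL and output path are placeholders.
dl = ChapterDownloader(min_delay=2.0)
path = dl.download(1, "intro", "https://example.com/0001.html", "output/book")
print(path)  # e.g. output/book/v1/00001_intro.txt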