You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
533 lines
14 KiB
533 lines
14 KiB
# ============================================
|
|
# File: bookscraper/app.py (ASYNC SCRAPING)
|
|
# ============================================
|
|
from dotenv import load_dotenv
|
|
|
|
load_dotenv()
|
|
|
|
import os
|
|
from flask import (
|
|
Flask,
|
|
render_template,
|
|
request,
|
|
jsonify,
|
|
send_from_directory,
|
|
redirect,
|
|
url_for,
|
|
)
|
|
|
|
print(">>> [WEB] Importing celery_app …")
|
|
from celery_app import celery_app
|
|
from celery.result import AsyncResult
|
|
|
|
from db.db import init_db
|
|
from db.repository import (
|
|
get_registered_books,
|
|
fetch_book,
|
|
fetch_all_books,
|
|
get_progress,
|
|
)
|
|
|
|
from logbus.publisher import log
|
|
from scraper.logger import log_debug
|
|
from scraper.abort import set_abort
|
|
from scraper.ui_log import get_ui_logs, reset_ui_logs
|
|
from scraper.state import state as r
|
|
from scraper.logger_decorators import logcall
|
|
from scraper.utils.state_sync import sync_books_from_redis
|
|
from scraper.services.init_service import InitService
|
|
|
|
|
|
# INIT DB
|
|
init_db()
|
|
|
|
app = Flask(__name__)
|
|
|
|
# =====================================================
|
|
# STATIC FILE SERVING
|
|
# =====================================================
|
|
OUTPUT_ROOT = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
|
|
|
|
|
|
@app.route("/output/<path:filename>")
@logcall
def serve_output(filename):
    """Serve a generated file from the scraper output directory.

    ``filename`` is the path captured from the URL and is resolved
    relative to ``OUTPUT_ROOT``. The file is delivered inline rather than
    as a download attachment.
    """
    response = send_from_directory(
        OUTPUT_ROOT,
        filename,
        as_attachment=False,
    )
    return response
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 1 — NAVIGATION / HTML PAGES
|
|
# =====================================================
|
|
|
|
|
|
@app.route("/", methods=["GET"])
@logcall
def index():
    """Redirect the site root to the dashboard page."""
    dashboard_url = url_for("dashboard")
    return redirect(dashboard_url)
|
|
|
|
|
|
@app.route("/dashboard", methods=["GET"])
@logcall
def dashboard():
    """Render the main dashboard page.

    The template receives:
      * ``books``      - status dicts built by ``list_active_books()``
      * ``registered`` - the result of ``get_registered_books()``
      * ``logs``       - the current UI log lines (empty list when none)

    The registered books are also written to the application log.
    """
    logs_list = get_ui_logs() or []
    registered_books = get_registered_books()
    log(f"[WEB] Registered books: {registered_books}")

    # NOTE(review): /init and /start drop books whose status is "hidden"
    # before rendering; this view passes the list through unfiltered.
    # Confirm that is intended.
    return render_template(
        "dashboard/dashboard.html",
        books=list_active_books(),
        registered=registered_books,
        logs=logs_list,
    )
|
|
|
|
|
|
@app.route("/book/<book_idx>")
@logcall
def book_detail(book_idx):
    """Render the detail page for a single book.

    The title is read from the ``book:<book_idx>:title`` key of the state
    store; when nothing is stored the book index itself is shown instead.
    """
    stored_title = r.get(f"book:{book_idx}:title")
    context = {
        "book_id": book_idx,
        "title": stored_title if stored_title else book_idx,
        "logs": get_ui_logs(),
    }
    return render_template("dashboard/book_detail.html", **context)
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT, HIDE)
|
|
# =====================================================
|
|
|
|
|
|
@app.route("/init", methods=["POST"])
@logcall
def init_book():
    """Register one or more books from submitted URLs.

    Reads the ``urls`` form field (one URL per line) and falls back to the
    legacy single-value ``url`` field. Every URL is handled independently:

      * skipped when its derived book id is already known,
      * registered through ``InitService.execute`` otherwise,
      * reported as an error when anything raises for that URL.

    The dashboard template is rendered again with a summary message and
    the per-URL results. When no URL was submitted the dashboard is
    rendered with an error message instead.
    """
    raw_urls = request.form.get("urls") or request.form.get("url") or ""
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]

    if not urls:
        return render_template(
            "dashboard/dashboard.html",
            error="Geen URL(s) opgegeven.",
            books=list_active_books(),
            registered=get_registered_books(),
            logs=get_ui_logs(),
        )

    # Ids already stored in the database. The set is extended while the
    # batch is processed so that a URL repeated within the same submission
    # is skipped instead of being registered a second time.
    known_ids = {b["book_idx"] for b in fetch_all_books()}
    results = []

    for url in urls:
        try:
            book_id = InitService.derive_book_id(url)

            if book_id in known_ids:
                results.append(
                    {
                        "url": url,
                        "status": "skipped",
                        "book_id": book_id,
                        "message": "Al geregistreerd",
                    }
                )
                continue

            result = InitService.execute(url)
            known_ids.add(book_id)

            results.append(
                {
                    "url": url,
                    "status": "registered",
                    "book_id": result.get("book_id"),
                    "title": result.get("title"),
                }
            )

        except Exception as e:
            # One failing URL must not abort the rest of the batch.
            log_debug(f"[INIT] ERROR for url={url}: {e}")
            results.append(
                {
                    "url": url,
                    "status": "error",
                    "error": str(e),
                }
            )

    # Tally the outcomes for the summary message. The loop variable is not
    # called ``r`` to avoid confusion with the module-level state client.
    tally = {"registered": 0, "skipped": 0, "error": 0}
    for entry in results:
        tally[entry["status"]] += 1

    ok = tally["registered"]
    skipped = tally["skipped"]
    failed = tally["error"]
    message = f"Geregistreerd: {ok}, overgeslagen: {skipped}, fouten: {failed}"

    # Hidden books are not shown in the registered list.
    reg = [b for b in get_registered_books() if b.get("status") != "hidden"]

    return render_template(
        "dashboard/dashboard.html",
        message=message,
        init_results=results,  # optional, for display in the UI
        books=list_active_books(),
        registered=reg,
        logs=get_ui_logs(),
    )
|
|
|
|
|
|
@app.route("/hide/<book_idx>", methods=["POST"])
@logcall
def hide_registered_book(book_idx):
    """Placeholder for hiding a registered book (feature disabled).

    Hiding is intentionally not implemented. A Flask view must still
    return a response: returning ``None`` makes Flask raise a
    ``TypeError`` and answer with a 500. The disabled state is therefore
    reported explicitly with HTTP 501 (Not Implemented).
    """
    payload = {
        "status": "error",
        "message": "Verbergen is uitgeschakeld",
        "book_idx": book_idx,
    }
    return jsonify(payload), 501
|
|
|
|
|
|
@app.route("/start", methods=["POST"])
@logcall
def start_scraping():
    """Dispatch a full scrape for the book named in the ``book_idx`` form field.

    Steps:
      1. read ``book_idx`` from the form (400 when missing),
      2. load the book via ``fetch_book`` (500 on failure, 404 when absent),
      3. require a ``book_url`` on the book (500 when missing),
      4. reset the UI logs,
      5. send the ``start_full_scrape`` controller task to Celery
         (500 when dispatching fails),
      6. render the dashboard with the id of the dispatched task.

    Error cases answer with a JSON body of the form
    ``{"status": "error", "message": ...}``.
    """

    def reject(message, http_code):
        # Log a validation problem and build the matching JSON error reply.
        log(f"[WEB][START] ERROR: {message}")
        return jsonify({"status": "error", "message": message}), http_code

    # 1) Form field: book_idx
    book_idx = request.form.get("book_idx")
    log(f"[WEB][START] Received start request for book_idx={book_idx}")
    if not book_idx:
        return reject("book_idx ontbreekt in formulier", 400)

    # 2) Fetch the book from the database
    try:
        book = fetch_book(book_idx)
        log(f"[WEB][START] Fetched book from DB: {book}")
    except Exception as e:
        log(f"[WEB][START] DB ERROR: {e}")
        return jsonify({"status": "error", "message": "DB fout"}), 500

    if not book:
        return reject(f"Boek '{book_idx}' niet gevonden in DB", 404)

    # 3) The book must have a URL
    url = book.get("book_url")
    if not url:
        return reject(f"Boek '{book_idx}' heeft geen book_url in DB", 500)

    # 4) Reset UI logs
    reset_ui_logs()

    # 5) Logging
    log(f"[WEB][START] Starting full scrape book_idx={book_idx}, url={url}")
    log_debug(f"[WEB][START] DEBUG: book data = {book}")

    # 6) Start the Celery controller task
    try:
        async_result = celery_app.send_task(
            "scraper.tasks.controller_tasks.start_full_scrape",
            args=[book_idx],
            queue="controller",
        )
    except Exception as e:
        log(f"[WEB][START] Celery ERROR: {e}")
        return jsonify({"status": "error", "message": f"Celery fout: {e}"}), 500

    # 7) Task dispatched successfully
    log(f"[WEB][START] Task dispatched: {async_result.id}")

    visible_books = [
        entry
        for entry in get_registered_books()
        if entry.get("status") != "hidden"
    ]

    return render_template(
        "dashboard/dashboard.html",
        scraping_task_id=async_result.id,
        books=list_active_books(),
        registered=visible_books,
        logs=get_ui_logs(),
    )
|
|
|
|
|
|
@app.route("/abort/<book_idx>", methods=["POST"])
@logcall
def abort_download(book_idx):
    """Set the abort flag for ``book_idx`` and confirm with a JSON reply."""
    log_debug(f"[WEB] Abort requested for book: {book_idx}")
    set_abort(book_idx)
    payload = {"status": "ok", "aborted": book_idx}
    return jsonify(payload)
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 3 — API ROUTES (JSON)
|
|
# =====================================================
|
|
|
|
|
|
@app.route("/api/state/all", methods=["GET"])
@logcall
def api_state_all():
    """Return the output of ``inspect_books_state()`` as JSON.

    This is the JSON-only counterpart of the ``/debug/inspect_state``
    page, which renders the same data as HTML.
    """
    from scraper.utils.state_sync import inspect_books_state

    merged_state = inspect_books_state()
    return jsonify(merged_state)
|
|
|
|
|
|
@app.route("/api/books")
@logcall
def api_books():
    """Return the status of all active books as a JSON list."""
    active = list_active_books()
    return jsonify(active)
|
|
|
|
|
|
@app.route("/api/book/<book_idx>/status")
@logcall
def api_book_status(book_idx):
    """Return the status dict of one book, as built by ``getStatus``, as JSON."""
    status = getStatus(book_idx)
    return jsonify(status)
|
|
|
|
|
|
@app.route("/api/book/<book_idx>/logs")
@logcall
def api_book_logs(book_idx):
    """Return every entry of the ``logs:<book_idx>`` list as JSON.

    An empty list is returned when the list holds nothing.
    """
    entries = r.lrange(f"logs:{book_idx}", 0, -1)
    if not entries:
        entries = []
    return jsonify(entries)
|
|
|
|
|
|
@app.route("/progress/<book_idx>")
@logcall
def progress(book_idx):
    """Return the result of ``get_progress(book_idx)`` as JSON."""
    current = get_progress(book_idx)
    return jsonify(current)
|
|
|
|
|
|
@app.route("/celery-result/<task_id>")
@logcall
def celery_result(task_id):
    """Report the state of a Celery task as JSON.

    Replies:
      * ``{"ready": True, "result": ...}`` when the task succeeded,
      * ``{"ready": True, "error": "failed"}`` when it failed,
      * ``{"ready": False}`` in every other state.
    """
    task = AsyncResult(task_id, app=celery_app)

    if task.successful():
        payload = {"ready": True, "result": task.get()}
    elif task.failed():
        payload = {"ready": True, "error": "failed"}
    else:
        payload = {"ready": False}

    return jsonify(payload)
|
|
|
|
|
|
@app.route("/clear-logs", methods=["POST"])
@logcall
def clear_logs():
    """Reset the UI log buffer and acknowledge with ``{"status": "ok"}``."""
    reset_ui_logs()
    acknowledgement = {"status": "ok"}
    return jsonify(acknowledgement)
|
|
|
|
|
|
@app.route("/logs", methods=["GET"])
@logcall
def logs():
    """Return the UI log lines that follow a given index.

    Query parameters:
        last_index: index of the last line the client already has.
            Defaults to -1 (nothing seen yet); a value that is not an
            integer is treated the same way.

    Returns:
        JSON ``{"lines": [...], "last": <int>}`` where ``lines`` holds
        every log line with an index greater than ``last_index`` and
        ``last`` is the index of the final returned line, or the
        unchanged ``last_index`` when there is nothing new.
    """
    try:
        last_index = int(request.args.get("last_index", -1))
    except (TypeError, ValueError):
        # Only a malformed value should fall back to the default; other
        # exceptions are not swallowed.
        last_index = -1

    all_logs = list(get_ui_logs() or [])

    # Clamp the start to 0 so a negative last_index returns every line
    # rather than being read as an offset from the end of the list.
    start = max(last_index + 1, 0)
    new_lines = all_logs[start:]
    new_last = len(all_logs) - 1 if new_lines else last_index

    return jsonify({"lines": new_lines, "last": new_last})
|
|
|
|
|
|
from flask import render_template
|
|
from scraper.services.status_check_service import StatusCheckService
|
|
from logbus.publisher import log
|
|
|
|
|
|
from db.repository import get_book_state
|
|
|
|
|
|
@app.route("/inspect/statuscheck/<book_idx>", methods=["POST"])
@logcall
def inspect_statuscheck(book_idx):
    """Run ``StatusCheckService`` for one book.

    Replies with an empty 204 on success (background action, no UI) and
    with a JSON error body and status 500 when the service raises.
    """
    try:
        StatusCheckService.run(book_idx)
    except Exception as e:
        log(f"[STATUSCHECK] ERROR book_idx={book_idx}: {e}")
        failure = {"error": str(e)}
        return jsonify(failure), 500
    else:
        return ("", 204)
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 4 — DEBUG ROUTES
|
|
# =====================================================
|
|
|
|
|
|
@app.route("/debug/sync_state", methods=["GET"])
def debug_sync_state():
    """Run ``sync_books_from_redis()`` and return its result in a dict reply."""
    return dict(status="ok", synced=sync_books_from_redis())
|
|
|
|
|
|
from scraper.utils.state_sync import inspect_books_state
|
|
|
|
|
|
@app.route("/debug/inspect_state", methods=["GET"])
def debug_inspect_state():
    """Render the output of ``inspect_books_state()`` as an HTML debug page."""
    return render_template(
        "debug/inspect_state.html",
        results=inspect_books_state(),
    )
|
|
|
|
|
|
@app.route("/debug/redis-keys")
@logcall
def debug_redis_keys():
    """Dump every key of the state store with its string value as JSON.

    Keys whose value cannot be read with ``get`` are reported with the
    placeholder ``"<non-string value>"``.

    NOTE(review): this exposes the full contents of the store; make sure
    the route is not reachable in production.
    """
    results = {}
    # scan_iter drives the SCAN cursor internally, replacing the manual
    # cursor loop.
    for key in r.scan_iter(match="*", count=200):
        try:
            results[key] = r.get(key)
        except Exception:
            # Narrowed from a bare except, which would also swallow
            # KeyboardInterrupt and SystemExit.
            results[key] = "<non-string value>"
    return jsonify(results)
|
|
|
|
|
|
# =====================================================
|
|
# DB DEBUG
|
|
# =====================================================
|
|
|
|
|
|
@app.route("/api/db/books")
|
|
@logcall
|
|
def api_db_books():
|
|
try:
|
|
books = fetch_all_books()
|
|
return jsonify({"status": "ok", "books": books})
|
|
except Exception as e:
|
|
return jsonify({"status": "error", "message": str(e)}), 500
|
|
|
|
|
|
# =============================================
|
|
# DEBUG QUEUE VIEW (HTML)
|
|
# =============================================
|
|
|
|
from flask import render_template
|
|
from urllib.parse import urlparse
|
|
import redis
|
|
from celery_app import celery_app
|
|
|
|
|
|
@app.route("/debug/queues")
def debug_queues():
    """Render an HTML overview of Celery workers and broker queues.

    Worker information comes from ``celery_app.control.inspect()``. Queue
    contents are read directly from the Redis instance named by the
    ``REDIS_BROKER`` environment variable: for every known queue the list
    length and its first 31 entries are collected. A failure for a queue
    (including a missing or invalid ``REDIS_BROKER``) is reported in that
    queue's row instead of failing the whole page.
    """
    insp = celery_app.control.inspect()

    # Fall back to an empty mapping when an inspect call yields nothing.
    workers_active = insp.active() or {}
    workers_scheduled = insp.scheduled() or {}
    workers_reserved = insp.reserved() or {}

    queue_names = ["scraping", "controller", "download", "parse", "save", "audio"]

    # from_url handles host, port, db, credentials and TLS, which manual
    # urlparse handling dropped or crashed on.
    client = None
    setup_error = None
    redis_url = os.getenv("REDIS_BROKER")
    if not redis_url:
        setup_error = "REDIS_BROKER is not set"
    else:
        try:
            client = redis.Redis.from_url(redis_url, decode_responses=True)
        except Exception as e:
            setup_error = str(e)

    queues = []
    try:
        for q in queue_names:
            # NOTE(review): assumes the broker stores each queue under
            # "celery:<name>" - confirm against the Celery/Kombu config.
            key = f"celery:{q}"
            row = {"name": q, "redis_key": key}

            if client is None:
                row.update(length="ERR", items=[setup_error])
            else:
                try:
                    row.update(
                        length=client.llen(key),
                        items=client.lrange(key, 0, 30),
                    )
                except Exception as e:
                    row.update(length="ERR", items=[str(e)])

            queues.append(row)
    finally:
        # Release the connections opened for this request; a new client is
        # created on every call.
        if client is not None:
            client.connection_pool.disconnect()

    return render_template(
        "debug/queues.html",
        queues=queues,
        workers_active=workers_active,
        workers_reserved=workers_reserved,
        workers_scheduled=workers_scheduled,
    )
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 5 — INTERNAL HELPERS
|
|
# =====================================================
|
|
|
|
|
|
def _state_int(state, field):
    """Return ``state[field]`` as an int; 0 when missing, empty or non-numeric."""
    try:
        return int(state.get(field) or 0)
    except (TypeError, ValueError):
        return 0


@logcall
def getStatus(book_idx):
    """Build the status dict for one book from its state hash.

    Reads the ``book:<book_idx>:state`` hash and returns a dict with the
    keys ``book_id``, ``title``, ``status``, ``download_done``,
    ``download_skipped``, ``download_total``, ``audio_done`` and
    ``audio_total``.

    All counters are parsed the same tolerant way: a missing, empty or
    non-numeric field counts as 0. One malformed hash therefore cannot
    break the listings that call this helper for every book. ``status``
    defaults to ``"unknown"`` and ``title`` to the book index.
    """
    state = r.hgetall(f"book:{book_idx}:state") or {}

    dl_total = _state_int(state, "chapters_total")

    return {
        "book_id": book_idx,
        "title": state.get("title") or book_idx,
        "status": state.get("status") or "unknown",
        "download_done": _state_int(state, "chapters_download_done"),
        "download_skipped": _state_int(state, "chapters_download_skipped"),
        "download_total": dl_total,
        "audio_done": _state_int(state, "audio_done"),
        # The audio total mirrors the chapter total.
        "audio_total": dl_total,
    }
|
|
|
|
|
|
@logcall
def list_active_books():
    """Return the status dict of every book that has a state hash.

    Scans the store for ``book:*:state`` keys, takes the segment between
    the first and second colon as the book index, and builds each entry
    with ``getStatus``. An incremental scan may report the same key more
    than once, so every book index is processed only the first time it is
    seen.
    """
    books = []
    seen = set()
    for key in r.scan_iter(match="book:*:state", count=1000):
        # "book:<idx>:state" -> "<idx>"
        book_idx = key.split(":", 2)[1]
        if book_idx in seen:
            continue
        seen.add(book_idx)
        books.append(getStatus(book_idx))
    return books
|
|
|
|
|
|
# =====================================================
|
|
# SECTION 6 — FLASK RUNNER
|
|
# =====================================================
|
|
|
|
def _run_server():
    """Start the Flask server using settings from the environment.

    ``FLASK_DEBUG`` (debug only when exactly "1"), ``HOST`` (default
    "0.0.0.0") and ``PORT`` (default 5000) are read from the environment.
    """
    settings = {
        "host": os.getenv("HOST", "0.0.0.0"),
        "port": int(os.getenv("PORT", "5000")),
        "debug": os.getenv("FLASK_DEBUG", "0") == "1",
    }
    host, port, debug = settings["host"], settings["port"], settings["debug"]

    log_debug(f"[WEB] Starting Flask server on {host}:{port}, debug={debug}")
    app.run(**settings)


if __name__ == "__main__":
    _run_server()
|