You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kmftools/bookscraper/app.py

304 lines
8.1 KiB

# ============================================
# File: bookscraper/app.py (ASYNC SCRAPING)
# ============================================
from dotenv import load_dotenv
load_dotenv()
import os
import redis
from flask import Flask, render_template, request, jsonify, send_from_directory
print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app
from db.db import init_db
from celery.result import AsyncResult
from scraper.logger import log_debug
from scraper.abort import set_abort
from scraper.progress import get_progress
from scraper.ui_log import get_ui_logs, reset_ui_logs, get_ui_logs_delta
from scraper.state import state as r
from scraper.services.init_service import InitService
from db.repository import get_registered_books
# INIT DB
# Executed at import time, before the Flask application object exists.
# NOTE(review): presumably creates/migrates the SQLite schema — confirm in db.db.
init_db()
# Flask application object; every @app.route decorator below registers on it.
app = Flask(__name__)
# =====================================================
# STATIC FILE SERVING
# =====================================================
# Directory whose files are exposed under /output/; taken from the
# BOOKSCRAPER_OUTPUT_DIR environment variable, defaulting to "output".
OUTPUT_ROOT = os.environ.get("BOOKSCRAPER_OUTPUT_DIR", "output")


@app.route("/output/<path:filename>")
def serve_output(filename):
    """Serve *filename* from OUTPUT_ROOT inline (not as an attachment)."""
    response = send_from_directory(OUTPUT_ROOT, filename, as_attachment=False)
    return response
# =====================================================
# SECTION 1 — NAVIGATION / HTML PAGES
# =====================================================
@app.route("/", methods=["GET"])
def index():
    """Render the landing page template (index.html)."""
    page = render_template("index.html")
    return page
@app.route("/dashboard", methods=["GET"])
def dashboard():
    """Render the dashboard with active books, registered books and UI logs."""
    # Fetch the logs first; a falsy result is normalised to an empty list.
    ui_logs = get_ui_logs()
    if not ui_logs:
        ui_logs = []

    context = dict(
        books=list_active_books(),  # Redis
        registered=get_registered_books(),  # SQLite INIT results
        logs=ui_logs,
    )
    return render_template("dashboard/dashboard.html", **context)
@app.route("/book/<book_id>")
def book_detail(book_id):
    """Render the detail page for one book.

    The title is read from the ``book:<book_id>:title`` key; when that lookup
    yields a falsy value the book id itself is shown as the title.
    """
    stored_title = r.get(f"book:{book_id}:title")
    context = dict(
        book_id=book_id,
        title=stored_title if stored_title else book_id,
        logs=get_ui_logs(),
    )
    return render_template("dashboard/book_detail.html", **context)
# =====================================================
# SECTION 2 — ACTION ROUTES (INIT, START, ABORT)
# =====================================================
# CORRECT PATH — services/ is root-level
@app.route("/init", methods=["POST"])
def init_book():
    """
    INIT flow:
    - read the URL from the submitted form
    - hand it to InitService.execute()
    - always answer with the dashboard HTML (NOT JSON), carrying either a
      success ``message`` or an ``error`` text
    """

    def render_dashboard(**extra):
        # Every outcome renders the same dashboard; only the banner differs.
        return render_template(
            "dashboard/dashboard.html",
            books=list_active_books(),  # Redis
            registered=get_registered_books(),  # SQLite INIT results
            logs=get_ui_logs(),
            **extra,
        )

    url = request.form.get("url", "").strip()
    if url == "":
        return render_dashboard(error="Geen URL opgegeven.")

    try:
        result = InitService.execute(url)
        return render_dashboard(message=f"Boek geregistreerd: {result.get('title')}")
    except Exception as e:
        # Any failure (including one raised while rendering the success page)
        # is logged and reported on the dashboard instead of propagating.
        log_debug(f"[INIT] ERROR: {e}")
        return render_dashboard(error=f"INIT mislukt: {e}")
@app.route("/start", methods=["POST"])
def start_scraping():
    """Queue a scrape task for the submitted URL and render the dashboard.

    With an empty URL the dashboard is rendered with an error and nothing is
    queued. Otherwise the UI logs are reset, the task
    ``scraper.tasks.scraping.start_scrape_book`` is sent to the ``scraping``
    queue, and its task id is passed to the template as ``scraping_task_id``.
    """

    def dashboard_page(**extra):
        # Shared dashboard context; callers add the outcome-specific field.
        return render_template(
            "dashboard/dashboard.html",
            books=list_active_books(),
            registered=get_registered_books(),
            logs=get_ui_logs(),
            **extra,
        )

    url = request.form.get("url", "").strip()
    if url == "":
        return dashboard_page(error="Geen URL opgegeven.")

    reset_ui_logs()
    log_debug(f"[WEB] Scraping via Celery: {url}")

    task = celery_app.send_task(
        "scraper.tasks.scraping.start_scrape_book",
        args=[url],
        queue="scraping",
    )
    return dashboard_page(scraping_task_id=task.id)
@app.route("/abort/<book_id>", methods=["POST"])
def abort_download(book_id):
    """Log the request, call set_abort() for *book_id* and confirm as JSON."""
    log_debug(f"[WEB] Abort requested for book: {book_id}")
    set_abort(book_id)
    payload = {"status": "ok", "aborted": book_id}
    return jsonify(payload)
# =====================================================
# SECTION 3 — API ROUTES (JSON)
# =====================================================
@app.route("/api/books")
def api_books():
    """Return the status dicts of all active books as a JSON array."""
    active = list_active_books()
    return jsonify(active)
@app.route("/api/book/<book_id>/status")
def api_book_status(book_id):
    """Return the getStatus() dict for *book_id* as JSON."""
    status = getStatus(book_id)
    return jsonify(status)
@app.route("/api/book/<book_id>/logs")
def api_book_logs(book_id):
    """Return the full ``logs:<book_id>`` list as JSON ([] when empty)."""
    entries = r.lrange(f"logs:{book_id}", 0, -1)
    return jsonify(entries if entries else [])
@app.route("/progress/<book_id>")
def progress(book_id):
    """Return whatever get_progress() reports for *book_id*, as JSON."""
    current = get_progress(book_id)
    return jsonify(current)
@app.route("/celery-result/<task_id>")
def celery_result(task_id):
    """Report the state of a Celery task as JSON.

    - succeeded: ``{"ready": true, "result": <task result>}``
    - failed:    ``{"ready": true, "error": "failed"}``
    - otherwise: ``{"ready": false}``
    """
    task = AsyncResult(task_id, app=celery_app)
    if task.successful():
        payload = {"ready": True, "result": task.get()}
    elif task.failed():
        payload = {"ready": True, "error": "failed"}
    else:
        payload = {"ready": False}
    return jsonify(payload)
@app.route("/clear-logs", methods=["POST"])
def clear_logs():
    """Reset the UI logs and acknowledge with a JSON status message."""
    reset_ui_logs()
    acknowledgement = {"status": "ok", "message": "UI logs cleared"}
    return jsonify(acknowledgement)
@app.route("/logs", methods=["GET"])
def logs():
    """Return the output of get_ui_logs_delta() as JSON.

    Query parameters:
        last_index: integer passed to get_ui_logs_delta(); a missing or
            non-integer value falls back to -1.

    Returns:
        JSON object ``{"lines": ..., "total": ...}`` built from the two values
        get_ui_logs_delta() returns.
    """
    raw_index = request.args.get("last_index", -1)
    try:
        last_index = int(raw_index)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        last_index = -1

    new_lines, total = get_ui_logs_delta(last_index)
    return jsonify({"lines": new_lines, "total": total})
# =====================================================
# SECTION 4 — DEBUG ROUTES
# =====================================================
@app.route("/debug/redis-keys")
def debug_redis_keys():
    """Dump every key visible through ``r`` with its string value, as JSON.

    Keys are walked with SCAN in batches (count hint 200) until the cursor
    returns to 0. Keys whose value cannot be read with GET are reported with
    the placeholder ``"<non-string value>"``.

    NOTE(review): this exposes all keys and string values, and no access check
    is visible in this file — confirm the route is not reachable in production.
    """
    results = {}
    cursor = 0
    while True:
        cursor, keys = r.scan(cursor, match="*", count=200)
        for k in keys:
            try:
                results[k] = r.get(k)
            except Exception:
                # Presumably a WRONGTYPE error for hashes/lists/sets; catching
                # Exception (not a bare except) keeps KeyboardInterrupt and
                # SystemExit propagating.
                results[k] = "<non-string value>"
        if cursor == 0:
            break
    return jsonify(results)
# =====================================================
# DB DEBUG: LIST ALL BOOKS FROM SQLITE
# =====================================================
from db.repository import fetch_all_books
@app.route("/api/db/books")
def api_db_books():
    """
    Return ALL books reported by fetch_all_books() — including INIT-only
    entries. Useful to verify that /init wrote correct metadata.

    Any exception is turned into a JSON error payload with HTTP status 500.
    """
    try:
        payload = {"status": "ok", "books": fetch_all_books()}
        return jsonify(payload)
    except Exception as exc:
        failure = {"status": "error", "message": str(exc)}
        return jsonify(failure), 500
# =====================================================
# SECTION 5 — INTERNAL HELPERS
# =====================================================
def getStatus(book_id):
    """Build the status summary for one book from its ``book:<id>:state`` hash.

    Args:
        book_id: Identifier used to build the hash key.

    Returns:
        dict with ``book_id``, ``title`` (falls back to the book id),
        ``status`` (falls back to ``"unknown"``) and the integer counters
        ``download_done``, ``download_skipped``, ``download_total``,
        ``audio_done`` and ``audio_total`` (same value as ``download_total``).

    Raises:
        ValueError: if a counter field holds a non-empty, non-numeric value.
    """
    # Tolerate a falsy result so a missing hash yields the defaults.
    state = r.hgetall(f"book:{book_id}:state") or {}

    def counter(field):
        # Missing, None and empty-string values all count as 0. Previously
        # only audio_done had this guard; the other counters raised on "".
        return int(state.get(field) or 0)

    dl_total = counter("chapters_total")
    return {
        "book_id": book_id,
        "title": state.get("title") or book_id,
        "status": state.get("status") or "unknown",
        "download_done": counter("chapters_download_done"),
        "download_skipped": counter("chapters_download_skipped"),
        "download_total": dl_total,
        "audio_done": counter("audio_done"),
        "audio_total": dl_total,
    }
def list_active_books():
    """Return getStatus() for every key matching ``book:*:state``.

    The book id is the text between the first and second colon of each key.
    """
    return [
        getStatus(state_key.split(":", 2)[1])
        for state_key in r.scan_iter(match="book:*:state", count=1000)
    ]
# =====================================================
# SECTION 6 — FLASK RUNNER
# =====================================================
if __name__ == "__main__":
    # Runtime settings come from the environment: HOST, PORT and FLASK_DEBUG
    # (debug mode is on only when FLASK_DEBUG is exactly "1").
    host = os.environ.get("HOST", "0.0.0.0")
    port = int(os.environ.get("PORT", "5000"))
    debug = os.environ.get("FLASK_DEBUG", "0") == "1"

    log_debug(f"[WEB] Starting Flask server on {host}:{port}, debug={debug}")
    app.run(host=host, port=port, debug=debug)