commit cbfcce62cc (parent 8e2d3cec49)
@@ -1,45 +1,47 @@
-# ============================================
-# File: bookscraper/celery_app.py
-# ============================================
+# celery_app.py
 
 import os
 from celery import Celery
 from dotenv import load_dotenv
 
-# Load environment variables (OK to do here)
+print(">>> [celery_app] Loading .env BEFORE initializing Celery...")
 load_dotenv()
 
-print(">>> DEBUG: celery_app.py LOADED")
-print(">>> DEBUG: env REDIS_BROKER =", os.getenv("REDIS_BROKER"))
-print(">>> DEBUG: env REDIS_URL =", os.getenv("REDIS_URL"))
-
-# Read broker settings
-REDIS_BROKER = os.getenv("REDIS_BROKER")
-REDIS_BACKEND = os.getenv("REDIS_BACKEND")
-
-# Fallback ONLY if missing
-if not REDIS_BROKER:
-    REDIS_BROKER = os.getenv(
-        "REDIS_URL", "redis://host.docker.internal:6379/0"
-    )
-
-if not REDIS_BACKEND:
-    REDIS_BACKEND = REDIS_BROKER  # safe fallback
-
-# Create Celery app AFTER loading .env
+BROKER = os.getenv("REDIS_BROKER")
+BACKEND = os.getenv("REDIS_BACKEND")
+
+print(">>> [celery_app] BROKER =", BROKER)
+print(">>> [celery_app] BACKEND =", BACKEND)
+
 celery_app = Celery(
     "bookscraper",
-    broker=REDIS_BROKER,
-    backend=REDIS_BACKEND,
+    broker=BROKER,
+    backend=BACKEND,
+    include=[
+        "scraper.tasks.scraping",
+        "scraper.tasks.controller_tasks",
+        "scraper.tasks.download_tasks",
+        "scraper.tasks.parse_tasks",
+        "scraper.tasks.save_tasks",
+    ],
 )
 
-celery_app.conf.update(
-    task_default_queue="default",
-    task_routes={
-        "tasks.scraping.*": {"queue": "scraping"},
-        "tasks.audio.*": {"queue": "audio"},
-        "tasks.*": {"queue": "default"},
-    },
-    worker_prefetch_multiplier=1,
-    task_acks_late=True,
-)
+celery_app.conf.task_routes = {
+    "scraper.tasks.scraping.*": {"queue": "scraping"},
+    "scraper.tasks.controller_tasks.*": {"queue": "controller"},
+    "scraper.tasks.download_tasks.*": {"queue": "download"},
+    "scraper.tasks.parse_tasks.*": {"queue": "parse"},
+    "scraper.tasks.save_tasks.*": {"queue": "save"},
+}
+
+# ------------------------------------------------------------
+# EXTRA DEBUG: test import of included modules
+# ------------------------------------------------------------
+print(">>> [celery_app] Testing imports for included task modules...")
+for module in celery_app.conf.include:
+    try:
+        __import__(module)
+        print(f">>> [celery_app] OK import {module}")
+    except Exception as e:
+        print(f">>> [celery_app] FAILED import {module}: {e}")
+
+print(">>> [celery_app] Celery initialization complete.")
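
For a quick sanity check of the Celery app above, one can import it from the project root and ping for live workers; a minimal sketch (assumed usage, not part of this commit):

# Minimal sketch: confirm which broker/backend celery_app resolved and ping workers.
from celery_app import celery_app, BROKER, BACKEND

print("broker:", BROKER, "backend:", BACKEND)
print(celery_app.control.ping(timeout=2.0))  # [] until at least one worker is running
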
@@ -1,87 +1,152 @@
-version: "3.9"
-
 services:
+  # ----------------------------------------------------------
+  # Redis broker & backend
+  # ----------------------------------------------------------
+  redis:
+    image: redis:7
+    container_name: bookscraper_redis
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 2s
+      timeout: 2s
+      retries: 20
+    restart: "no"
 
-  # -------------------------
-  # WEB UI
-  # -------------------------
+  # ----------------------------------------------------------
+  # Controller Worker
+  # ----------------------------------------------------------
+  worker_controller:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.scraper
+    container_name: worker_controller
+    command: celery -A celery_app worker -Q controller -n controller@%h -l INFO
+    depends_on:
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      - .:/app
+      - /Users/peter/Desktop/books:/app/output
+    restart: "no"
+
+  # ----------------------------------------------------------
+  # Web GUI
+  # ----------------------------------------------------------
   web:
     build:
       context: .
-      dockerfile: Dockerfile
-    container_name: bookscraper
-    ports:
-      - "5050:5000"
-
+      dockerfile: docker/Dockerfile.web
+    container_name: bookscraper_web
     volumes:
       - .:/app
       - /Users/peter/Desktop/books:/app/output
+    depends_on:
+      redis:
+        condition: service_healthy
+    ports:
+      - "5011:5000"
+    environment:
+      - REDIS_BROKER=redis://redis:6379/0
+      - REDIS_BACKEND=redis://redis:6379/1
     env_file:
       - .env
-    environment:
-      FLASK_ENV: "production"
-    restart: unless-stopped
+    restart: "no"
 
-  # -------------------------
-  # SCRAPING WORKER
-  # (1 concurrency, 1 job at a time)
-  # -------------------------
-  scraper_worker:
+  # ----------------------------------------------------------
+  # Download Worker
+  # ----------------------------------------------------------
+  worker_download:
     build:
       context: .
-      dockerfile: Dockerfile
-    container_name: scraper_worker
-    command: python worker/scrape_worker.py
-
+      dockerfile: docker/Dockerfile.scraper
+    container_name: worker_download
     volumes:
       - .:/app
      - /Users/peter/Desktop/books:/app/output
     depends_on:
-      - redis
+      redis:
+        condition: service_healthy
     env_file:
       - .env
-    restart: unless-stopped
+    command: celery -A celery_app worker -Q download -n download@%h -l INFO
+    restart: "no"
+
+  # ----------------------------------------------------------
+  # Parse Worker
+  # ----------------------------------------------------------
+  worker_parse:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.scraper
+    container_name: worker_parse
+    volumes:
+      - .:/app
+      - /Users/peter/Desktop/books:/app/output
+    depends_on:
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    command: celery -A celery_app worker -Q parse -n parse@%h -l INFO
+    restart: "no"
+
+  # ----------------------------------------------------------
+  # Save Worker
+  # ----------------------------------------------------------
+  worker_save:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.scraper
+    container_name: worker_save
+    volumes:
+      - .:/app
+      - /Users/peter/Desktop/books:/app/output
+    depends_on:
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    command: celery -A celery_app worker -Q save -n save@%h -l INFO
+    restart: "no"
 
-  # -------------------------
-  # AUDIO WORKER
-  # -------------------------
-  audio_worker:
+  # ----------------------------------------------------------
+  # Audio Worker (macOS only)
+  # ----------------------------------------------------------
+  worker_audio:
     build:
       context: .
-      dockerfile: Dockerfile
-    container_name: audio_worker
-    command: python worker/audio_worker.py
-
+      dockerfile: docker/Dockerfile.audio
+    container_name: worker_audio
     volumes:
       - .:/app
       - /Users/peter/Desktop/books:/app/output
     depends_on:
-      - redis
+      redis:
+        condition: service_healthy
     env_file:
       - .env
-    restart: unless-stopped
+    command: celery -A celery_app worker -Q audio -n audio@%h -l INFO
+    restart: "no"
 
-  # -------------------------
-  # REDIS (LOGS & QUEUE)
-  # -------------------------
-  redis:
-    image: redis:alpine
-    container_name: redis
-    ports:
-      - "6379:6379"
-    restart: unless-stopped
+  # ----------------------------------------------------------
+  # Scraping Worker
+  # ----------------------------------------------------------
+  worker_scraping:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.scraper
+    container_name: worker_scraping
+    volumes:
+      - .:/app
+      - /Users/peter/Desktop/books:/app/output
+    depends_on:
+      redis:
+        condition: service_healthy
+    env_file:
+      - .env
+    command: celery -A celery_app worker -Q scraping -n scraping@%h -l INFO
+    restart: "no"
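
Every worker service above reads its Redis settings from .env, while the web service also sets them inline. The environment variables this commit actually reads, with the defaults or compose values that appear in the diff, summarized as an illustrative Python sketch (the real .env file is not part of the commit):

# Variables referenced in this commit; values are the in-diff defaults or compose settings.
import os

os.environ.setdefault("REDIS_BROKER", "redis://redis:6379/0")   # set inline for the web service
os.environ.setdefault("REDIS_BACKEND", "redis://redis:6379/1")  # set inline for the web service
os.environ.setdefault("BOOKSCRAPER_OUTPUT_DIR", "output")       # default in DownloadController
os.environ.setdefault("MAX_VOL_SIZE", "200")                    # default in DownloadController
os.environ.setdefault("DRY_RUN", "0")                           # default in start_scrape_book
os.environ.setdefault("TEST_LIMIT", "5")                        # default in start_scrape_book
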
@@ -1,17 +1,9 @@
-# docker/Dockerfile.audio
-FROM python:3.11-slim
+FROM python:3.12-slim
 
 WORKDIR /app
 
-# Install audio processing dependencies (extend later)
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg \
-    libavcodec-extra \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+COPY requirements.audio.txt /app/requirements.audio.txt
+RUN pip install --no-cache-dir -r /app/requirements.audio.txt
 
-COPY . .
+COPY . /app
 
-CMD ["python", "worker/audio_worker.py"]
+CMD ["python3", "-c", "print('audio worker ready')"]
@@ -1,17 +1,15 @@
-# docker/Dockerfile.scraper
-FROM python:3.11-slim
+FROM python:3.12-slim
 
 WORKDIR /app
 
-# Scraper-specific system dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libjpeg62-turbo-dev \
-    zlib1g-dev \
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libxml2-dev \
+    libxslt1-dev \
     && rm -rf /var/lib/apt/lists/*
 
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+COPY requirements.scraper.txt /app/requirements.scraper.txt
+RUN pip install --no-cache-dir -r /app/requirements.scraper.txt
 
-COPY . .
+COPY . /app
 
-CMD ["python", "worker/scrape_worker.py"]
+CMD ["python3", "-c", "print('scraper worker ready')"]
@@ -1,16 +1,17 @@
-# docker/Dockerfile.web
 FROM python:3.11-slim
 
 WORKDIR /app
 
-# Install only Python deps
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Copy full requirements for both Flask + Celery + BookScraper
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
 
-# Copy the entire app
-COPY . .
+# Copy entire application (including .env so load_dotenv works)
+COPY . /app
+
+# Ensure Celery + BookScraper modules load correctly
+ENV PYTHONPATH=/app
 
-# Flask runs on port 5000
 EXPOSE 5000
 
 CMD ["python", "app.py"]
@@ -0,0 +1,7 @@
+requests
+beautifulsoup4
+lxml
+pillow
+redis
+celery[redis]
+python-dotenv
@@ -0,0 +1,7 @@
+requests
+beautifulsoup4
+lxml
+pillow
+redis
+celery[redis]
+python-dotenv
@@ -0,0 +1,8 @@
+flask
+requests
+beautifulsoup4
+lxml
+pillow
+redis
+celery
+python-dotenv
@@ -1,39 +1,72 @@
 # scraper/download_controller.py
 
-from logbus.publisher import log
+from celery import group
 from scraper.tasks.pipeline import build_chapter_pipeline
+from logbus.publisher import log
+import os
 
 
 class DownloadController:
+    """Coordinates parallel chapter pipelines, with optional volume splitting."""
 
-    def __init__(self, url: str):
-        self.url = url
-        self.scraper = None  # filled in by BookScraper
-        self.base_path = None
+    def __init__(self, scrape_result: dict):
+        self.scrape_result = scrape_result
+        self.title = scrape_result.get("title", "UnknownBook")
+        self.chapters = scrape_result.get("chapters", [])
 
-    def start(self):
-        log(f"[DL-CONTROLLER] Parsing metadata for {self.url}")
+        # Base output dir from .env
+        root = os.getenv("BOOKSCRAPER_OUTPUT_DIR", "output")
 
-        # 1) Collect book info
-        scraper = self.scraper = self._init_scraper()
-        scraper.parse_book_info()
+        # Volume size
+        self.max_vol = int(os.getenv("MAX_VOL_SIZE", "200"))
 
-        # determine base_path
-        self.base_path = scraper.get_base_path()
+        # Base directory for the whole book
+        self.book_base = os.path.join(root, self.title)
+        os.makedirs(self.book_base, exist_ok=True)
 
-        # 2) Fetch the chapters
-        chapters = scraper.get_chapter_list()
+        # constant metadata for all chapters
+        self.meta = {
+            "title": self.scrape_result.get("title"),
+            "author": self.scrape_result.get("author"),
+            "description": self.scrape_result.get("description"),
+        }
 
-        # 3) Start a Celery pipeline per chapter
-        for ch in chapters:
-            log(f"[DL-CONTROLLER] Queue pipeline for chapter {ch.number}")
+    def get_volume_path(self, chapter_num: int) -> str:
+        """Returns the correct volume directory based on chapter number."""
+        vol_index = (chapter_num - 1) // self.max_vol + 1
+        vol_name = f"Volume_{vol_index:03d}"
+        vol_path = os.path.join(self.book_base, vol_name)
+        os.makedirs(vol_path, exist_ok=True)
+        return vol_path
 
-            workflow = build_chapter_pipeline(
-                chapter_number=ch.number,
-                chapter_url=ch.url,
-                base_path=self.base_path
-            )
+    def start(self):
+        log(f"[CTRL] Starting download pipeline for {self.title}")
+        log(f"[CTRL] Chapters: {len(self.chapters)}")
+        log(f"[CTRL] Output root: {self.book_base}")
+        log(f"[CTRL] MAX_VOL_SIZE = {self.max_vol}")
 
-            workflow.delay()  # 🔥 this starts the chain
+        tasks = []
 
-        return {"status": "queued", "chapters": len(chapters)}
+        for ch in self.chapters:
+            chapter_num = ch["num"]
+            chapter_url = ch["url"]
+
+            # compute volume directory
+            vol_path = self.get_volume_path(chapter_num)
+
+            # build the pipeline for this chapter
+            tasks.append(
+                build_chapter_pipeline(
+                    chapter_num,
+                    chapter_url,
+                    vol_path,   # ✔ correct volume path!!
+                    self.meta,  # ✔ pass metadata once
+                )
+            )
+
+        # parallel processing
+        job_group = group(tasks)
+        async_result = job_group.apply_async()
+
+        log("[CTRL] Pipelines launched.")
+        return async_result
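
build_chapter_pipeline is imported from scraper.tasks.pipeline, which is not touched by this commit. Given the task signatures above, it presumably chains download, parse, and save for one chapter; a hypothetical sketch (the save task name and its signature are assumptions):

# Hypothetical sketch of scraper/tasks/pipeline.py (not in this commit).
from celery import chain

from scraper.tasks.download_tasks import download_chapter
from scraper.tasks.parse_tasks import parse_chapter
from scraper.tasks.save_tasks import save_chapter  # assumed name; save_tasks.py is only referenced


def build_chapter_pipeline(chapter_num, chapter_url, vol_path, meta):
    # Each chained task receives the previous task's return value as its first argument.
    return chain(
        download_chapter.s(chapter_num, chapter_url),  # -> {"chapter", "url", "html"}
        parse_chapter.s(meta),                         # -> {"chapter", "url", "text", "length"}
        save_chapter.s(vol_path),                      # assumed to write the text into vol_path
    )

DownloadController.start() collects one such chain per chapter and launches them together with group(tasks).apply_async().
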
@@ -0,0 +1,16 @@
+# scraper/models/book_state.py
+
+
+class Chapter:
+    """
+    Lightweight chapter model used by DownloadController, BookScraper,
+    and Celery pipelines.
+    """
+
+    def __init__(self, number: int, title: str, url: str):
+        self.number = number
+        self.title = title
+        self.url = url
+
+    def __repr__(self):
+        return f"Chapter(number={self.number}, title={self.title}, url={self.url})"
@@ -0,0 +1,21 @@
+# scraper/tasks/controller_tasks.py
+
+from celery_app import celery_app
+from logbus.publisher import log
+from scraper.download_controller import DownloadController
+
+print(">>> [IMPORT] controller_tasks.py loaded")
+
+
+@celery_app.task(bind=True, queue="controller", ignore_result=False)
+def launch_downloads(self, scrape_result: dict):
+    """Start complete download → parse → save pipeline."""
+
+    log("[CTRL] Launching DownloadController...")
+
+    ctl = DownloadController(scrape_result)
+    async_result = ctl.start()
+
+    log("[CTRL] Pipelines dispatched.")
+
+    return {"pipelines_started": len(scrape_result.get("chapters", []))}
@@ -1,33 +1,33 @@
 # scraper/tasks/download_tasks.py
-from celery import shared_task
+from celery_app import celery_app
 from logbus.publisher import log
 import requests
 
+print(">>> [IMPORT] download_tasks.py loaded")
 
-@shared_task(bind=True, queue="download", ignore_result=False)
-def download_chapter(self, chapter_number: int, chapter_url: str):
-    """
-    Download a chapter page and return raw HTML for parsing.
-    Does NOT save anything; that is done by save_tasks.py
-    """
 
-    log(f"[DL] Downloading chapter {chapter_number}: {chapter_url}")
+@celery_app.task(bind=True, queue="download", ignore_result=False)
+def download_chapter(self, chapter_num: int, chapter_url: str):
+    log(f"[DL] Downloading chapter {chapter_num}: {chapter_url}")
 
     try:
-        resp = requests.get(chapter_url, timeout=15)
+        resp = requests.get(
+            chapter_url,
+            headers={"User-Agent": "Mozilla/5.0"},
+            timeout=20,
+        )
         resp.raise_for_status()
-        html = resp.text
 
-        log(f"[DL] OK {chapter_number}: {len(html)} bytes")
+        resp.encoding = resp.apparent_encoding or "gb2312"
+        html = resp.text
+        log(f"[DL] OK {chapter_num}: {len(html)} bytes")
 
-        # This result is passed on to parse_task
         return {
-            "chapter": chapter_number,
+            "chapter": chapter_num,
             "url": chapter_url,
             "html": html,
         }
 
     except Exception as exc:
-        log(f"[DL] ERROR downloading {chapter_url}: {exc}")
+        log(f"[DL] ERROR {chapter_url}: {exc}")
         raise
@@ -1,57 +1,79 @@
 # scraper/tasks/parse_tasks.py
 
-from celery import shared_task
+from celery_app import celery_app
 from logbus.publisher import log
-from scraper.utils import clean_text
 from bs4 import BeautifulSoup
+from scraper.utils import clean_text, load_replacements
+
+print(">>> [IMPORT] parse_tasks.py loaded")
 
 
-@shared_task(bind=True, queue="parse", ignore_result=False)
-def parse_chapter(self, html: str, chapter_url: str):
+@celery_app.task(bind=True, queue="parse", ignore_result=False)
+def parse_chapter(self, download_result: dict, meta: dict):
     """
-    Parse downloaded chapter HTML into clean text.
-    Returns a dict:
+    download_result:
     {
-        "url": chapter_url,
-        "text": "...parsed text..."
+        "chapter": int,
+        "url": str,
+        "html": str
     }
+
+    meta:
+    {
+        "title": str,
+        "author": str,
+        "description": str
+    }
     """
-    try:
-        log(f"[PARSE] Start parsing: {chapter_url}")
+    chapter_num = download_result["chapter"]
+    url = download_result["url"]
+    html = download_result["html"]
 
-        soup = BeautifulSoup(html, "html.parser")
+    log(f"[PARSE] Parsing chapter {chapter_num}")
 
-        # Many Chinese sites use these kinds of containers:
-        possible_blocks = [
-            "#content",
-            ".content",
-            "div#content",
-            "div.content",
-            "div#chaptercontent",
-            "#chapterContent"
-        ]
+    soup = BeautifulSoup(html, "lxml")
 
-        node = None
-        for sel in possible_blocks:
-            r = soup.select_one(sel)
-            if r:
-                node = r
-                break
+    selectors = [
+        "#content",
+        ".content",
+        "div#content",
+        "div.content",
+        "div#chaptercontent",
+        "#chapterContent",
+        ".read-content",
+    ]
 
-        if not node:
-            log(
-                f"[PARSE] WARNING: no known content block found in {chapter_url}")
-            text = clean_text(soup.get_text())
-        else:
-            text = clean_text(node.get_text())
+    node = None
+    for sel in selectors:
+        tmp = soup.select_one(sel)
+        if tmp:
+            node = tmp
+            break
 
-        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")
+    raw = node.get_text() if node else soup.get_text()
 
-        return {
-            "url": chapter_url,
-            "text": text,
-        }
+    # replacements
+    REPL = load_replacements()
+    text = clean_text(raw, REPL)
 
-    except Exception as exc:
-        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
-        raise
+    # ---------------------------------------------------
+    # HEADER ONLY FOR CHAPTER 1
+    # ---------------------------------------------------
+    if chapter_num == 1:
+        header = (
+            f"{meta.get('title','')}\n"
+            f"Author: {meta.get('author','')}\n"
+            f"Description:\n{meta.get('description','')}\n"
+            f"URL: {url}\n" + "-" * 50 + "\n\n"
+        )
+        text = header + text
+
+    log(f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
+
+    return {
+        "chapter": chapter_num,
+        "url": url,
+        "text": text,
+        "length": len(text),
+    }
@@ -1,35 +1,52 @@
-from celery import shared_task
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
+# scraper/tasks/scraping.py
+#
+from celery_app import celery_app
 from logbus.publisher import log
+import os
 
+from scraper.sites import BookSite
+from scraper.book_scraper import BookScraper
+from scraper.tasks.controller_tasks import launch_downloads
 
-@shared_task(bind=True, queue="scraping")
-def scrape_book(self, url):
-    """
-    HIGH-LEVEL SCRAPER TASK
-    Calls the synchronous BookScraper for a full scrape.
-    """
-    log(f"[SCRAPER] Start scrape: {url}")
+print(">>> [IMPORT] scraping.py loaded")
 
-    scraper = BookScraper(BookSite(), url)
-    result = scraper.execute()
 
-    log(f"[SCRAPER] Finished scrape: {url}")
-    return {"title": result["title"]}
+@celery_app.task(bind=True, queue="scraping", ignore_result=False)
+def start_scrape_book(self, url: str):
+    """Scrapes metadata + chapter list."""
 
+    log(f"[SCRAPING] Start scraping for: {url}")
 
-@shared_task(bind=True, queue="download", max_retries=5)
-def download_chapter_task(self, number, title, url, output_base):
-    """
-    Download a single chapter only.
-    download_worker.py ultimately executes this.
-    """
-    from worker.download_worker import download_single_chapter
+    site = BookSite()
+    scraper = BookScraper(site, url)
+    scraper.parse_book_info()
 
-    try:
-        return download_single_chapter(number, title, url, output_base)
-    except Exception as e:
-        log(f"[DOWNLOAD] Error while downloading chapter {number}: {e}")
-        raise self.retry(countdown=3)
+    chapters = scraper.get_chapter_list()
+    full_count = len(chapters)
+
+    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
+    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
+
+    if DRY_RUN:
+        log(f"[SCRAPING] DRY_RUN: limiting chapters to first {TEST_LIMIT}")
+        chapters = chapters[:TEST_LIMIT]
+
+    result = {
+        "title": scraper.book_title,
+        "author": scraper.book_author,
+        "description": scraper.book_description,
+        "cover": scraper.cover_url,
+        "chapters": [
+            {"num": ch.number, "title": ch.title, "url": ch.url} for ch in chapters
+        ],
+    }
+
+    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")
+
+    celery_app.send_task(
+        "scraper.tasks.controller_tasks.launch_downloads",
+        args=[result],
+        queue="controller",
+    )
+
+    return result
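
End to end, a producer such as the Flask app only needs to enqueue start_scrape_book; the controller and the per-chapter pipelines are dispatched from inside the task via send_task. A minimal producer sketch (assumed usage; the URL is a placeholder, not from this commit):

from celery_app import celery_app

res = celery_app.send_task(
    "scraper.tasks.scraping.start_scrape_book",
    args=["https://example.com/book/1"],  # placeholder URL
    queue="scraping",
)
meta = res.get(timeout=600)  # the metadata dict built by start_scrape_book
print(meta["title"], len(meta["chapters"]), "chapters")
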
@@ -0,0 +1,57 @@
+# scraper/utils.py
+
+import re
+import os
+from pathlib import Path
+
+
+# ------------------------------------------------------------
+# Load replacements from text_replacements.txt (optional file)
+# ------------------------------------------------------------
+def load_replacements(filepath="text_replacements.txt") -> dict:
+    """
+    Load key=value style replacements.
+    Empty or missing file → return {}.
+    """
+    path = Path(filepath)
+
+    if not path.exists():
+        return {}
+
+    repl = {}
+
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if "=" in line:
+                key, val = line.split("=", 1)
+                repl[key.strip()] = val.strip()
+
+    return repl
+
+
+# ------------------------------------------------------------
+# Clean extracted HTML text
+# ------------------------------------------------------------
+def clean_text(raw: str, repl_dict: dict = None) -> str:
+    """
+    Normalizes whitespace, removes junk, and applies replacements.
+    repl_dict is optional → falls back to {}.
+    """
+    if repl_dict is None:
+        repl_dict = {}
+
+    txt = raw
+
+    # Normalize CRLF
+    txt = txt.replace("\r", "")
+
+    # Collapse multiple blank lines
+    txt = re.sub(r"\n{3,}", "\n\n", txt)
+
+    # Apply replacements
+    for key, val in repl_dict.items():
+        txt = txt.replace(key, val)
+
+    # Strip excessive whitespace at edges
+    return txt.strip()
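
The replacement file that load_replacements() reads is plain key=value lines. A small illustration of clean_text() from the file added above (example data; assumes the module is importable as scraper.utils, as its header comment states):

from scraper.utils import clean_text

raw = "  foo \r\n\n\n\n bar  "
print(repr(clean_text(raw, {"foo": "FOO"})))
# -> 'FOO \n\n bar'  (carriage returns dropped, blank-line runs collapsed, edges stripped)
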
@@ -1,22 +1,36 @@
-import os
-
-
 # scraper/utils.py
+from pathlib import Path
 
 
-def load_replacements(path):
-    repl = {}
-    if not path or not os.path.exists(path):
-        return repl
+def load_replacements(path="text_replacements.txt") -> dict:
+    """
+    Load key=value replacements from a simple text file.
+    Lines beginning with # are ignored.
+    """
+    fp = Path(path)
+    if not fp.exists():
+        return {}
 
-    with open(path, encoding="utf-8") as f:
-        for line in f:
-            if "=>" in line:
-                k, v = line.strip().split("=>", 1)
-                repl[k.strip()] = v.strip()
+    repl = {}
+    for line in fp.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+
+        if "=" in line:
+            k, v = line.split("=", 1)
+            repl[k.strip()] = v.strip()
+
     return repl
 
 
-def clean_text(text, repl_dict):
-    for src, tgt in repl_dict.items():
-        text = text.replace(src, tgt)
-    return text
+def clean_text(raw: str, repl_dict: dict) -> str:
+    """
+    Cleans text using user-defined replacements.
+    """
+    txt = raw
+
+    for k, v in repl_dict.items():
+        txt = txt.replace(k, v)
+
+    return txt.strip()
@@ -1,63 +1,85 @@
 <!DOCTYPE html>
 <html lang="nl">
 <head>
-    <meta charset="UTF-8">
-    <title>Scrape Resultaat</title>
+    <meta charset="UTF-8" />
+    <title>Scrape & Download Resultaat</title>
     <style>
-        body { font-family: Arial, sans-serif; padding: 40px; max-width: 900px; margin: auto; }
-        h1 { margin-bottom: 10px; }
-        .error { padding: 15px; background: #ffdddd; border-left: 5px solid #ff4444; margin-bottom: 20px; }
-        .box { padding: 15px; background: #f7f7f7; border: 1px solid #ddd; margin-bottom: 20px; border-radius: 6px; }
-        a { color: #007bff; text-decoration: none; }
-        a:hover { text-decoration: underline; }
-        pre { background: #222; color: #eee; padding: 10px; border-radius: 6px; overflow-x: auto; }
-        small { color: #555; }
+      body {
+        font-family: Arial, sans-serif;
+        padding: 40px;
+        max-width: 900px;
+        margin: auto;
+      }
+      h1 {
+        margin-bottom: 10px;
+      }
+      .error {
+        padding: 15px;
+        background: #ffdddd;
+        border-left: 5px solid #ff4444;
+        margin-bottom: 20px;
+      }
+      .box {
+        padding: 15px;
+        background: #f7f7f7;
+        border: 1px solid #ddd;
+        margin-bottom: 20px;
+        border-radius: 6px;
+      }
+      a {
+        color: #007bff;
+        text-decoration: none;
+      }
+      a:hover {
+        text-decoration: underline;
+      }
     </style>
 </head>
 <body>
+    <a href="/">← Terug</a>
 
-<a href="/">← Terug</a>
+    {% if error %}
+    <div class="error"><strong>Fout:</strong><br />{{ error }}</div>
+    {% endif %}
 
-{% if error %}
-<div class="error">
-    <strong>Fout:</strong><br>{{ error }}
-</div>
-{% endif %}
+    <h1>Scrape Resultaat</h1>
 
-<h1>Scrape resultaat</h1>
+    {% if book %}
+    <div class="box">
+      <strong>Titel:</strong> {{ book.title }}<br />
+      <strong>Auteur:</strong> {{ book.author }}<br />
+    </div>
 
-{% if result %}
-<div class="box">
-    <strong>Titel:</strong> {{ result.title }}<br>
-    <strong>Auteur:</strong> {{ result.author }}<br>
-</div>
+    {% if book.description %}
+    <div class="box">
+      <strong>Beschrijving:</strong><br />
+      <p>{{ book.description }}</p>
+    </div>
+    {% endif %}
 
-{% if result.description %}
-<div class="box">
-    <strong>Beschrijving:</strong><br>
-    <p>{{ result.description }}</p>
-</div>
-{% endif %}
+    <div class="box">
+      <strong>Aantal chapters:</strong> {{ book.chapters|length }}
+    </div>
 
-<div class="box">
-    <strong>Aantal chapters:</strong> {{ result.chapters|length }}
-</div>
-
-{% if result.chapters %}
-<div class="box">
-    <strong>Chapters:</strong><br><br>
-    <ul>
-    {% for ch in result.chapters %}
-        <li>
-            <a href="{{ ch.url }}" target="_blank">
-                Chapter {{ ch.number }} — {{ ch.title }}
-            </a>
-        </li>
-    {% endfor %}
-    </ul>
-</div>
-{% endif %}
-{% endif %}
-
-</body>
-</html>
+    {% if book.chapters %}
+    <div class="box">
+      <strong>Chapters:</strong><br /><br />
+      <ul>
+        {% for ch in book.chapters %}
+        <li>
+          <a href="{{ ch.url }}" target="_blank">
+            Chapter {{ ch.num }} — {{ ch.title }}
+          </a>
+        </li>
+        {% endfor %}
+      </ul>
+    </div>
+    {% endif %} {% if download_job_id %}
+    <div class="box">
+      <strong>Download pipeline gestart!</strong><br />
+      Job ID: <code>{{ download_job_id }}</code>
+    </div>
+    {% endif %} {% endif %}
+  </body>
+</html>