parent
158cb63d54
commit
3ed85d08e3
@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim

# Flush Python's stdout/stderr immediately so the scraper's log lines show up
# in `docker logs` as they are produced instead of sitting in a pipe buffer.
ENV PYTHONUNBUFFERED=1

# Pillow dependencies (JPEG and zlib development packages).
RUN apt-get update && apt-get install -y --no-install-recommends \
        libjpeg62-turbo-dev \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install the Python dependencies before copying the sources so this layer
# stays cached until requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the complete application as it currently is.
# NOTE(review): this copies the whole build context; make sure a .dockerignore
# excludes at least .env, .git and output/ so they are not baked into the image.
COPY . .

# Flask port (documentation only; publish it with `-p` or compose `ports`).
EXPOSE 5000

# NOTE(review): no USER is set, so the app runs as root. Switching to a
# non-root user needs matching ownership on the bind-mounted output directory.

# Use the project's own app.py as the entrypoint.
CMD ["python", "app.py"]
|
||||
@ -0,0 +1,125 @@
|
||||
# 📚 BookScraper — Web UI + Docker + Live Log Streaming
|
||||
|
||||
BookScraper is een moderne, volledig geautomatiseerde scraper voor Chinese webnovels
|
||||
zoals **Piaotian / Piaotia**.
|
||||
Het project combineert een krachtige scraping-engine met een prettige webinterface.
|
||||
|
||||
---
|
||||
|
||||
# 🔍 Wat doet dit project?
|
||||
|
||||
BookScraper bestaat uit vier belangrijke onderdelen:
|
||||
|
||||
---
|
||||
|
||||
## 1. 🧠 BookScraper Engine (Python)
|
||||
|
||||
Dit is de kern van het project.
|
||||
De engine:
|
||||
|
||||
- Leest basisinformatie van een boek (titel, auteur, cover)
|
||||
- Zoekt alle chapter-links
|
||||
- Downloadt elk chapter met:
|
||||
- **Retry systeem**
|
||||
- **Anti-429 backoff**
|
||||
(wacht: `backoff * attempt + 1 seconde`)
|
||||
- Detectie van lege chapters → automatisch opnieuw proberen
|
||||
- Past tekstreplacements toe (via `replacements.txt`)
|
||||
- Slaat chapters geordend op
|
||||
- Splits lange boeken automatisch in volumes (`v1/`, `v2/`, `v3/`…)
|
||||
|
||||
De engine is **bestand tegen rate limiting** van Piaotian en soortgelijke sites
|
||||
en werkt met een **throttle (MAX_DOWNLOADS_PER_SEC)** om blokkades te voorkomen.
|
||||
|
||||
---
|
||||
|
||||
## 2. 🌐 Flask Webinterface (UI)
|
||||
|
||||
De webinterface biedt:
|
||||
|
||||
- Een invoerveld voor de boek-URL
|
||||
- Een knop: **Run Scraper**
|
||||
- Live feedback via **server-sent events (SSE)**
|
||||
|
||||
Tijdens het scrapen zie je realtime updates verschijnen, zoals:
|
||||
|
||||
```text
[DEBUG] GET chapter 1123
[DEBUG] HTTP 429 → retry sleep 4.0s
[DEBUG] Saved chapter: output/xxx/01123_章名.txt
```
|
||||
|
||||
|
||||
Hierdoor voelt het alsof de scraper “live” aan het werk is.
|
||||
|
||||
---
|
||||
|
||||
## 3. 📡 Live Logging (SSE)
|
||||
|
||||
De Logger vangt alle BookScraper-meldingen op en streamt ze
|
||||
via `/stream` naar de webinterface.
|
||||
Dit maakt het ideaal om scraping in de gaten te houden zonder console.
|
||||
|
||||
---
|
||||
|
||||
## 4. 🔧 Configuratie via `.env`
|
||||
|
||||
Om het project flexibel te houden wordt alles ingesteld via `.env`:
|
||||
|
||||
- Throttle (`MAX_DOWNLOADS_PER_SEC`)
|
||||
- Debugmode (`FLASK_DEBUG`)
|
||||
- DRY_RUN (alleen eerste chapters)
|
||||
- Volume size
|
||||
- Host & Port
|
||||
|
||||
De `.env` wordt automatisch geladen door Docker Compose en door Flask.
|
||||
|
||||
---
|
||||
|
||||
# 📦 Projectstructuur
|
||||
|
||||
```text
bookscraper/
│
├── scraper/
│   ├── book_scraper.py      # De scraper engine
│   ├── logger.py            # SSE logger
│   ├── sites.py             # Site configuratie (selectors etc.)
│   ├── utils.py             # Helpers
│   └── ...
│
├── templates/
│   └── index.html           # UI
│
├── output/                  # Book results
│
├── app.py                   # Flask webserver + endpoints
├── replacements.txt         # Tekstvervangers
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
└── .env
```
|
||||
|
||||
|
||||
---
|
||||
|
||||
# ▶️ Project handmatig starten (ZONDER Docker)
|
||||
|
||||
Zorg dat dependencies geïnstalleerd zijn:
|
||||
|
||||
```bash
pip install -r requirements.txt
```

Start de Flask server:

```bash
python app.py
```
|
||||
|
||||
Open daarna:
|
||||
👉 http://localhost:5000
|
||||
|
||||
# 🐳 Docker Build (zonder compose)
|
||||
|
||||
Manueel builden:
|
||||
|
||||
```bash
docker build -t bookscraper .
docker run -p 5000:5000 --env-file .env bookscraper
```

Met de output-map gemount op de host, zodat de gescrapete boeken buiten de container bewaard blijven:

```bash
docker run \
  -p 5000:5000 \
  --env-file .env \
  -v "$(pwd)/output:/app/output" \
  bookscraper
```
|
||||
@ -1,53 +1,71 @@
|
||||
from flask import Flask, request, render_template_string
|
||||
from scraper.book_scraper import BookScraper
|
||||
from scraper.sites import BookSite
|
||||
import sys
|
||||
# app.py
|
||||
from flask import Flask, request, Response, render_template
|
||||
import time
|
||||
import queue
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from scraper.book_scraper import BookScraper
|
||||
from scraper.sites import BookSite
|
||||
from scraper.logger import add_listener, remove_listener, LOG_BUFFER
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
# --- GET: toon formulier ---
|
||||
@app.route("/")
def index():
    """Serve the single-page UI.

    Renders ``templates/index.html``. The page itself starts a scrape by
    POSTing JSON to ``/run`` and reads log lines from ``/stream``, so this
    route only answers GET and needs no form handling of its own.
    """
    return render_template("index.html")
|
||||
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# RUN SCRAPER
|
||||
# ----------------------------------------------------------
|
||||
|
||||
@app.route("/run", methods=["POST"])
def run_scraper():
    """Run one scrape synchronously and return its result as JSON.

    Expects a JSON body of the form ``{"url": "<book url>"}``.

    Returns:
        On success a JSON object with ``title`` (taken from the scraper
        result) and ``buffer`` (the current contents of ``LOG_BUFFER``).
        On a missing or invalid body a JSON ``error`` object with HTTP 400.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so the caller gets the explicit 400 below.
    payload = request.get_json(silent=True)
    url = payload.get("url") if isinstance(payload, dict) else None
    if not isinstance(url, str) or not url.strip():
        return {"error": "Request body must be JSON with a non-empty 'url'"}, 400

    site = BookSite()
    scraper = BookScraper(site, url.strip())
    # Blocks until the scraper is done; progress is reported via /stream.
    result = scraper.execute()

    return {
        "title": result["title"],
        "buffer": LOG_BUFFER.getvalue(),
    }
|
||||
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# REALTIME LOG STREAM (SSE)
|
||||
# ----------------------------------------------------------
|
||||
|
||||
@app.route("/stream")
def stream():
    """Stream log lines to the browser as server-sent events (SSE).

    Each connected client gets its own queue. A listener registered through
    ``add_listener`` pushes every line it receives into that queue
    (presumably one call per log line; confirm in scraper.logger), and the
    generator forwards the lines as SSE ``data:`` events.

    Returns:
        A streaming ``Response`` with mimetype ``text/event-stream``.
    """
    # Seconds to wait for a log line before sending a keep-alive comment.
    heartbeat_seconds = 15

    def event_stream():
        q = queue.Queue()

        # Push log lines from the scraper into this client's queue.
        def listener(line):
            q.put(line)

        add_listener(listener)

        try:
            while True:
                try:
                    msg = q.get(timeout=heartbeat_seconds)
                except queue.Empty:
                    # An SSE comment line: ignored by EventSource, but the
                    # write lets the server notice a disconnected client, so
                    # the listener is removed even when no log line arrives.
                    yield ": keep-alive\n\n"
                    continue

                # A raw newline inside a data field would end the event
                # early, so every line gets its own "data:" field.
                lines = str(msg).splitlines() or [""]
                yield "".join(f"data: {part}\n" for part in lines) + "\n"
        finally:
            # Runs on client disconnect (generator close) as well.
            remove_listener(listener)

    return Response(
        event_stream(),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache"},
    )
|
||||
|
||||
|
||||
# ----------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Runtime settings come from the environment (e.g. .env / Docker Compose).
    debug = os.getenv("FLASK_DEBUG", "0") == "1"  # debug only when exactly "1"
    host = os.getenv("HOST", "0.0.0.0")  # all interfaces, needed in a container
    port = int(os.getenv("PORT", "5000"))

    # threaded=True so the long-running /run request and the /stream SSE
    # connection can be served at the same time.
    app.run(debug=debug, host=host, port=port, threaded=True)
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
# NOTE(review): the top-level `version` key is obsolete in Compose v2; it is
# kept here only for older tooling.
version: "3.9"

services:
  bookscraper:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: bookscraper
    ports:
      # Host port 5050 -> container port 5000 (the default PORT in app.py).
      - "5050:5000"

    # Mount everything the same way it is used locally.
    volumes:
      - .:/app # whole project directory
      # Host directory that receives the scraped books. Override it with
      # BOOKS_DIR; the default keeps the previously hard-coded location.
      - ${BOOKS_DIR:-/Users/peter/Desktop/books}:/app/output

    # The existing .env file is passed into the container.
    env_file:
      - .env

    environment:
      # app.py enables debug mode only when FLASK_DEBUG is exactly "1".
      # Defaults to off; a FLASK_DEBUG value from .env or the shell still wins.
      FLASK_DEBUG: "${FLASK_DEBUG:-0}"
      # NOTE(review): app.py does not read FLASK_ENV; kept for compatibility.
      FLASK_ENV: "production"

    restart: unless-stopped
|
||||
|
Before Width: | Height: | Size: 11 KiB |
@ -1,22 +1,40 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    <title>BookScraper</title>
    <style>
        body { font-family: Arial; padding:20px; }
        #log { background:#000; color:#0f0; padding:10px; height:400px; overflow:auto; white-space:pre-wrap; }
    </style>
</head>
<body>

<h1>BookScraper</h1>

<input id="url" type="text" placeholder="Book URL" style="width:400px">
<button onclick="startScrape()">Start</button>

<h2>Realtime log:</h2>
<div id="log"></div>

<script>
    // The currently open SSE connection, if any. It is kept outside
    // startScrape() so a second click can close it instead of stacking
    // another stream on top (which would print every log line twice).
    let evtSource = null;

    // Append one line to the log panel and keep it scrolled to the bottom.
    function appendLog(line) {
        const logDiv = document.getElementById("log");
        logDiv.appendChild(document.createTextNode(line + "\n"));
        logDiv.scrollTop = logDiv.scrollHeight;
    }

    // Clear the log, (re)open the log stream and start the scraper via /run.
    function startScrape() {
        document.getElementById("log").textContent = "";

        if (evtSource) {
            evtSource.close();
        }
        evtSource = new EventSource("/stream");
        evtSource.onmessage = function(e) {
            appendLog(e.data);
        };

        fetch("/run", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ url: document.getElementById("url").value })
        })
        .then(function(resp) {
            // Surface server-side failures instead of ignoring them.
            if (!resp.ok) {
                appendLog("[ERROR] /run returned HTTP " + resp.status);
            }
        })
        .catch(function(err) {
            appendLog("[ERROR] " + err);
        });
    }
</script>

</body>
</html>
|
||||
|
||||
Loading…
Reference in new issue