parent
158cb63d54
commit
3ed85d08e3
@ -0,0 +1,20 @@
|
|||||||
|
FROM python:3.11-slim

# Write Python output straight to the container log (no stdout buffering) and
# do not write .pyc files.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Pillow dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        libjpeg62-turbo-dev \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies before copying the source so this layer stays
# cached until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the full application (as it currently is).
# NOTE(review): this also copies .env and output/ into the image unless a
# .dockerignore excludes them - confirm one exists.
COPY . .

# Expose Flask port
EXPOSE 5000

# Use the project's own app.py as the entrypoint
CMD ["python", "app.py"]
|
||||||
@ -0,0 +1,125 @@
|
|||||||
|
# 📚 BookScraper — Web UI + Docker + Live Log Streaming
|
||||||
|
|
||||||
|
BookScraper is een moderne, volledig geautomatiseerde scraper voor Chinese webnovels
|
||||||
|
zoals **Piaotian / Piaotia**.
|
||||||
|
Het project combineert een krachtige scraping-engine met een prettige webinterface.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 🔍 Wat doet dit project?
|
||||||
|
|
||||||
|
BookScraper bestaat uit vier belangrijke onderdelen:
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 🧠 BookScraper Engine (Python)
|
||||||
|
|
||||||
|
Dit is de kern van het project.
|
||||||
|
De engine:
|
||||||
|
|
||||||
|
- Leest basisinformatie van een boek (titel, auteur, cover)
|
||||||
|
- Zoekt alle chapter-links
|
||||||
|
- Downloadt elk chapter met:
|
||||||
|
- **Retry systeem**
|
||||||
|
- **Anti-429 backoff**
|
||||||
|
(wacht: `backoff * attempt + 1 seconde`)
|
||||||
|
- Detectie van lege chapters → automatisch opnieuw proberen
|
||||||
|
- Past tekstreplacements toe (via `replacements.txt`)
|
||||||
|
- Slaat chapters geordend op
|
||||||
|
- Splitst lange boeken automatisch in volumes (`v1/`, `v2/`, `v3/`…)
|
||||||
|
|
||||||
|
De engine is **bestand tegen rate limiting** van Piaotian en soortgelijke sites
|
||||||
|
en werkt met een **throttle (MAX_DOWNLOADS_PER_SEC)** om blokkades te voorkomen.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 🌐 Flask Webinterface (UI)
|
||||||
|
|
||||||
|
De webinterface biedt:
|
||||||
|
|
||||||
|
- Een invoerveld voor de boek-URL
|
||||||
|
- Een knop: **Run Scraper**
|
||||||
|
- Live feedback via **server-sent events (SSE)**
|
||||||
|
|
||||||
|
Tijdens het scrapen zie je realtime updates verschijnen, zoals:
|
||||||
|
|
||||||
|
```text
[DEBUG] GET chapter 1123
[DEBUG] HTTP 429 → retry sleep 4.0s
[DEBUG] Saved chapter: output/xxx/01123_章名.txt
```
|
||||||
|
|
||||||
|
|
||||||
|
Hierdoor voelt het alsof de scraper “live” aan het werk is.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 📡 Live Logging (SSE)
|
||||||
|
|
||||||
|
De Logger vangt alle BookScraper-meldingen op en streamt ze
|
||||||
|
via `/stream` naar de webinterface.
|
||||||
|
Dit maakt het ideaal om scraping in de gaten te houden zonder console.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. 🔧 Configuratie via `.env`
|
||||||
|
|
||||||
|
Om het project flexibel te houden wordt alles ingesteld via `.env`:
|
||||||
|
|
||||||
|
- Throttle (`MAX_DOWNLOADS_PER_SEC`)
|
||||||
|
- Debugmode (`FLASK_DEBUG`)
|
||||||
|
- DRY_RUN (alleen eerste chapters)
|
||||||
|
- Volume size
|
||||||
|
- Host & Port
|
||||||
|
|
||||||
|
De `.env` wordt automatisch geladen door Docker Compose en door Flask.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 📦 Projectstructuur
|
||||||
|
|
||||||
|
```text
bookscraper/
│
├── scraper/
│   ├── book_scraper.py   # De scraper engine
│   ├── logger.py         # SSE logger
│   ├── sites.py          # Site configuratie (selectors etc.)
│   ├── utils.py          # Helpers
│   └── ...
│
├── templates/
│   └── index.html        # UI
│
├── output/               # Book results
│
├── app.py                # Flask webserver + endpoints
├── replacements.txt      # Tekstvervangers
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
└── .env
```
|
||||||
|
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# ▶️ Project handmatig starten (ZONDER Docker)
|
||||||
|
|
||||||
|
Zorg dat dependencies geïnstalleerd zijn:
|
||||||
|
|
||||||
|
```bash
pip install -r requirements.txt
```

Start de Flask server:

```bash
python app.py
```
|
||||||
|
|
||||||
|
Open daarna:
|
||||||
|
👉 http://localhost:5000
|
||||||
|
|
||||||
|
# Docker Build (zonder compose)
|
||||||
|
|
||||||
|
Manueel builden:
|
||||||
|
|
||||||
|
```bash
docker build -t bookscraper .
docker run -p 5000:5000 --env-file .env bookscraper
```

Of met een volume, zodat de output buiten de container bewaard blijft:

```bash
docker run \
  -p 5000:5000 \
  --env-file .env \
  -v "$(pwd)/output:/app/output" \
  bookscraper
```
|
||||||
@ -1,53 +1,71 @@
|
|||||||
from flask import Flask, request, render_template_string
|
# app.py
|
||||||
from scraper.book_scraper import BookScraper
|
from flask import Flask, request, Response, render_template
|
||||||
from scraper.sites import BookSite
|
import time
|
||||||
import sys
|
import queue
|
||||||
import os
|
import os
|
||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
from scraper.book_scraper import BookScraper
|
||||||
|
from scraper.sites import BookSite
|
||||||
|
from scraper.logger import add_listener, remove_listener, LOG_BUFFER
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
# --- GET: toon formulier ---
|
@app.route("/")
def index():
    """Serve the single-page UI rendered from the index.html template."""
    page = render_template("index.html")
    return page
|
||||||
<html>
|
|
||||||
<body>
|
|
||||||
<h2>BookScraper</h2>
|
|
||||||
<form method="post">
|
|
||||||
<label>Book URL:</label><br>
|
|
||||||
<input name="url" style="width:400px"><br>
|
|
||||||
<button type="submit">Scrape</button>
|
|
||||||
</form>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
""")
|
|
||||||
|
|
||||||
|
|
||||||
# --- POST: scraper uitvoeren ---
|
|
||||||
@app.route("/", methods=["POST"])
|
|
||||||
def run_scraper():
|
|
||||||
url = request.form.get("url")
|
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
# RUN SCRAPER
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
|
||||||
|
@app.route("/run", methods=["POST"])
def run_scraper():
    """Run the scraper for the URL posted as JSON and return its result.

    Expects a JSON body of the form {"url": "<book url>"}.

    Returns a JSON object with the scraped "title" and the current contents
    of LOG_BUFFER under "buffer". Responds with HTTP 400 when the body is not
    a JSON object or carries no usable "url".
    """
    # silent=True returns None instead of raising on a missing or invalid
    # JSON body, so every malformed request ends up on the 400 path below.
    data = request.get_json(silent=True)
    url = data.get("url") if isinstance(data, dict) else None
    if not isinstance(url, str) or not url.strip():
        return {"error": "Missing 'url' in JSON body"}, 400

    site = BookSite()
    scraper = BookScraper(site, url.strip())
    result = scraper.execute()

    return {
        "title": result["title"],
        "buffer": LOG_BUFFER.getvalue()
    }
|
||||||
<h3>Debug output:</h3>
|
|
||||||
<pre style='background:#111;color:#0f0;padding:10px;border-radius:8px'>
|
|
||||||
{{debug}}
|
|
||||||
</pre>
|
|
||||||
<p><a href="/">Terug</a></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
""", title=result["title"], debug=result["debug"])
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
# REALTIME LOG STREAM (SSE)
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
|
||||||
|
@app.route("/stream")
def stream():
    """Stream scraper log lines to the browser as server-sent events (SSE).

    Each connected client gets its own queue; a listener registered through
    add_listener pushes every log line into it. The listener is removed again
    when the response generator is closed.
    """

    def event_stream():
        q = queue.Queue()

        # Push log lines from BookScraper into this client's queue.
        def listener(line):
            q.put(line)

        add_listener(listener)

        try:
            while True:
                try:
                    # Wait with a timeout instead of blocking forever: a
                    # generator blocked in q.get() cannot be closed, so the
                    # listener of a disconnected client would otherwise stay
                    # registered until the next log line arrives.
                    msg = q.get(timeout=15)
                except queue.Empty:
                    # SSE comment line: ignored by the browser, but writing it
                    # lets the server notice a closed connection.
                    yield ": keep-alive\n\n"
                    continue

                # An SSE event ends at the first blank line, so a message that
                # contains line breaks must be sent as one data: field per line.
                text = str(msg).replace("\r\n", "\n").replace("\r", "\n")
                fields = "".join(
                    f"data: {part}\n" for part in text.rstrip("\n").split("\n")
                )
                yield fields + "\n"
        finally:
            # Runs on normal exit and when the generator is closed
            # (GeneratorExit) after the client disconnects.
            remove_listener(listener)

    return Response(event_stream(), mimetype="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Runtime settings are read from the environment: FLASK_DEBUG must be
    # exactly "1" to enable debug mode, HOST defaults to all interfaces and
    # PORT to 5000.
    app.run(
        debug=os.getenv("FLASK_DEBUG", "0") == "1",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "5000")),
    )
|
||||||
|
|||||||
@ -0,0 +1,25 @@
|
|||||||
|
version: "3.9"

services:
  bookscraper:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: bookscraper
    ports:
      - "5050:5000"

    # Mount everything the same way you already work locally
    volumes:
      - .:/app # full project directory
      # Host directory that receives the scraped books. Override it with the
      # BOOKS_DIR variable; the default is the previously hard-coded path.
      - ${BOOKS_DIR:-/Users/peter/Desktop/books}:/app/output

    # The existing .env file is passed into the container
    env_file:
      - .env

    # Make sure Flask does NOT run in debug mode. app.py decides this from
    # FLASK_DEBUG (not FLASK_ENV), so it is pinned here; values under
    # `environment` take precedence over those from env_file.
    environment:
      FLASK_ENV: "production"
      FLASK_DEBUG: "0"

    restart: unless-stopped
|
||||||
|
Before Width: | Height: | Size: 11 KiB |
@ -1,22 +1,40 @@
|
|||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Book Scraper</title>
|
<title>BookScraper</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: Arial; padding:20px; }
|
||||||
|
#log { background:#000; color:#0f0; padding:10px; height:400px; overflow:auto; white-space:pre-wrap; }
|
||||||
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|
||||||
<h1>Book Scraper</h1>
|
<h1>BookScraper</h1>
|
||||||
|
|
||||||
{% if error %}
|
<input id="url" type="text" placeholder="Book URL" style="width:400px">
|
||||||
<p style="color:red">{{ error }}</p>
|
<button onclick="startScrape()">Start</button>
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<form method="post">
|
<h2>Realtime log:</h2>
|
||||||
<label for="book_url">Enter Book URL:</label><br><br>
|
<div id="log"></div>
|
||||||
<input type="text" id="book_url" name="book_url" style="width:400px">
|
|
||||||
<br><br>
|
<script>
|
||||||
<button type="submit">Scrape</button>
|
// Log stream of the current run, kept so the next run can close it.
let activeLogSource = null;

/**
 * Start a scrape for the URL in the #url input.
 *
 * Clears the log panel, opens an SSE connection to /stream that appends every
 * received line to the panel, and POSTs the URL as JSON to /run.
 */
function startScrape() {
    const logDiv = document.getElementById("log");
    logDiv.innerHTML = "";

    // Close the stream of an earlier run; otherwise every click adds another
    // EventSource and each log line is appended once per open stream.
    if (activeLogSource) {
        activeLogSource.close();
    }

    const evtSource = new EventSource("/stream");
    activeLogSource = evtSource;

    function appendLine(text) {
        logDiv.innerText += text + "\n";
        logDiv.scrollTop = logDiv.scrollHeight;
    }

    evtSource.onmessage = function(e) {
        appendLine(e.data);
    };

    fetch("/run", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ url: document.getElementById("url").value })
    }).then(function(res) {
        // Surface server-side failures instead of leaving the log silent.
        if (!res.ok) {
            appendLine("[ERROR] /run returned HTTP " + res.status);
        }
    }).catch(function(err) {
        appendLine("[ERROR] request to /run failed: " + err);
    });
}
|
||||||
|
</script>
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
Loading…
Reference in new issue