Compare commits

...

3 Commits

@ -134,3 +134,11 @@ docker compose up
docker compose down docker compose down
docker compose build docker compose build
docker compose up docker compose up
tar \
  --exclude="__pycache__" \
  --exclude="*/__pycache__/*" \
  --exclude="*.pyc" \
  --exclude=".venv" \
  --exclude="venv" \
  -czvf project.tar.gz .

@ -8,6 +8,9 @@ load_dotenv()
print(">>> [WEB] Importing celery_app …") print(">>> [WEB] Importing celery_app …")
from celery_app import celery_app from celery_app import celery_app
from db.db import init_db
init_db() # ensure DB schema exists before Flask starts
from flask import Flask, render_template, request, jsonify from flask import Flask, render_template, request, jsonify
from scraper.logger import log_debug from scraper.logger import log_debug
@ -125,6 +128,33 @@ def celery_result(task_id):
return jsonify({"ready": False}) return jsonify({"ready": False})
# =====================================================
# API: book status new model
# =====================================================
def getStatus(book_id):
    """Assemble the UI status dict for one book from its Redis state hash.

    Reads the hash at ``book:<book_id>:state`` (written by scraper.progress).
    All counters are normalized defensively: Redis hash values arrive as
    strings (decode_responses=True), and a missing or empty field must read
    as 0 rather than raise ValueError from int("").
    """
    state = r.hgetall(f"book:{book_id}:state")

    def _num(field):
        # Missing, empty, or non-numeric hash value → 0.
        try:
            return int(state.get(field) or 0)
        except (TypeError, ValueError):
            return 0

    dl_total = _num("chapters_total")
    return {
        "book_id": book_id,
        "title": state.get("title") or book_id,
        "status": state.get("status") or "unknown",
        "download_done": _num("chapters_download_done"),
        "download_skipped": _num("chapters_download_skipped"),
        "download_total": dl_total,
        "audio_done": _num("audio_done"),
        # audio total mirrors chapter total — one audio file per chapter
        "audio_total": dl_total,
    }
# ===================================================== # =====================================================
# REDIS BACKEND — BOOK STATE MODEL # REDIS BACKEND — BOOK STATE MODEL
# ===================================================== # =====================================================
@ -132,33 +162,26 @@ REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True) r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
def list_active_books(): def list_active_booksold():
"""Return list of active books from Redis Book State Model.""" """Return list of active books from Redis Book State Model."""
keys = r.keys("book:*:status") keys = r.keys("book:*:state")
books = [] books = []
for key in keys: for key in keys:
book_id = key.split(":")[1] book_id = key.split(":")[1]
status = r.get(f"book:{book_id}:status") or "unknown" print(book_id)
title = r.get(f"book:{book_id}:title") or book_id books.append(getStatus(book_id))
dl_done = int(r.get(f"book:{book_id}:download:done") or 0) return books
dl_total = int(r.get(f"book:{book_id}:download:total") or 0)
au_done = int(r.get(f"book:{book_id}:audio:done") or 0)
au_total = dl_total
books.append(
{
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
)
def list_active_books():
    """Collect a status dict for every book that has a Redis state hash.

    Uses SCAN (non-blocking) rather than KEYS to enumerate
    ``book:<id>:state`` keys.
    """
    return [
        getStatus(state_key.split(":", 2)[1])
        for state_key in r.scan_iter(match="book:*:state", count=1000)
    ]
@ -170,27 +193,10 @@ def api_books():
return jsonify(list_active_books()) return jsonify(list_active_books())
# =====================================================
# API: book status
# =====================================================
@app.route("/api/book/<book_id>/status") @app.route("/api/book/<book_id>/status")
def api_book_status(book_id): def api_book_status(book_id):
status = r.get(f"book:{book_id}:status") or "unknown"
dl_done = int(r.get(f"book:{book_id}:download:done") or 0)
dl_total = int(r.get(f"book:{book_id}:download:total") or 0)
au_done = int(r.get(f"book:{book_id}:audio:done") or 0)
au_total = dl_total
return jsonify( return jsonify(getStatus(book_id))
{
"book_id": book_id,
"status": status,
"download_done": dl_done,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
)
# ===================================================== # =====================================================

@ -5,6 +5,9 @@ from dotenv import load_dotenv
print(">>> [celery_app] Loading .env BEFORE initializing Celery...") print(">>> [celery_app] Loading .env BEFORE initializing Celery...")
load_dotenv() load_dotenv()
from db.db import init_db
init_db() # ensures DB exists for all workers
BROKER = os.getenv("REDIS_BROKER") BROKER = os.getenv("REDIS_BROKER")
BACKEND = os.getenv("REDIS_BACKEND") BACKEND = os.getenv("REDIS_BACKEND")

@ -0,0 +1,119 @@
# ============================================================
# File: db/db.py
# Purpose:
# Raw SQLite engine for BookScraper.
# Provides ONLY low-level DB primitives.
# - Connection management (WAL mode)
# - init_db() schema creation
# - upsert_book() atomic write
# - raw fetch helpers (private)
#
# All business logic belongs in repository.py.
# ============================================================
import os
import sqlite3
from threading import Lock
DB_PATH = os.environ.get("BOOKSCRAPER_DB", "/app/data/books.db")
# Ensure directory exists
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
# Per-process connection cache
_connection_cache = {}
_connection_lock = Lock()
# ------------------------------------------------------------
# Connection handling
# ------------------------------------------------------------
def get_db():
    """Return this process's cached SQLite connection, creating it lazily.

    One connection is kept per pid so forked Celery workers never share a
    connection. The cache membership is re-checked *inside* the lock:
    without that second check, two threads could both pass the outer test,
    and the loser's connection would overwrite the winner's — leaking an
    open connection (classic check-then-act race).
    """
    pid = os.getpid()
    if pid not in _connection_cache:
        with _connection_lock:
            if pid not in _connection_cache:  # re-check under the lock
                conn = sqlite3.connect(DB_PATH, check_same_thread=False)
                conn.row_factory = sqlite3.Row
                enable_wal_mode(conn)
                _connection_cache[pid] = conn
    return _connection_cache[pid]
def enable_wal_mode(conn):
    """Apply journaling pragmas to a freshly created connection.

    NOTE(review): despite the name (and the file header's "WAL mode"
    claim), this sets journal_mode=DELETE, not WAL. Presumably deliberate —
    the database directory is bind-mounted from a NAS in docker-compose,
    and SQLite WAL is unsupported on network filesystems — but confirm;
    otherwise either switch the pragma to WAL or rename this function.
    """
    conn.execute("PRAGMA journal_mode=DELETE;")
    conn.execute("PRAGMA synchronous=NORMAL;")
    conn.commit()
# ------------------------------------------------------------
# Schema creation
# ------------------------------------------------------------
def init_db():
    """Create the ``books`` table if it does not already exist.

    Idempotent — safe to call from every worker and from the web process
    at import time.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS books (
            book_id TEXT PRIMARY KEY,
            title TEXT,
            author TEXT,
            cover_url TEXT,
            cover_path TEXT,
            chapters_total INTEGER,
            status TEXT,
            downloaded INTEGER DEFAULT 0,
            parsed INTEGER DEFAULT 0,
            audio_done INTEGER DEFAULT 0,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            last_update DATETIME
        );
    """
    conn = get_db()
    with conn:  # commits on success
        conn.execute(schema_sql)
# ------------------------------------------------------------
# WRITE OPERATIONS
# ------------------------------------------------------------
def upsert_book(book_id, **fields):
    """
    Raw upsert primitive. Repository layer should call this.

    Inserts a row keyed by *book_id*, or updates the supplied columns on
    conflict; ``last_update`` is always refreshed. Column names from
    ``**fields`` are interpolated into the SQL text, so they are validated
    first — this blocks SQL injection (and catches typos early) if a caller
    ever forwards untrusted keys.
    """
    for name in fields:
        if not name.isidentifier():
            raise ValueError(f"invalid column name: {name!r}")

    conn = get_db()
    columns = ["book_id", *fields]
    values = [book_id, *fields.values()]
    placeholders = ",".join("?" for _ in values)

    assignments = ", ".join(f"{k} = excluded.{k}" for k in fields)
    # With no fields the original emitted "DO UPDATE SET , last_update=…"
    # (a syntax error); still refresh last_update in that case.
    if assignments:
        set_clause = f"{assignments}, last_update = CURRENT_TIMESTAMP"
    else:
        set_clause = "last_update = CURRENT_TIMESTAMP"

    sql = f"""
        INSERT INTO books ({','.join(columns)})
        VALUES ({placeholders})
        ON CONFLICT(book_id)
        DO UPDATE SET {set_clause};
    """
    conn.execute(sql, values)
    conn.commit()
# ------------------------------------------------------------
# RAW READ OPERATIONS (PRIVATE)
# ------------------------------------------------------------
def _raw_get_book(book_id):
    """Fetch one book row as a plain dict, or None when no row matches."""
    cursor = get_db().execute(
        "SELECT * FROM books WHERE book_id = ?;", (book_id,)
    )
    record = cursor.fetchone()
    if record is None:
        return None
    return dict(record)
def _raw_get_all_books():
    """Return every book row as a dict, newest first (by created_at)."""
    rows = get_db().execute(
        "SELECT * FROM books ORDER BY created_at DESC;"
    ).fetchall()
    return list(map(dict, rows))

@ -0,0 +1,97 @@
# ============================================================
# File: db/repository.py
# Purpose:
# High-level BookScraper database interface.
# This is the ONLY module Celery tasks and Flask should use.
#
# Uses low-level primitives from db.db, but exposes
# domain-level operations:
# - fetch_book / fetch_all_books
# - create_or_update_book
# - set_status
# - incrementing counters
# ============================================================
from db.db import (
    get_db,
    upsert_book,
    _raw_get_book,
    _raw_get_all_books,
)
# ------------------------------------------------------------
# FETCH OPERATIONS
# ------------------------------------------------------------
def fetch_book(book_id):
    """Return the book row for *book_id* as a dict, or None if absent."""
    return _raw_get_book(book_id)
def fetch_all_books():
    """Return all books as dicts, ordered newest → oldest (by created_at)."""
    return _raw_get_all_books()
# ------------------------------------------------------------
# BOOK CREATION / METADATA
# ------------------------------------------------------------
def create_or_update_book(
    book_id,
    title=None,
    author=None,
    chapters_total=None,
    cover_url=None,
    cover_path=None,
    status=None,
):
    """Create or update a book row with whichever metadata was supplied.

    Arguments left as None are simply not written, so partial updates
    never clobber existing columns. A call with no metadata is a no-op.
    """
    candidates = {
        "title": title,
        "author": author,
        "chapters_total": chapters_total,
        "cover_url": cover_url,
        "cover_path": cover_path,
        "status": status,
    }
    fields = {col: val for col, val in candidates.items() if val is not None}
    if fields:
        upsert_book(book_id, **fields)
# ------------------------------------------------------------
# STATUS MANAGEMENT
# ------------------------------------------------------------
def set_status(book_id, status):
    """Write the book's status column (creates the row if it is missing)."""
    upsert_book(book_id, status=status)
# ------------------------------------------------------------
# INCREMENTING COUNTERS (atomic)
# ------------------------------------------------------------
def _inc_counter(book_id, column, amount):
    """Atomically add *amount* to one counter column of ``books``.

    A single SQL UPDATE is used so concurrent workers cannot lose
    increments — the previous read-then-upsert sequence was a race
    (two workers could read the same value and both write value+1).
    A missing row is a silent no-op, matching the old early return.
    *column* is always one of our own string literals below, never
    caller input, so interpolating it into the SQL is safe.
    """
    conn = get_db()
    conn.execute(
        f"UPDATE books SET {column} = COALESCE({column}, 0) + ?, "
        "last_update = CURRENT_TIMESTAMP WHERE book_id = ?;",
        (amount, book_id),
    )
    conn.commit()


def inc_downloaded(book_id, amount=1):
    """Increment the downloaded-chapters counter for *book_id*."""
    _inc_counter(book_id, "downloaded", amount)


def inc_parsed(book_id, amount=1):
    """Increment the parsed-chapters counter for *book_id*."""
    _inc_counter(book_id, "parsed", amount)


def inc_audio_done(book_id, amount=1):
    """Increment the finished-audio counter for *book_id*."""
    _inc_counter(book_id, "audio_done", amount)

@ -5,14 +5,15 @@ services:
redis: redis:
image: redis:7 image: redis:7
container_name: bookscraper_redis container_name: bookscraper_redis
command: [ command:
[
"redis-server", "redis-server",
"--save", "--save",
"", # Disable RDB snapshots "",
"--appendonly", "--appendonly",
"no", # Disable AOF "no",
"--stop-writes-on-bgsave-error", "--stop-writes-on-bgsave-error",
"no", # Never block writes "no",
] ]
ports: ports:
- "6379:6379" - "6379:6379"
@ -41,7 +42,8 @@ services:
- PYTHONUNBUFFERED=1 - PYTHONUNBUFFERED=1
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
restart: "no" restart: "no"
# ---------------------------------------------------------- # ----------------------------------------------------------
@ -54,7 +56,8 @@ services:
container_name: bookscraper_web container_name: bookscraper_web
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
@ -77,7 +80,8 @@ services:
container_name: worker_download container_name: worker_download
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
@ -96,7 +100,8 @@ services:
container_name: worker_parse container_name: worker_parse
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
@ -115,7 +120,8 @@ services:
container_name: worker_save container_name: worker_save
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
@ -134,7 +140,8 @@ services:
container_name: worker_scraping container_name: worker_scraping
volumes: volumes:
- .:/app - .:/app
- /Users/peter/Desktop/books:/app/output - /Users/peter/mnt/asustor/Sync/bookscraper/books:/Users/peter/mnt/asustor/Sync/bookscraper/books
- /Users/peter/mnt/asustor/Sync/bookscraper/db:/Users/peter/mnt/asustor/Sync/bookscraper/db
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy

Binary file not shown.

@ -16,6 +16,7 @@ import os
import requests import requests
import shutil import shutil
from scraper.abort import abort_requested # DEBUG allowed from scraper.abort import abort_requested # DEBUG allowed
from db.repository import create_or_update_book
# NEW: Redis State Model (C&U) # NEW: Redis State Model (C&U)
from scraper.progress import ( from scraper.progress import (
@ -133,6 +134,7 @@ class DownloadController:
return return
try: try:
for entry in os.listdir(self.book_base): for entry in os.listdir(self.book_base):
if entry.lower().startswith("volume_"): if entry.lower().startswith("volume_"):
vol_dir = os.path.join(self.book_base, entry) vol_dir = os.path.join(self.book_base, entry)
@ -144,6 +146,29 @@ class DownloadController:
except Exception as e: except Exception as e:
log(f"[CTRL] Cover replication failed: {e}") log(f"[CTRL] Cover replication failed: {e}")
def store_cover_in_static(self):
    """
    Copy the main cover.jpg from book_base into static/covers/<book_id>.jpg.
    This allows the Flask web UI to serve the cover directly.
    """
    source = os.path.join(self.book_base, "cover.jpg")
    if not os.path.exists(source):
        log("[CTRL] No cover.jpg found, cannot store in static/covers")
        return

    # Destination: static/covers/<book_id>.jpg (created on demand)
    covers_dir = os.path.join("static", "covers")
    os.makedirs(covers_dir, exist_ok=True)
    destination = os.path.join(covers_dir, f"{self.book_id}.jpg")

    try:
        shutil.copyfile(source, destination)
        log(f"[CTRL] Cover stored for UI: {destination}")
    except Exception as exc:
        # Best-effort: a missing UI cover must never fail the pipeline.
        log(f"[CTRL] Failed to store cover in static: {exc}")
# --------------------------------------------------------- # ---------------------------------------------------------
# Volume isolation # Volume isolation
# --------------------------------------------------------- # ---------------------------------------------------------
@ -225,7 +250,7 @@ class DownloadController:
# ------------------------------------------------------- # -------------------------------------------------------
self.replicate_cover_to_volumes() self.replicate_cover_to_volumes()
self.store_cover_in_static()
# ------------------------------------------------------- # -------------------------------------------------------
try: try:
generate_all_scripts( generate_all_scripts(

@ -25,7 +25,7 @@ def set_total(book_id: str, total: int):
# ------------------------------------------------------------ # ------------------------------------------------------------
# COUNTERS # COUNTERS legacy
# ------------------------------------------------------------ # ------------------------------------------------------------
def inc_completed(book_id: str): def inc_completed(book_id: str):
r.incr(f"progress:{book_id}:completed") r.incr(f"progress:{book_id}:completed")
@ -96,6 +96,7 @@ def init_book_state(
"status": "scraping", "status": "scraping",
"chapters_total": chapters_total, "chapters_total": chapters_total,
"chapters_done": 0, "chapters_done": 0,
"chapters_download_skipped": 0,
"audio_total": 0, "audio_total": 0,
"audio_done": 0, "audio_done": 0,
"last_update": now, "last_update": now,
@ -120,7 +121,7 @@ def set_last_update(book_id: str):
# ------------------------------------------------------------ # ------------------------------------------------------------
# Chapter counters # Chapter counters new model
# ------------------------------------------------------------ # ------------------------------------------------------------
def set_chapter_total(book_id: str, total: int): def set_chapter_total(book_id: str, total: int):
key = f"book:{book_id}:state" key = f"book:{book_id}:state"
@ -128,9 +129,15 @@ def set_chapter_total(book_id: str, total: int):
set_last_update(book_id) set_last_update(book_id)
def inc_chapter_download_skipped(book_id: str):
    """Bump the skipped-download counter in the book's state hash."""
    r.hincrby(f"book:{book_id}:state", "chapters_download_skipped", 1)
    set_last_update(book_id)
def inc_chapter_done(book_id: str): def inc_chapter_done(book_id: str):
key = f"book:{book_id}:state" key = f"book:{book_id}:state"
r.hincrby(key, "chapters_done", 1) r.hincrby(key, "chapters_download_done", 1)
set_last_update(book_id) set_last_update(book_id)
@ -149,6 +156,12 @@ def inc_audio_done(book_id: str):
set_last_update(book_id) set_last_update(book_id)
def inc_audio_skipped(book_id: str):
    """Bump the skipped-audio counter in the book's state hash."""
    r.hincrby(f"book:{book_id}:state", "audio_skipped", 1)
    set_last_update(book_id)
# ------------------------------------------------------------ # ------------------------------------------------------------
# Skip reasons # Skip reasons
# ------------------------------------------------------------ # ------------------------------------------------------------
@ -171,7 +184,14 @@ def get_state(book_id: str):
state = r.hgetall(key) or {} state = r.hgetall(key) or {}
# Numeric conversions # Numeric conversions
numeric_fields = ["chapters_total", "chapters_done", "audio_total", "audio_done"] numeric_fields = [
"chapters_total",
"chapters_download_done",
"chapters_download_skipped",
"audio_total",
"audio_skipped",
"audio_done",
]
for field in numeric_fields: for field in numeric_fields:
if field in state: if field in state:
try: try:

@ -36,7 +36,8 @@
All rights reserved= All rights reserved=
Copyright= Copyright=
飘天文学= 飘天文学=
=
…=
# --- Piaotia specific --- # --- Piaotia specific ---
请记住本书域名= 请记住本书域名=
请收藏本书= 请收藏本书=
@ -53,7 +54,15 @@ Copyright=
章节出错= 章节出错=
点此举报= 点此举报=
举报原因= 举报原因=
www.piaotia.com=
www.piaotian.com=
www.=
www=
.com=
piaotia=
.net=
piaotian=
www.piaotia.com=
# --- Ads / QR / watermark --- # --- Ads / QR / watermark ---
关注公众号= 关注公众号=
微信扫一扫= 微信扫一扫=
@ -68,10 +77,17 @@ sponsored=
ADVERTISEMENT= ADVERTISEMENT=
Advertisment= Advertisment=
Adblock= Adblock=
bookid=
bookname=
# --- Mode / UI related --- # --- Mode / UI related ---
选择背景颜色= 选择背景颜色=
选择字体大小= 选择字体大小=
繁體中文= 繁體中文=
模式选择= 模式选择=
阅读模式= 阅读模式=
冲榜
求票
诸神学徒
感谢各位书友的支持=
您的支持就是我们最大的动力=
感谢各位书友的支持,您的支持就是我们最大的动力=

@ -35,7 +35,7 @@ def detect_volumes(book_base: str):
except Exception: except Exception:
continue continue
vols.sort() vols.sort()
return [v[0] for v in vols] return vols
# ------------------------------------------------------------ # ------------------------------------------------------------
@ -43,12 +43,12 @@ def detect_volumes(book_base: str):
# ------------------------------------------------------------ # ------------------------------------------------------------
def build_merge_block(title: str, author: str, volumes): def build_merge_block(title: str, author: str, volumes):
lines = [] lines = []
for vol in volumes: for num, dirname in volumes:
line = ( line = (
f'm4b-tool merge --jobs=4 --writer="{author}" ' f'm4b-tool merge --jobs=4 --writer="{author}" '
f'--albumartist="{author}" --album="{title}" ' f'--albumartist="{author}" --album="{title}" '
f'--name="{title}" --output-file="{title}-{vol}.m4b" ' f'--name="{title}" --output-file="{title}-{num}.m4b" '
f'"{vol}" -vvv' f'"{dirname}" -vvv'
) )
lines.append(line) lines.append(line)

@ -8,6 +8,9 @@ import os
import subprocess import subprocess
import time import time
from scraper.progress import inc_audio_done, inc_audio_skipped
# from db.repository import inc_audio_done
from scraper.abort import abort_requested from scraper.abort import abort_requested
from redis import Redis from redis import Redis
from urllib.parse import urlparse from urllib.parse import urlparse
@ -52,6 +55,7 @@ def generate_audio(
# Abort early # Abort early
if abort_requested(book_id, backend_client): if abort_requested(book_id, backend_client):
inc_audio_skipped(book_id)
log(f"[AUDIO] ABORT detected → skip CH{chapter_number}") log(f"[AUDIO] ABORT detected → skip CH{chapter_number}")
return return
@ -132,7 +136,7 @@ def generate_audio(
os.makedirs(base_dir, exist_ok=True) os.makedirs(base_dir, exist_ok=True)
safe_num = f"{chapter_number:04d}" safe_num = f"{chapter_number:04d}"
audio_file = os.path.join(base_dir, f"{safe_num}.m4a") audio_file = os.path.join(base_dir, f"{safe_num}.m4b")
if os.path.exists(audio_file): if os.path.exists(audio_file):
log(f"[AUDIO] Skip CH{chapter_number} → already exists") log(f"[AUDIO] Skip CH{chapter_number} → already exists")
@ -160,6 +164,8 @@ def generate_audio(
# ============================================================ # ============================================================
try: try:
subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT) subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
inc_audio_done(book_id)
log(f"[AUDIO] CH{chapter_number}: Completed") log(f"[AUDIO] CH{chapter_number}: Completed")
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:

@ -13,6 +13,12 @@ from celery_app import celery_app
from scraper.utils import get_save_path from scraper.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started from scraper.abort import abort_requested, chapter_started, mark_chapter_started
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
)
from db.repository import inc_downloaded, set_status
from logbus.publisher import log from logbus.publisher import log
from scraper.ui_log import push_ui from scraper.ui_log import push_ui
@ -111,7 +117,7 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
if abort_requested(book_id) and not chapter_started(book_id, chapter_num): if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)" msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
log_msg(book_id, msg) log_msg(book_id, msg)
inc_chapter_download_skipped(book_id)
return { return {
"book_id": book_id, "book_id": book_id,
"chapter": chapter_dict, "chapter": chapter_dict,
@ -149,7 +155,7 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
wait_for_global_delay() wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY) acquire_global_slot(MAX_CONCURRENCY)
log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}") # log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# ----------------------------------------------------------- # -----------------------------------------------------------
# HTTP DOWNLOAD # HTTP DOWNLOAD
@ -207,4 +213,4 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
finally: finally:
set_global_delay() set_global_delay()
release_global_slot() release_global_slot()
log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}") # log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")

@ -11,9 +11,85 @@ from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_all_replacements from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger from scraper.tasks.download_tasks import log_msg # unified logger
from bs4 import NavigableString, Comment
print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)") print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
def extract_piaotia_content(soup):
    """
    Extract clean chapter content from Piaotia pages.

    Layout assumption (Piaotia-specific): the chapter body sits between
    the first <table> following <h1> and the nav/ads/footer/copyright
    blocks. Everything between those anchors is collected; scripts,
    styles and <center> ad containers are skipped.

    Returns the joined text, or None when the page doesn't match this
    layout (no <h1>, or no table after it) so callers can fall back.
    """
    h1 = soup.find("h1")
    if not h1:
        return None

    # -------- Find first table after <h1> --------
    table = None
    for sib in h1.next_siblings:
        if getattr(sib, "name", None) == "table":
            table = sib
            break
    if not table:
        return None

    parts = []

    # -------- Iterate after table --------
    for sib in table.next_siblings:
        name = getattr(sib, "name", None)
        text = sib.get_text(strip=True) if hasattr(sib, "get_text") else None

        # === STOP CONDITIONS ===
        # Ad-pagination markers like <!-- 翻页上AD开始 -->
        if isinstance(sib, Comment) and ("翻页" in sib):
            break

        # Explicit footer/comment container ids
        if name == "div":
            if sib.get("id", "") in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
                break

        # Copyright block — strongest end-of-content indicator
        if text and ("重要声明" in text or "Copyright" in text):
            break

        # Navigation or recommended-reading ("推荐阅读") links
        if text and text.startswith(("推荐阅读", "目录", "目 录")):
            break

        # Skip scripts/styles and JS-only <center> ad containers
        if name in ("script", "style"):
            continue
        if name == "center":
            continue

        # === ACCUMULATE TEXT ===
        if isinstance(sib, NavigableString):
            s = sib.strip()
            if s:
                parts.append(s)
        elif hasattr(sib, "get_text"):
            t = sib.get_text(separator="\n").strip()
            if t:
                parts.append(t)

    return "\n".join(parts).strip()
@celery_app.task(bind=True, queue="parse", ignore_result=False) @celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict): def parse_chapter(self, download_result: dict):
""" """
@ -63,32 +139,38 @@ def parse_chapter(self, download_result: dict):
node = tmp node = tmp
break break
# ------------------------------------------------------------
# PIAOTIA FALLBACK:
# Extract content between <H1> and the "bottomlink" block.
# ------------------------------------------------------------
raw = None raw = None
# --- STRICT SELECTOR FAILED → Try Piaotia extractor ---
if node is None: if node is None:
h1 = soup.find("h1") raw = extract_piaotia_content(soup)
if h1:
content_parts = [] # # ------------------------------------------------------------
for sib in h1.next_siblings: # # PIAOTIA FALLBACK:
# # Extract content between <H1> and the "bottomlink" block.
sib_class = getattr(sib, "get", lambda *_: None)("class") # # ------------------------------------------------------------
if sib_class and ( # raw = None
"bottomlink" in sib_class or sib_class == "bottomlink" # if node is None:
): # h1 = soup.find("h1")
break # if h1:
# content_parts = []
if getattr(sib, "name", None) in ["script", "style", "center"]: # for sib in h1.next_siblings:
continue
# sib_class = getattr(sib, "get", lambda *_: None)("class")
if hasattr(sib, "get_text"): # if sib_class and (
content_parts.append(sib.get_text(separator="\n")) # "bottomlink" in sib_class or sib_class == "bottomlink"
else: # ):
content_parts.append(str(sib)) # break
raw = "\n".join(content_parts) # if getattr(sib, "name", None) in ["script", "style", "center"]:
# continue
# if hasattr(sib, "get_text"):
# content_parts.append(sib.get_text(separator="\n"))
# else:
# content_parts.append(str(sib))
# raw = "\n".join(content_parts)
# ------------------------------------------------------------ # ------------------------------------------------------------
# FINAL FALLBACK # FINAL FALLBACK

@ -8,12 +8,12 @@ print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task from celery import shared_task
import os import os
from scraper.utils import get_save_path from scraper.utils import get_save_path
from scraper.tasks.download_tasks import log_msg # unified logger from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.progress import ( from scraper.progress import (
inc_completed, inc_completed,
inc_skipped, inc_chapter_done,
inc_chapter_download_skipped,
) )
from scraper.tasks.audio_tasks import generate_audio from scraper.tasks.audio_tasks import generate_audio
@ -54,7 +54,7 @@ def save_chapter(self, parsed: dict):
path = parsed.get("path", None) path = parsed.get("path", None)
log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num}{path}") log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num}{path}")
inc_skipped(book_id) inc_chapter_download_skipped(book_id)
volume_name = os.path.basename(volume_path.rstrip("/")) volume_name = os.path.basename(volume_path.rstrip("/"))
@ -103,6 +103,7 @@ def save_chapter(self, parsed: dict):
f.write(text) f.write(text)
log_msg(book_id, f"[SAVE] Saved chapter {chapter_num}{path}") log_msg(book_id, f"[SAVE] Saved chapter {chapter_num}{path}")
inc_chapter_done(book_id)
inc_completed(book_id) inc_completed(book_id)
# Determine volume name # Determine volume name

@ -0,0 +1,38 @@
#!/bin/bash
# Convert every <book>/<chapter>.txt under this script's directory into an
# .m4b audiobook chapter in <book>/Audio/ using the macOS `say` TTS engine.
# Existing output files are skipped, so the script is safe to re-run.
#
# Fixes vs. previous version:
#  - shebang was /bin/sh but the script used bash-only `shopt` and `[[ =~ ]]`
#  - the `[[ "${entry##*.}" =~ txt ]]` line discarded its result (no-op); removed
#  - nullglob guards against iterating the literal "*.txt" when no files match

main_dir="$( cd "$( dirname "$0" )" && pwd )"

shopt -s nullglob

for subfolder in "$main_dir"/*; do
    [ -d "$subfolder" ] || continue

    audiofolder="$subfolder/Audio"
    mkdir -p "$audiofolder"

    for inputfile in "$subfolder"/*.txt; do
        fn=$(basename "$inputfile")
        outputfile="$audiofolder/${fn%.*}.m4b"

        echo "Current time : $(date +"%T")"
        echo "$inputfile ->"
        echo "$outputfile"

        if [ -f "$outputfile" ]; then
            echo "$outputfile exists: skipping"
        else
            # NOTE(review): --file-format=m4bf is assumed to be a valid
            # `say` audiobook container id on this macOS version — confirm
            # with `say --file-format=?`.
            say --voice=Sinji \
                --output-file="$outputfile" \
                --input-file="$inputfile" \
                --file-format=m4bf \
                --quality=127 \
                -r 200 \
                --data-format=aac
        fi
    done
done

@ -97,6 +97,7 @@ def clean_text(raw: str, repl: dict) -> str:
# Apply loaded replacements # Apply loaded replacements
for key, val in repl.items(): for key, val in repl.items():
# print(f"Replacing: {key} → {val}")
txt = txt.replace(key, val) txt = txt.replace(key, val)
# Collapse 3+ blank lines → max 1 # Collapse 3+ blank lines → max 1

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

@ -16,7 +16,7 @@ console.log(">>> log_view.js LOADING…");
--------------------------------------------------------- */ --------------------------------------------------------- */
let LOG_FILTER = "ALL"; let LOG_FILTER = "ALL";
let LAST_LOG_INDEX = -1; // For delta polling let LAST_LOG_INDEX = -1; // For delta polling
const MAX_LOG_LINES = 2000; // Rolling cap to prevent freezing const MAX_LOG_LINES = 1000; // Rolling cap to prevent freezing
function applyLogFilter() { function applyLogFilter() {
console.log(">>> log_view.js applyLogFilter(), filter =", LOG_FILTER); console.log(">>> log_view.js applyLogFilter(), filter =", LOG_FILTER);
@ -49,11 +49,11 @@ document.addEventListener("DOMContentLoaded", () => {
console.log(">>> log_view.js: log viewer detected."); console.log(">>> log_view.js: log viewer detected.");
// Filter dropdown // Filter dropdown
filterSel.addEventListener("change", () => { // filterSel.addEventListener("change", () => {
LOG_FILTER = filterSel.value; // LOG_FILTER = filterSel.value;
console.log(">>> log_view.js filter changed to:", LOG_FILTER); // console.log(">>> log_view.js filter changed to:", LOG_FILTER);
applyLogFilter(); // applyLogFilter();
}); // });
// Clear log window // Clear log window
if (clearBtn) { if (clearBtn) {
@ -127,6 +127,6 @@ function pollLogs() {
} }
// Poll every 800 ms // Poll every 800 ms
setInterval(pollLogs, 800); setInterval(pollLogs, 1800);
console.log(">>> log_view.js LOADED"); console.log(">>> log_view.js LOADED");

@ -29,8 +29,6 @@
<script> <script>
const BOOK_ID = "{{ book_id }}"; const BOOK_ID = "{{ book_id }}";
</script> </script>
<script src="/static/js/helpers.js"></script>
<!-- Shared log viewer --> <!-- Shared log viewer -->
<script src="/static/js/log_view.js"></script> <script src="/static/js/log_view.js"></script>
@ -38,7 +36,4 @@
<script src="/static/js/dashboard.js"></script> <script src="/static/js/dashboard.js"></script>
<!-- Existing global app logic --> <!-- Existing global app logic -->
<script src="/static/js/progress.js"></script>
<script src="/static/js/app.js"></script>
{% endblock %} {% endblock %}

@ -57,7 +57,9 @@ Copyright ©=
本站立场无关= 本站立场无关=
均由网友发表或上传= 均由网友发表或上传=
感谢各位书友的支持,您的支持就是我们最大的动力 感谢各位书友的支持,您的支持就是我们最大的动力
飘天文学www.piaotia.com
感谢各位书友的支持
您的支持就是我们最大的动力
# ---------- COMMON NOISE ---------- # ---------- COMMON NOISE ----------
广告= 广告=
广告位= 广告位=

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save