parsing fix + progress tracking (half working)

feat/dashboard-upgrade
peter.fong 2 weeks ago
parent 7ee6c5e276
commit 5159c32f58

@@ -134,3 +134,11 @@ docker compose up
docker compose down
docker compose build
docker compose up
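# Package the source tree for transfer, skipping Python caches and virtualenvs: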
tar \
--exclude="**pycache**" \
--exclude="_/**pycache**/_" \
--exclude="\*.pyc" \
--exclude=".venv" \
--exclude="venv" \
-czvf project.tar.gz .

@@ -125,6 +125,33 @@ def celery_result(task_id):
return jsonify({"ready": False})
# =====================================================
# API: book status new model
# =====================================================
def getStatus(book_id):
state = r.hgetall(f"book:{book_id}:state")
status = state.get("status") or "unknown"
dl_done = int(state.get("chapters_download_done", 0))
dl_skipped = int(state.get("chapters_download_skipped", 0))
dl_total = int(state.get("chapters_total", 0))
au_done = int(state.get("audio_done") or 0)
title = state.get("title") or book_id
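# Assumes one audio file per downloaded chapter, so audio_total simply mirrors the download total.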
au_total = dl_total
return {
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_skipped": dl_skipped,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
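# Backing hash, one per book (Redis stores all field values as strings), e.g.:
#   HGETALL book:<book_id>:state →
#   {"status": "scraping", "title": "...", "chapters_total": "120",
#    "chapters_download_done": "87", "chapters_download_skipped": "3", "audio_done": "54"}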
# =====================================================
# REDIS BACKEND — BOOK STATE MODEL
# =====================================================
@@ -132,36 +159,29 @@ REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
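# decode_responses=True makes get/hgetall return str instead of bytes,
# which is why callers parse counters with int().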
def list_active_books():
def list_active_booksold():
"""Return list of active books from Redis Book State Model."""
keys = r.keys("book:*:status")
keys = r.keys("book:*:state")
books = []
for key in keys:
book_id = key.split(":")[1]
status = r.get(f"book:{book_id}:status") or "unknown"
title = r.get(f"book:{book_id}:title") or book_id
dl_done = int(r.get(f"book:{book_id}:download:done") or 0)
dl_total = int(r.get(f"book:{book_id}:download:total") or 0)
au_done = int(r.get(f"book:{book_id}:audio:done") or 0)
au_total = dl_total
books.append(
{
"book_id": book_id,
"title": title,
"status": status,
"download_done": dl_done,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
)
print(book_id)
books.append(getStatus(book_id))
return books
def list_active_books():
books = []
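# SCAN (cursor-based) instead of KEYS: iterates the keyspace without blocking Redis.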
for key in r.scan_iter(match="book:*:state", count=1000):
first = key.find(":")
second = key.find(":", first + 1)
book_id = key[first + 1 : second]
books.append(getStatus(book_id))
return books
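# Quick check once the app is running (host/port depend on the Flask deployment):
#   curl http://localhost:5000/api/books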
# =====================================================
# API: list all active books
# =====================================================
@@ -170,27 +190,10 @@ def api_books():
return jsonify(list_active_books())
# =====================================================
# API: book status
# =====================================================
@app.route("/api/book/<book_id>/status")
def api_book_status(book_id):
status = r.get(f"book:{book_id}:status") or "unknown"
dl_done = int(r.get(f"book:{book_id}:download:done") or 0)
dl_total = int(r.get(f"book:{book_id}:download:total") or 0)
au_done = int(r.get(f"book:{book_id}:audio:done") or 0)
au_total = dl_total
return jsonify(
{
"book_id": book_id,
"status": status,
"download_done": dl_done,
"download_total": dl_total,
"audio_done": au_done,
"audio_total": au_total,
}
)
return jsonify(getStatus(book_id))
# =====================================================

Binary file not shown.

@@ -25,7 +25,7 @@ def set_total(book_id: str, total: int):
# ------------------------------------------------------------
# COUNTERS
# COUNTERS legacy
# ------------------------------------------------------------
def inc_completed(book_id: str):
r.incr(f"progress:{book_id}:completed")
@@ -96,6 +96,7 @@ def init_book_state(
"status": "scraping",
"chapters_total": chapters_total,
"chapters_done": 0,
"chapters_download_skipped": 0,
"audio_total": 0,
"audio_done": 0,
"last_update": now,
@@ -120,7 +121,7 @@ def set_last_update(book_id: str):
# ------------------------------------------------------------
# Chapter counters
# Chapter counters new model
# ------------------------------------------------------------
def set_chapter_total(book_id: str, total: int):
key = f"book:{book_id}:state"
@@ -128,9 +129,15 @@ def set_chapter_total(book_id: str, total: int):
set_last_update(book_id)
def inc_chapter_download_skipped(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_download_skipped", 1)
set_last_update(book_id)
def inc_chapter_done(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "chapters_done", 1)
r.hincrby(key, "chapters_download_done", 1)
set_last_update(book_id)
@@ -149,6 +156,12 @@ def inc_audio_done(book_id: str):
set_last_update(book_id)
def inc_audio_skipped(book_id: str):
key = f"book:{book_id}:state"
r.hincrby(key, "audio_skipped", 1)
set_last_update(book_id)
# ------------------------------------------------------------
# Skip reasons
# ------------------------------------------------------------
@@ -171,7 +184,14 @@ def get_state(book_id: str):
state = r.hgetall(key) or {}
# Numeric conversions
numeric_fields = ["chapters_total", "chapters_done", "audio_total", "audio_done"]
numeric_fields = [
"chapters_total",
"chapters_download_done",
"chapters_download_skipped",
"audio_total",
"audio_skipped",
"audio_done",
]
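# Redis returns hash values as strings; convert to int, guarded by the try below.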
for field in numeric_fields:
if field in state:
try:

@@ -36,7 +36,8 @@
All rights reserved=
Copyright=
飘天文学=
=
…=
# --- Piaotia specific ---
请记住本书域名=
请收藏本书=
@@ -53,7 +54,15 @@ Copyright=
章节出错=
点此举报=
举报原因=
www.piaotia.com=
www.piaotian.com=
www.=
www=
.com=
piaotia=
.net=
piaotian=
www.piaotia.com=
# --- Ads / QR / watermark ---
关注公众号=
微信扫一扫=
@@ -68,10 +77,17 @@ sponsored=
ADVERTISEMENT=
Advertisment=
Adblock=
bookid=
bookname=
# --- Mode / UI related ---
选择背景颜色=
选择字体大小=
繁體中文=
模式选择=
阅读模式=
冲榜
求票
诸神学徒
感谢各位书友的支持=
您的支持就是我们最大的动力=
感谢各位书友的支持,您的支持就是我们最大的动力=

@@ -8,6 +8,7 @@ import os
import subprocess
import time
from scraper.progress import inc_audio_done, inc_audio_skipped
from scraper.abort import abort_requested
from redis import Redis
from urllib.parse import urlparse
@@ -52,6 +53,7 @@ def generate_audio(
# Abort early
if abort_requested(book_id, backend_client):
inc_audio_skipped(book_id)
log(f"[AUDIO] ABORT detected → skip CH{chapter_number}")
return
@@ -160,6 +162,8 @@ def generate_audio(
# ============================================================
try:
subprocess.run(cmd, shell=True, check=True, timeout=AUDIO_TIMEOUT)
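# check=True raises CalledProcessError on a non-zero exit, so the done counter
# only advances after a clean run.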
inc_audio_done(book_id)
log(f"[AUDIO] CH{chapter_number}: Completed")
except subprocess.TimeoutExpired:

@@ -13,6 +13,11 @@ from celery_app import celery_app
from scraper.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
)
from logbus.publisher import log
from scraper.ui_log import push_ui
@@ -111,7 +116,7 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
log_msg(book_id, msg)
inc_chapter_download_skipped(book_id)
return {
"book_id": book_id,
"chapter": chapter_dict,
@@ -149,7 +154,7 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
wait_for_global_delay()
acquire_global_slot(MAX_CONCURRENCY)
log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
# -----------------------------------------------------------
# HTTP DOWNLOAD
@@ -207,4 +212,4 @@ def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
finally:
set_global_delay()
release_global_slot()
log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")
# log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")

@@ -11,9 +11,85 @@ from bs4 import BeautifulSoup
from scraper.utils import clean_text, load_all_replacements
from scraper.tasks.download_tasks import log_msg # unified logger
from bs4 import NavigableString, Comment
print(">>> [IMPORT] parse_tasks.py loaded (enhanced parser)")
def extract_piaotia_content(soup):
"""
Extract clean chapter content from Piaotia pages.
Start after the table following <H1>.
End before nav/ads/footer/copyright.
"""
h1 = soup.find("h1")
if not h1:
return None
# -------- Find first table after <h1> --------
table = None
for sib in h1.next_siblings:
if getattr(sib, "name", None) == "table":
table = sib
break
if not table:
return None
parts = []
# -------- Iterate after table --------
for sib in table.next_siblings:
name = getattr(sib, "name", None)
text = None
if hasattr(sib, "get_text"):
text = sib.get_text(strip=True)
# === STOP CONDITIONS ===
# HTML comments like <!-- 翻页上AD开始 --> ("page-nav ad begins") bracket the ad blocks
if isinstance(sib, Comment) and ("翻页" in sib):
break
# Explicit footer blocks
if name == "div":
sid = sib.get("id", "")
cls = sib.get("class", [])
if sid in ("thumb", "tags", "tips", "Commenddiv", "feit2"):
break
# Copyright block ("重要声明" = "important notice"): the strongest end-of-content signal
if text and ("重要声明" in text or "Copyright" in text):
break
# Navigation links or "推荐阅读" ("recommended reading") / "目录" ("table of contents")
if text and (text.startswith(("推荐阅读", "目录", "目 录"))):
break
# Skip scripts, ads, centers
if name in ("script", "style"):
continue
# Skip JS containers like <center><script>...</script></center>
if name == "center":
continue
# === ACCUMULATE TEXT ===
if isinstance(sib, NavigableString):
s = sib.strip()
if s:
parts.append(s)
elif hasattr(sib, "get_text"):
t = sib.get_text(separator="\n").strip()
if t:
parts.append(t)
return "\n".join(parts).strip()
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict):
"""
@@ -63,32 +139,38 @@ def parse_chapter(self, download_result: dict):
node = tmp
break
# ------------------------------------------------------------
# PIAOTIA FALLBACK:
# Extract content between <H1> and the "bottomlink" block.
# ------------------------------------------------------------
raw = None
# --- STRICT SELECTOR FAILED → Try Piaotia extractor ---
if node is None:
h1 = soup.find("h1")
if h1:
content_parts = []
for sib in h1.next_siblings:
sib_class = getattr(sib, "get", lambda *_: None)("class")
if sib_class and (
"bottomlink" in sib_class or sib_class == "bottomlink"
):
break
if getattr(sib, "name", None) in ["script", "style", "center"]:
continue
if hasattr(sib, "get_text"):
content_parts.append(sib.get_text(separator="\n"))
else:
content_parts.append(str(sib))
raw = "\n".join(content_parts)
raw = extract_piaotia_content(soup)
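# extract_piaotia_content returns None when the <h1>/table anchors are missing,
# letting the FINAL FALLBACK below take over.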
# # ------------------------------------------------------------
# # PIAOTIA FALLBACK:
# # Extract content between <H1> and the "bottomlink" block.
# # ------------------------------------------------------------
# raw = None
# if node is None:
# h1 = soup.find("h1")
# if h1:
# content_parts = []
# for sib in h1.next_siblings:
# sib_class = getattr(sib, "get", lambda *_: None)("class")
# if sib_class and (
# "bottomlink" in sib_class or sib_class == "bottomlink"
# ):
# break
# if getattr(sib, "name", None) in ["script", "style", "center"]:
# continue
# if hasattr(sib, "get_text"):
# content_parts.append(sib.get_text(separator="\n"))
# else:
# content_parts.append(str(sib))
# raw = "\n".join(content_parts)
# ------------------------------------------------------------
# FINAL FALLBACK

@@ -8,12 +8,12 @@ print(">>> [IMPORT] save_tasks.py loaded")
from celery import shared_task
import os
from scraper.utils import get_save_path
from scraper.tasks.download_tasks import log_msg # unified logger
from scraper.progress import (
inc_completed,
inc_skipped,
inc_chapter_done,
inc_chapter_download_skipped,
)
from scraper.tasks.audio_tasks import generate_audio
@@ -54,7 +54,7 @@ def save_chapter(self, parsed: dict):
path = parsed.get("path", None)
log_msg(book_id, f"[SAVE] SKIP chapter {chapter_num}{path}")
inc_skipped(book_id)
inc_chapter_download_skipped(book_id)
volume_name = os.path.basename(volume_path.rstrip("/"))
@@ -103,6 +103,7 @@ def save_chapter(self, parsed: dict):
f.write(text)
log_msg(book_id, f"[SAVE] Saved chapter {chapter_num}{path}")
inc_chapter_done(book_id)
inc_completed(book_id)
# Determine volume name

@@ -97,6 +97,7 @@ def clean_text(raw: str, repl: dict) -> str:
# Apply loaded replacements
for key, val in repl.items():
# print(f"Replacing: {key} → {val}")
txt = txt.replace(key, val)
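# Entries with an empty right-hand side (e.g. "广告=") therefore delete the match outright.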
# Collapse 3+ blank lines → max 1

@@ -127,6 +127,6 @@ function pollLogs() {
}
// Poll interval in ms
setInterval(pollLogs, 800);
setInterval(pollLogs, 1800);
console.log(">>> log_view.js LOADED");

@@ -57,7 +57,9 @@ Copyright ©=
本站立场无关=
均由网友发表或上传=
感谢各位书友的支持,您的支持就是我们最大的动力
飘天文学www.piaotia.com
感谢各位书友的支持
您的支持就是我们最大的动力
# ---------- COMMON NOISE ----------
广告=
广告位=

File diff suppressed because it is too large