# =========================================================
# File: scraper/tasks/parse_tasks.py
# Purpose: Parse downloaded HTML into clean chapter text.
#
# Abort Behavior:
#   - parse MUST ALWAYS RUN once a download has started,
#     even if the user triggers abort afterwards
#     (abort only prevents new chapters from starting)
#
# Logging:
#   - same unified log_msg(book_id, message) as download_tasks
#   - publisher.log → console
#   - ui_log.push_ui → GUI
# =========================================================

from celery_app import celery_app
from bs4 import BeautifulSoup

from scraper.utils import clean_text, load_replacements
from scraper.tasks.download_tasks import log_msg  # unified logger

print(">>> [IMPORT] parse_tasks.py loaded")
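
# How this task is expected to be wired (sketch, not verbatim from this
# repo): parse_chapter runs as the second link of a download → parse chain,
# so Celery prepends download_chapter's return dict as `download_result`.
# The download_chapter arguments shown here are an assumption.
#
#   from celery import chain
#   from scraper.tasks.download_tasks import download_chapter
#
#   chain(
#       download_chapter.s(chapter_url, meta),  # hypothetical arguments
#       parse_chapter.s(meta),
#   ).apply_async()
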
@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
    """
    Parse raw HTML returned by download_chapter into clean chapter text.
    """
    # Extract book_id stored by download_tasks
    book_id = download_result.get("book_id", "NOBOOK")
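
    # Assumed shape of download_result on the skip path, inferred from the
    # .get() calls below (not a documented contract):
    #   {"book_id": "...", "chapter": N, "skipped": True}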

    # ------------------------------------------------------------
    # 1) DOWNLOAD SKIPPED → PARSE ALSO SKIPS
    # ------------------------------------------------------------
    if download_result.get("skipped"):
        chapter = download_result.get("chapter")
        log_msg(book_id, f"[PARSE] SKIP chapter {chapter} (download skipped)")
        return download_result

    # ------------------------------------------------------------
    # 2) Normal parsing
    # ------------------------------------------------------------
    chapter_num = download_result["chapter"]
    chapter_url = download_result["url"]
    html = download_result["html"]

    log_msg(book_id, f"[PARSE] Parsing chapter {chapter_num}")

    soup = BeautifulSoup(html, "lxml")
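
    # Candidate chapter-content containers, tried in order; the first
    # selector that matches wins. The list is site-specific.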
    selectors = [
        "#content",
        ".content",
        "div#content",
        "div.content",
        "div#chaptercontent",
        "#chapterContent",
        ".read-content",
    ]

    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break
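
    # Fall back to the text of the whole page when no known container
    # matches; noisier than a scoped extraction, but the chapter is kept.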
    raw = node.get_text() if node else soup.get_text()

    # ------------------------------------------------------------
    # Apply global replacements
    # ------------------------------------------------------------
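    # load_replacements() runs on every chapter, so edited replacement
    # rules take effect without a worker restart (assuming it re-reads
    # its source on each call rather than caching).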
    REPL = load_replacements()
    text = clean_text(raw, REPL)
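
    # With illustrative metadata, the chapter-1 header built below renders
    # roughly as:
    #
    #   Some Title
    #   Author: Some Author
    #   Description:
    #   Some description text
    #   Book URL: https://example.com/book/123
    #   --------------------------------------------------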

    # ------------------------------------------------------------
    # Chapter 1 gets the full header
    # ------------------------------------------------------------
    if chapter_num == 1:
        book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"

        header = (
            f"{meta.get('title', '')}\n"
            f"Author: {meta.get('author', '')}\n"
            f"Description:\n{meta.get('description', '')}\n"
            f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
        )
        text = header + text

    log_msg(book_id, f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")
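
    # The result dict feeds the next task in the pipeline (presumably a
    # save/assemble step; that task lives outside this file).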
    return {
        "book_id": book_id,
        "chapter": chapter_num,
        "url": chapter_url,
        "text": text,
        "length": len(text),
    }
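

if __name__ == "__main__":
    # Local smoke test (sketch): Celery tasks can be called synchronously
    # like plain functions, so this runs without a broker or worker. It
    # assumes load_replacements() and log_msg() also work outside a worker;
    # all payload values are illustrative.
    fake_download = {
        "book_id": "demo",
        "chapter": 1,
        "url": "https://example.com/book/1/chapter/1",
        "html": "<html><body><div id='content'>Hello world.</div></body></html>",
    }
    fake_meta = {
        "title": "Demo Title",
        "author": "Demo Author",
        "description": "Demo description.",
        "book_url": "https://example.com/book/1",
    }
    print(parse_chapter(fake_download, fake_meta))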