You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/parse_tasks.py

58 lines
1.4 KiB

# scraper/tasks/parse_tasks.py
from celery import shared_task
from logbus.publisher import log
from scraper.utils import clean_text
from bs4 import BeautifulSoup
# Candidate CSS selectors for the chapter body, tried in priority order.
# Many Chinese novel sites use containers like these.
# "div#content" and "div.content" are deliberately not listed: each could only
# match when the broader "#content" / ".content" had already matched earlier
# in the list, so they could never be the selector that wins.
_CONTENT_SELECTORS = (
    "#content",
    ".content",
    "div#chaptercontent",
    "#chapterContent",
)


@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, html: str, chapter_url: str):
    """
    Parse downloaded chapter HTML into clean text.

    The HTML is parsed with BeautifulSoup's "html.parser". The first element
    matching one of ``_CONTENT_SELECTORS`` (in that order) supplies the text;
    when none matches, a warning is logged and the text of the whole document
    is used instead. The extracted text is passed through ``clean_text``.

    Args:
        self: The bound Celery task instance (unused here).
        html: Raw HTML of the downloaded chapter page.
        chapter_url: URL the HTML came from; used in log lines and echoed
            back in the result.

    Returns:
        A dict:
        {
            "url": chapter_url,
            "text": "...parsed text..."
        }

    Raises:
        Exception: Any error raised while parsing is logged and re-raised
            unchanged.
    """
    try:
        log(f"[PARSE] Start parsing: {chapter_url}")
        soup = BeautifulSoup(html, "html.parser")

        # Probe selectors lazily so lookup stops at the first one that hits.
        candidates = (soup.select_one(sel) for sel in _CONTENT_SELECTORS)
        node = next((tag for tag in candidates if tag is not None), None)

        if node is None:
            log(
                f"[PARSE] WARNING: no known content block found in {chapter_url}")
            # Fall back to the text of the entire document.
            text = clean_text(soup.get_text())
        else:
            text = clean_text(node.get_text())

        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")
        return {
            "url": chapter_url,
            "text": text,
        }
    except Exception as exc:
        # Task boundary: record the failure, then propagate it unchanged.
        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
        raise