You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.4 KiB
58 lines
1.4 KiB
# scraper/tasks/parse_tasks.py
|
|
|
|
from celery import shared_task
|
|
from logbus.publisher import log
|
|
from scraper.utils import clean_text
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
@shared_task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, html: str, chapter_url: str):
    """
    Parse downloaded chapter HTML into clean text.

    Args:
        html: Raw HTML of a downloaded chapter page.
        chapter_url: URL the HTML came from; used for logging and echoed
            back in the result so downstream tasks can correlate it.

    Returns a dict:

        {
            "url": chapter_url,
            "text": "...parsed text..."
        }

    Raises:
        Exception: any parsing error is logged and re-raised so Celery
            records the task as failed.
    """
    try:
        log(f"[PARSE] Start parsing: {chapter_url}")

        soup = BeautifulSoup(html, "html.parser")

        # Many Chinese novel sites wrap the chapter body in one of these
        # containers. The original list also had "div#content" and
        # "div.content", but those were unreachable: any element matching
        # them also matches the earlier "#content"/".content" selectors,
        # which are tried first.
        possible_blocks = [
            "#content",
            ".content",
            "div#chaptercontent",
            "#chapterContent",
        ]

        # First selector that matches wins.
        node = next(
            (hit for sel in possible_blocks if (hit := soup.select_one(sel))),
            None,
        )

        if not node:
            # Best effort: fall back to the whole page's text rather than
            # failing the task outright.
            log(
                f"[PARSE] WARNING: no known content block found in {chapter_url}")
            text = clean_text(soup.get_text())
        else:
            text = clean_text(node.get_text())

        log(f"[PARSE] Finished parsing: {chapter_url} ({len(text)} chars)")

        return {
            "url": chapter_url,
            "text": text,
        }

    except Exception as exc:
        # Log with context, then re-raise so Celery marks the task failed
        # (and can retry if the caller configured retries).
        log(f"[PARSE] ERROR parsing {chapter_url}: {exc}")
        raise