kmftools/bookscraper/scraper/tasks/parse_tasks.py


# scraper/tasks/parse_tasks.py
from celery_app import celery_app
from logbus.publisher import log
from bs4 import BeautifulSoup

from scraper.utils import clean_text, load_replacements

print(">>> [IMPORT] parse_tasks.py loaded")


@celery_app.task(bind=True, queue="parse", ignore_result=False)
def parse_chapter(self, download_result: dict, meta: dict):
    """
    download_result:
        {
            "chapter": int,
            "url": str,
            "html": str
        }

    meta:
        {
            "title": str,
            "author": str,
            "description": str
        }
    """
    chapter_num = download_result["chapter"]
    url = download_result["url"]
    html = download_result["html"]

    log(f"[PARSE] Parsing chapter {chapter_num}")

    soup = BeautifulSoup(html, "lxml")

    # Candidate containers for common chapter-body markup;
    # the first selector that matches wins.
    selectors = [
        "#content",
        ".content",
        "div#content",
        "div.content",
        "div#chaptercontent",
        "#chapterContent",
        ".read-content",
    ]

    node = None
    for sel in selectors:
        tmp = soup.select_one(sel)
        if tmp:
            node = tmp
            break

    # Fall back to the full page text if no known container matched.
    raw = node.get_text() if node else soup.get_text()

    # Apply the configured text replacements.
    REPL = load_replacements()
    text = clean_text(raw, REPL)

    # ---------------------------------------------------
    # HEADER ONLY FOR CHAPTER 1
    # ---------------------------------------------------
    if chapter_num == 1:
        header = (
            f"{meta.get('title', '')}\n"
            f"Author: {meta.get('author', '')}\n"
            f"Description:\n{meta.get('description', '')}\n"
            f"URL: {url}\n" + "-" * 50 + "\n\n"
        )
        text = header + text

    log(f"[PARSE] Parsed chapter {chapter_num}: {len(text)} chars")

    return {
        "chapter": chapter_num,
        "url": url,
        "text": text,
        "length": len(text),
    }
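
For context, a minimal sketch of how this task might be wired into the pipeline. It assumes an upstream download task (here called download_chapter, which is not part of this file) that returns the download_result dict described in the docstring; in a Celery chain, the previous task's return value is prepended to the arguments of the next signature, so parse_chapter receives (download_result, meta).

# pipeline_sketch.py -- illustrative only; download_chapter and its
# module path are assumptions, not shown in this repository file
from celery import chain

from scraper.tasks.download_tasks import download_chapter  # assumed module
from scraper.tasks.parse_tasks import parse_chapter

meta = {
    "title": "Example Book",
    "author": "Jane Doe",
    "description": "A sample description.",
}

# download_chapter's return dict becomes parse_chapter's first argument
job = chain(
    download_chapter.s(1, "https://example.com/book/chapter-1"),
    parse_chapter.s(meta),
)
result = job.apply_async()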
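
The scraper.utils helpers are imported above but not shown in this file. The following is a hypothetical sketch inferred purely from how they are called (load_replacements() returns a mapping, clean_text(raw, REPL) applies it and tidies the text); the real implementations may differ.

# scraper/utils.py -- hypothetical sketch, the real module is not shown
import json
import re
from pathlib import Path


def load_replacements(path: str = "replacements.json") -> dict:
    """Load an {old: new} substitution table; empty if the file is missing."""
    p = Path(path)
    if not p.exists():
        return {}
    return json.loads(p.read_text(encoding="utf-8"))


def clean_text(raw: str, replacements: dict) -> str:
    """Apply substitutions, strip trailing spaces, collapse blank-line runs."""
    text = raw
    for old, new in replacements.items():
        text = text.replace(old, new)
    text = re.sub(r"[ \t]+\n", "\n", text)   # trailing whitespace
    text = re.sub(r"\n{3,}", "\n\n", text)   # runs of blank lines
    return text.strip()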