You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/save_tasks.py

58 lines
1.4 KiB

# scraper/tasks/save_tasks.py
from celery import shared_task
from logbus.publisher import log
import os
@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, result: dict, base_path: str):
    """
    Save parsed chapter text to disk.

    The chapter number is derived from the URL via
    ``extract_chapter_number`` and used to build a zero-padded file name
    (e.g. ``00042.txt``) inside ``base_path``.

    Args:
        result: Mapping with the keys ``"url"`` and ``"text"``. A missing
            ``"text"`` key results in an empty file.
        base_path: Directory to write into; created if it does not exist.

    Returns:
        ``{"chapter": <chapter number>, "path": <path of the written file>}``.

    Raises:
        Exception: Any error raised while saving is logged and re-raised.
    """
    # Bound before the ``try`` so the error handler can always reference it,
    # even when reading ``result`` itself is what failed.
    url = None
    try:
        text = result.get("text", "")
        url = result.get("url")

        # Derive the chapter number from the URL,
        # e.g. .../12345.html => 12345
        chapter_number = extract_chapter_number(url)

        # exist_ok=True already tolerates an existing directory, so no
        # separate (race-prone) existence check is needed.
        os.makedirs(base_path, exist_ok=True)

        filename = f"{chapter_number:05d}.txt"
        path = os.path.join(base_path, filename)
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log(f"[SAVE] Saved chapter {chapter_number} -> {path}")
        return {"chapter": chapter_number, "path": path}
    except Exception as exc:
        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
        raise
def extract_chapter_number(url: str) -> int:
    """
    Extract the chapter number from a chapter URL.

    The number is the first run of digits that is directly followed by
    ``.htm`` or ``.html``.

    Example: https://site.com/1234.html → 1234

    Args:
        url: The chapter URL. Non-string values (e.g. ``None``) are
            tolerated and treated as "no number found".

    Returns:
        The chapter number, or ``0`` when none can be extracted.
    """
    import re

    # Best-effort helper: anything that is not a string has no number.
    if not isinstance(url, str):
        return 0

    m = re.search(r'(\d+)\.html?', url)
    if m is None:
        return 0

    try:
        return int(m.group(1))
    except ValueError:
        # Keep the "never raises" contract even if int() rejects the digits.
        return 0