# scraper/tasks/save_tasks.py
import os
import re

from celery import shared_task

from logbus.publisher import log

# Digits immediately preceding a ".htm" / ".html" suffix, e.g. ".../12345.html".
_CHAPTER_NUMBER_RE = re.compile(r'(\d+)\.html?')


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, result: dict, base_path: str):
    """
    Save parsed chapter text to disk.

    The chapter number is extracted from ``result["url"]`` and used to build
    a zero-padded file name (``00042.txt``) inside ``base_path``. The
    directory is created if it does not exist yet.

    Args:
        self: Task instance supplied because the task is declared with
            ``bind=True``; not used by the body.
        result: Mapping of the form ``{"url": ..., "text": ...}``. A missing
            ``"text"`` key is written as an empty file.
        base_path: Directory the chapter file is written into.

    Returns:
        ``{"chapter": <int>, "path": <str>}`` describing the written file.

    Raises:
        Exception: Any error raised while reading ``result`` or writing the
            file is logged and re-raised unchanged.
    """
    # Bound before the ``try`` so the error handler can always reference it,
    # even when reading ``result`` itself is what failed.
    url = None
    try:
        text = result.get("text", "")
        url = result.get("url")

        # Extract the chapter number from the URL,
        # e.g. .../12345.html ⇒ 12345
        chapter_number = extract_chapter_number(url)

        # ``exist_ok=True`` already tolerates an existing directory, so no
        # separate (race-prone) existence check is needed.
        os.makedirs(base_path, exist_ok=True)

        filename = f"{chapter_number:05d}.txt"
        path = os.path.join(base_path, filename)

        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log(f"[SAVE] Saved chapter {chapter_number} → {path}")
        return {"chapter": chapter_number, "path": path}
    except Exception as exc:
        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
        raise


def extract_chapter_number(url: str) -> int:
    """
    Utility extractor for chapter numbers from a URL.

    Example:
        https://site.com/1234.html → 1234

    Args:
        url: URL to inspect. The first run of digits directly followed by
            ``.htm`` or ``.html`` is taken as the chapter number.

    Returns:
        The chapter number, or ``0`` when ``url`` is not a string, contains
        no such pattern, or the digits cannot be converted to an ``int``.
    """
    # Best-effort contract: anything that is not a string (e.g. ``None`` from
    # a missing "url" key) yields the fallback instead of raising.
    if not isinstance(url, str):
        return 0

    match = _CHAPTER_NUMBER_RE.search(url)
    if match is None:
        return 0

    try:
        return int(match.group(1))
    except ValueError:
        # Pathologically long digit runs can exceed the interpreter's
        # int-conversion limit; keep the documented fallback.
        return 0