# scraper/tasks/save_tasks.py

import os
import re

from celery import shared_task

from logbus.publisher import log
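
# Note (assumption, not part of the original module): a worker that consumes
# the dedicated "save" queue can be started with Celery's -Q option, e.g.
#
#     celery -A scraper worker -Q save
#
# where "scraper" stands in for whatever Celery app name this project uses.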


@shared_task(bind=True, queue="save", ignore_result=False)
def save_chapter(self, result: dict, base_path: str):
    """
    Save parsed chapter text to disk.

    result = {
        "url": ...,
        "text": ...
    }
    """
    try:
        text = result.get("text", "")
        url = result.get("url")

        # Extract the chapter number from the URL,
        # e.g. .../12345.html -> 12345
        chapter_number = extract_chapter_number(url)

        # Make sure the output directory exists (no-op if it already does)
        os.makedirs(base_path, exist_ok=True)

        filename = f"{chapter_number:05d}.txt"
        path = os.path.join(base_path, filename)

        with open(path, "w", encoding="utf-8") as f:
            f.write(text)

        log(f"[SAVE] Saved chapter {chapter_number} → {path}")

        return {"chapter": chapter_number, "path": path}

    except Exception as exc:
        log(f"[SAVE] ERROR saving chapter from {url}: {exc}")
        raise


def extract_chapter_number(url: str) -> int:
    """
    Utility extractor for chapter numbers from a URL.
    Example: https://site.com/1234.html → 1234
    Returns 0 when no chapter number can be found.
    """
    m = re.search(r'(\d+)\.html?', url or "")
    if m:
        return int(m.group(1))
    return 0
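

# Usage sketch (assumption, not part of the original module): save_chapter is
# meant to be chained after a task that produces {"url": ..., "text": ...},
# here a hypothetical parse_chapter task:
#
#     from celery import chain
#     from scraper.tasks.parse_tasks import parse_chapter  # hypothetical import
#
#     chain(parse_chapter.s(chapter_url), save_chapter.s("/data/chapters")).apply_async()
#
# The partial signature save_chapter.s(base_path) receives the upstream result
# as its first argument, matching the (result, base_path) parameter order above.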