# kmftools/bookscraper/scraper/tasks/download_tasks.py
# (216 lines, 6.7 KiB)
# ============================================================
# File: scraper/tasks/download_tasks.py
# Purpose: Download chapter HTML with global concurrency,
# retry/backoff logic, 429 support, and abort-awareness.
#
# Logging:
# - timestamp + book_id in message
# - logbus.publisher → console
# - ui_log.push_ui → Redis GUI
# ============================================================
from celery_app import celery_app
from scraper.utils import get_save_path
from scraper.abort import abort_requested, chapter_started, mark_chapter_started
from scraper.progress import (
inc_completed,
inc_chapter_done,
inc_chapter_download_skipped,
)
from logbus.publisher import log
from scraper.ui_log import push_ui
import requests
import redis
import os
import time
from datetime import datetime
# Import-time marker: confirms in worker logs that this module was loaded.
print(">>> [IMPORT] download_tasks.py loaded")
# -----------------------------------------------------------
# TIMESTAMPED LOG WRAPPER
# -----------------------------------------------------------
def log_msg(book_id: str, message: str):
    """Emit *message* to both the console log bus and the Redis GUI feed,
    prefixed with a wall-clock timestamp and the book id."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    stamped = f"{timestamp} [{book_id}] {message}"
    log(stamped)
    push_ui(stamped)
# -----------------------------------------------------------
# ENV CONFIG
# -----------------------------------------------------------
# Maximum Celery retry attempts per chapter download.
MAX_RETRIES = int(os.getenv("DOWNLOAD_MAX_RETRIES", "7"))
# Base seconds for exponential backoff: delay = BASE_DELAY * BACKOFF ** retries.
BASE_DELAY = int(os.getenv("DOWNLOAD_BASE_DELAY", "2"))
# Exponential backoff multiplier.
BACKOFF = int(os.getenv("DOWNLOAD_BACKOFF_MULTIPLIER", "2"))
# Fixed wait (seconds) applied when the server answers HTTP 429.
DELAY_429 = int(os.getenv("DOWNLOAD_429_DELAY", "10"))
# Size of the global download semaphore shared across all workers via Redis.
MAX_CONCURRENCY = int(os.getenv("DOWNLOAD_MAX_GLOBAL_CONCURRENCY", "1"))
# Minimum delay (seconds) enforced between downloads via the Redis delay lock;
# <= 0 disables the delay mechanism entirely.
GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1"))
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
# Shared client used for the semaphore counter and the delay lock below.
redis_client = redis.Redis.from_url(REDIS_URL)
# Redis key holding the count of currently active downloads (semaphore).
SEM_KEY = "download:active"
# Redis key acting as a TTL-based inter-download delay lock.
DELAY_KEY = "download:delay_lock"
# -----------------------------------------------------------
# DELAY + CONCURRENCY HELPERS
# -----------------------------------------------------------
def wait_for_global_delay():
    """Block until the shared inter-download delay lock expires.

    Polls the Redis delay key every 100ms; immediately returns when the
    delay mechanism is disabled (GLOBAL_DELAY <= 0).
    """
    if GLOBAL_DELAY > 0:
        while redis_client.exists(DELAY_KEY):
            time.sleep(0.1)
def set_global_delay():
    """Arm the shared delay lock for GLOBAL_DELAY seconds.

    No-op when the delay mechanism is disabled. nx=True means an already
    armed lock is left untouched, so concurrent callers never extend it.
    """
    if GLOBAL_DELAY > 0:
        redis_client.set(DELAY_KEY, "1", nx=True, ex=GLOBAL_DELAY)
def acquire_global_slot(max_slots: int, retry_delay: float = 0.5):
    """Spin until one of *max_slots* global download slots is available.

    Optimistically increments the shared Redis counter; on overshoot the
    increment is rolled back and the caller sleeps *retry_delay* seconds
    before trying again.
    """
    while True:
        taken = redis_client.incr(SEM_KEY)
        if taken <= max_slots:
            break
        redis_client.decr(SEM_KEY)
        time.sleep(retry_delay)
def release_global_slot():
    """Return the global download slot taken by acquire_global_slot()."""
    redis_client.decr(SEM_KEY)
# ============================================================
# CELERY TASK — NEW SIGNATURE WITH chapter_dict + book_meta
# ============================================================
@celery_app.task(bind=True, queue="download", ignore_result=False)
def download_chapter(self, book_id: str, chapter_dict: dict, book_meta: dict):
    """
    Download one chapter's HTML with abort-awareness, skip-if-exists,
    a global Redis concurrency slot, and retry/backoff (incl. 429 handling).

    New unified chapter model:
    chapter_dict = {
        "num": int,
        "url": str,
        "title": str,
        "volume_path": str
    }
    book_meta is propagated through the pipeline for parse/save.

    Returns:
        dict with keys book_id, chapter, html, skipped, path, book_meta.
        On the abort-skip path an extra "abort": True key is added; on
        both skip paths html is None.

    Raises:
        Retry (via self.retry) on download errors, up to MAX_RETRIES.
    """
    chapter_num = chapter_dict.get("num")
    chapter_url = chapter_dict.get("url")
    chapter_title = chapter_dict.get("title") or f"Chapter {chapter_num}"
    volume_path = chapter_dict.get("volume_path")
    # -----------------------------------------------------------
    # ABORT BEFORE START
    # -----------------------------------------------------------
    # Only chapters that have NOT yet been marked started are skipped;
    # an in-flight chapter is allowed to finish despite an abort.
    if abort_requested(book_id) and not chapter_started(book_id, chapter_num):
        msg = f"[ABORT] Skip chapter {chapter_num} (abort active, not started)"
        log_msg(book_id, msg)
        inc_chapter_download_skipped(book_id)
        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "html": None,
            "skipped": True,
            "path": None,
            "abort": True,
            "book_meta": book_meta,
        }
    # Mark chapter as started
    mark_chapter_started(book_id, chapter_num)
    # -----------------------------------------------------------
    # SKIP IF FILE ALREADY EXISTS
    # -----------------------------------------------------------
    save_path = get_save_path(chapter_num, volume_path)
    if os.path.exists(save_path):
        log_msg(book_id, f"[DL] SKIP {chapter_num} ({chapter_title}) → {save_path}")
        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "html": None,
            "skipped": True,
            "path": save_path,
            "book_meta": book_meta,
        }
    # -----------------------------------------------------------
    # GLOBAL + SYNC DELAY
    # -----------------------------------------------------------
    # NOTE(review): with GLOBAL_DELAY > 0 this sleeps GLOBAL_DELAY here AND
    # then waits on the shared delay lock below — the two look redundant;
    # confirm whether both are intended.
    if GLOBAL_DELAY > 0:
        time.sleep(GLOBAL_DELAY)
    wait_for_global_delay()
    # Blocks until one of MAX_CONCURRENCY global slots is free; the slot
    # is released in the finally block below.
    acquire_global_slot(MAX_CONCURRENCY)
    # log_msg(book_id, f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
    # -----------------------------------------------------------
    # HTTP DOWNLOAD
    # -----------------------------------------------------------
    try:
        log_msg(
            book_id,
            f"[DL] Downloading {chapter_num} ({chapter_title}): {chapter_url}",
        )
        resp = requests.get(
            chapter_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=20,
        )
        resp.raise_for_status()
        # Set the encoding before touching resp.text; falls back to gb2312
        # when requests cannot detect one.
        resp.encoding = resp.apparent_encoding or "gb2312"
        html = resp.text
        # NOTE(review): len(html) counts characters, not bytes, despite the
        # log wording.
        log_msg(book_id, f"[DL] OK {chapter_num}: {len(html)} bytes")
        return {
            "book_id": book_id,
            "chapter": chapter_dict,
            "html": html,
            "skipped": False,
            "path": save_path,
            "book_meta": book_meta,
        }
    except Exception as exc:
        # self.request.retries is 0 on the first failure, so the log shows
        # "attempt 0/MAX_RETRIES" for the initial error.
        attempt = self.request.retries
        delay = BASE_DELAY * (BACKOFF**attempt)
        # Specific 429 handler
        if getattr(getattr(exc, "response", None), "status_code", None) == 429:
            log_msg(
                book_id,
                f"[DL] 429 {chapter_num} → WAIT {DELAY_429}s "
                f"(attempt {attempt}/{MAX_RETRIES})",
            )
            # NOTE(review): this sleep runs while the global slot is still
            # held (released only in finally), so other workers are blocked
            # for the full DELAY_429.
            time.sleep(DELAY_429)
            set_global_delay()
            raise self.retry(exc=exc, countdown=0, max_retries=MAX_RETRIES)
        # Normal retry
        log_msg(
            book_id,
            f"[DL] ERROR {chapter_num}: {exc} → retry in {delay}s "
            f"(attempt {attempt}/{MAX_RETRIES})",
        )
        raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
    finally:
        # Runs on success, retry, and failure alike: arm the inter-download
        # delay lock and free the global concurrency slot.
        set_global_delay()
        release_global_slot()
        # log_msg(book_id, f"[DL] RELEASED SLOT for chapter {chapter_num}")