Implement global download delay, Redis-based concurrency, cleanup utils, update pipelines

2 weeks ago · 788572e1fa
parent f27b33a882
commit 788572e1fa
9 changed files with 281 additions and 115 deletions
--- a/bookscraper/scraper/download_controller.py
+++ b/bookscraper/scraper/download_controller.py
@ -24,11 +24,14 @@ class DownloadController:
        self.book_base = os.path.join(root, self.title)
        os.makedirs(self.book_base, exist_ok=True)
-        # constant metadata for all chapters
+        # ------------------------------------------
        # FIXED: meta now includes book_url
        # ------------------------------------------
        self.meta = {
            "title": self.scrape_result.get("title"),
            "author": self.scrape_result.get("author"),
            "description": self.scrape_result.get("description"),
            "book_url": self.scrape_result.get("book_url"),
        }
    def get_volume_path(self, chapter_num: int) -> str:
@ -51,20 +54,17 @@ class DownloadController:
            chapter_num = ch["num"]
            chapter_url = ch["url"]
            # compute volume directory
            vol_path = self.get_volume_path(chapter_num)
            # build the pipeline for this chapter
            tasks.append(
                build_chapter_pipeline(
                    chapter_num,
                    chapter_url,
-                    vol_path,  # ✔ correct volume path!!
+                    vol_path,
-                    self.meta,  # ✔ pass metadata once
+                    self.meta,
                )
            )
        # parallel processing
        job_group = group(tasks)
        async_result = job_group.apply_async()
--- a/bookscraper/scraper/tasks/download_tasks.py
+++ b/bookscraper/scraper/tasks/download_tasks.py
@ -2,15 +2,150 @@
 from celery_app import celery_app
 from logbus.publisher import log
 import requests
 import os
 import time
 import redis
 from scraper.utils import get_save_path
 print(">>> [IMPORT] download_tasks.py loaded")
 # ---------------------------
 # Retry parameters from .env
 # ---------------------------
 MAX_RETRIES = int(os.getenv("DOWNLOAD_MAX_RETRIES", "7"))
 BASE_DELAY = int(os.getenv("DOWNLOAD_BASE_DELAY", "2"))
 BACKOFF = int(os.getenv("DOWNLOAD_BACKOFF_MULTIPLIER", "2"))
 DELAY_429 = int(os.getenv("DOWNLOAD_429_DELAY", "10"))
-@celery_app.task(bind=True, queue="download", ignore_result=False)
+# ---------------------------
-def download_chapter(self, chapter_num: int, chapter_url: str):
+# GLOBAL CONCURRENCY LIMIT
-    log(f"[DL] Downloading chapter {chapter_num}: {chapter_url}")
+# ---------------------------
 MAX_CONCURRENCY = int(os.getenv("DOWNLOAD_MAX_GLOBAL_CONCURRENCY", "1"))
 # ---------------------------
 # GLOBAL MINIMUM DELAY
 # ---------------------------
 GLOBAL_DELAY = int(os.getenv("DOWNLOAD_GLOBAL_MIN_DELAY", "1"))
 DELAY_KEY = "download:delay_lock"
 # ---------------------------
 # Redis connection
 # ---------------------------
 REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
 redis_client = redis.Redis.from_url(REDIS_URL)
 SEM_KEY = "download:active"  # semaphore counter
 # ======================================================
 # GLOBAL DELAY FUNCTIONS
 # ======================================================
 def wait_for_global_delay():
    """
    Block until no global delay lock exists.
    Prevents hammering the server too fast.
    """
    if GLOBAL_DELAY <= 0:
        return
    while redis_client.exists(DELAY_KEY):
        time.sleep(0.1)
 def set_global_delay():
    """
    After finishing a download (or skip),
    set a TTL lock so all workers wait a minimum time.
    """
    if GLOBAL_DELAY <= 0:
        return
    # SET key NX EX:
    # - only set if not existing
    # - expires automatically
    redis_client.set(DELAY_KEY, "1", nx=True, ex=GLOBAL_DELAY)
 # ======================================================
 # GLOBAL CONCURRENCY FUNCTIONS
 # ======================================================
 def acquire_global_slot(max_slots: int, retry_delay: float = 0.5):
    """
    GLOBAL semaphore with Redis.
    Atomic INCR. If limit exceeded, undo & wait.
    """
    while True:
        current = redis_client.incr(SEM_KEY)
        if current <= max_slots:
            return  # acquired OK
        redis_client.decr(SEM_KEY)
        time.sleep(retry_delay)
 def release_global_slot():
    """Release semaphore."""
    redis_client.decr(SEM_KEY)
 print(f">>> [CONFIG] Global concurrency = {MAX_CONCURRENCY}")
 print(f">>> [CONFIG] Global min delay = {GLOBAL_DELAY}s")
 print(
    f">>> [CONFIG] download retries = "
    f"max={MAX_RETRIES}, base={BASE_DELAY}, backoff={BACKOFF}, 429={DELAY_429}"
 )
 # ======================================================
 # CELERY TASK
 # ======================================================
@celery_app.task(
    bind=True,
    queue="download",
    ignore_result=False,
 )
 def download_chapter(self, chapter_num: int, chapter_url: str, base_path: str):
    """
    base_path komt uit pipeline.py
    Download wordt SKIPPED als het bestand al bestaat.
    """
    save_path = get_save_path(chapter_num, base_path)
    # ------------------------------------------------------------------
    # 1. SKIP IF EXISTS — maar WEL global delay zetten!
    # ------------------------------------------------------------------
    if os.path.exists(save_path):
        wait_for_global_delay()
        set_global_delay()
        log(f"[DL] SKIP chapter {chapter_num} (exists) → {save_path}")
        return {
            "chapter": chapter_num,
            "url": chapter_url,
            "html": None,
            "skipped": True,
            "path": save_path,
        }
    # ------------------------------------------------------------------
    # 2. GLOBAL DELAY — throttle downloads globally
    # ------------------------------------------------------------------
    wait_for_global_delay()
    # ------------------------------------------------------------------
    # 3. GLOBAL CONCURRENCY — only X downloads at the same time
    # ------------------------------------------------------------------
    acquire_global_slot(MAX_CONCURRENCY)
    log(f"[DL] ACQUIRED SLOT for chapter {chapter_num}")
    try:
        # ------------------------------------------------------------------
        # 4. DOWNLOAD LOGIC
        # ------------------------------------------------------------------
        log(f"[DL] Downloading chapter {chapter_num}: {chapter_url}")
        resp = requests.get(
            chapter_url,
            headers={"User-Agent": "Mozilla/5.0"},
@ -20,14 +155,45 @@ def download_chapter(self, chapter_num: int, chapter_url: str):
        resp.encoding = resp.apparent_encoding or "gb2312"
        html = resp.text
        log(f"[DL] OK {chapter_num}: {len(html)} bytes")
        return {
            "chapter": chapter_num,
            "url": chapter_url,
            "html": html,
            "skipped": False,
            "path": save_path,
        }
    except Exception as exc:
-        log(f"[DL] ERROR {chapter_url}: {exc}")
+        # ------------------------------------------------------------------
-        raise
+        # 5. RETRY LOGIC
        # ------------------------------------------------------------------
        attempt = self.request.retries
        delay = BASE_DELAY * (BACKOFF**attempt)
        if (
            hasattr(exc, "response")
            and getattr(exc.response, "status_code", None) == 429
        ):
            delay = DELAY_429 + delay
            log(
                f"[DL] 429 Too Many Requests → retry in {delay}s "
                f"(attempt {attempt}/{MAX_RETRIES})"
            )
            raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
        log(
            f"[DL] ERROR on {chapter_url}: {exc} → retry in {delay}s "
            f"(attempt {attempt}/{MAX_RETRIES})"
        )
        raise self.retry(exc=exc, countdown=delay, max_retries=MAX_RETRIES)
    finally:
        # ------------------------------------------------------------------
        # 6. ALWAYS set delay + release semaphore
        # ------------------------------------------------------------------
        set_global_delay()
        release_global_slot()
        log(f"[DL] RELEASED SLOT for chapter {chapter_num}")
--- a/bookscraper/scraper/tasks/parse_tasks.py
+++ b/bookscraper/scraper/tasks/parse_tasks.py
@ -10,24 +10,16 @@ print(">>> [IMPORT] parse_tasks.py loaded")
@celery_app.task(bind=True, queue="parse", ignore_result=False)
 def parse_chapter(self, download_result: dict, meta: dict):
    """
    download_result:
      {
        "chapter": int,
        "url": str,
        "html": str
      }
    meta:
      {
        "title": str,
        "author": str,
        "description": str
      }
    """
    # 1) SKIP mode
    if download_result.get("skipped"):
        chapter = download_result.get("chapter")
        log(f"[PARSE] SKIP chapter {chapter} (download skipped)")
        return download_result
    # 2) Normal mode
    chapter_num = download_result["chapter"]
-    url = download_result["url"]
+    chapter_url = download_result["url"]
    html = download_result["html"]
    log(f"[PARSE] Parsing chapter {chapter_num}")
@ -53,19 +45,20 @@ def parse_chapter(self, download_result: dict, meta: dict):
    raw = node.get_text() if node else soup.get_text()
    # replacements
    REPL = load_replacements()
    text = clean_text(raw, REPL)
-    # ---------------------------------------------------
+    # -----------------------------
-    # HEADER ONLY FOR CHAPTER 1
+    # FIXED: chapter 1 header = book URL
-    # ---------------------------------------------------
+    # -----------------------------
    if chapter_num == 1:
        book_url = meta.get("book_url") or meta.get("url") or "UNKNOWN"
        header = (
            f"{meta.get('title','')}\n"
            f"Author: {meta.get('author','')}\n"
            f"Description:\n{meta.get('description','')}\n"
-            f"URL: {url}\n" + "-" * 50 + "\n\n"
+            f"Book URL: {book_url}\n" + "-" * 50 + "\n\n"
        )
        text = header + text
@ -73,7 +66,7 @@ def parse_chapter(self, download_result: dict, meta: dict):
    return {
        "chapter": chapter_num,
-        "url": url,
+        "url": chapter_url,
        "text": text,
        "length": len(text),
    }
--- a/bookscraper/scraper/tasks/pipeline.py
+++ b/bookscraper/scraper/tasks/pipeline.py
@ -1,6 +1,16 @@
 # scraper/tasks/pipeline.py
 """
 Build the pipeline for a single chapter:
 download → parse → save
 This module must NOT import scraping.py or controllers,
 otherwise Celery will hit circular imports on worker startup.
 Only import task functions here.
 """
 from celery import chain
 from scraper.tasks.download_tasks import download_chapter
 from scraper.tasks.parse_tasks import parse_chapter
 from scraper.tasks.save_tasks import save_chapter
@ -10,12 +20,17 @@ def build_chapter_pipeline(
    chapter_number: int, chapter_url: str, base_path: str, meta: dict
 ):
    """
-    Build a download → parse → save pipeline for one chapter.
+    Construct a Celery chain for one chapter:
-    meta bevat:
+      1. download_chapter
-       title, author, description
+      2. parse_chapter
      3. save_chapter
    """
    return chain(
-        download_chapter.s(chapter_number, chapter_url),
+        # download_chapter needs ALL THREE arguments
-        parse_chapter.s(meta),  # ← METADATA DOORGEVEN
+        download_chapter.s(chapter_number, chapter_url, base_path),
        # parse_chapter gets the output of download_chapter + meta as extra arg
        parse_chapter.s(meta),
        # save_chapter needs base_path as extra arg
        save_chapter.s(base_path),
    )
--- a/bookscraper/scraper/tasks/save_tasks.py
+++ b/bookscraper/scraper/tasks/save_tasks.py
@ -4,12 +4,23 @@ print(">>> [IMPORT] save_tasks.py loaded")
 from celery import shared_task
 from logbus.publisher import log
 import os
 from scraper.utils import get_save_path
@shared_task(bind=True, queue="save", ignore_result=False)
 def save_chapter(self, parsed: dict, base_path: str):
    print(f">>> [save_tasks] save_chapter() CALLED for chapter {parsed.get('chapter')}")
    # ----------------------------
    # SKIP: If pipeline marked skip
    # ----------------------------
    if parsed.get("skipped"):
        chapter = parsed.get("chapter")
        path = parsed.get("path")
        log(f"[SAVE] SKIP chapter {chapter} (already exists) → {path}")
        print(f">>> [save_tasks] SKIPPED {path}")
        return {"chapter": chapter, "path": path, "skipped": True}
    try:
        chapter_number = parsed.get("chapter")
        url = parsed.get("url")
@ -20,8 +31,8 @@ def save_chapter(self, parsed: dict, base_path: str):
        os.makedirs(base_path, exist_ok=True)
-        filename = f"{chapter_number:05d}.txt"
+        # unified filename logic
-        path = os.path.join(base_path, filename)
+        path = get_save_path(chapter_number, base_path)
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
--- a/bookscraper/scraper/tasks/scraping.py
+++ b/bookscraper/scraper/tasks/scraping.py
@ -31,11 +31,15 @@ def start_scrape_book(self, url: str):
        log(f"[SCRAPING] DRY_RUN: limiting chapters to first {TEST_LIMIT}")
        chapters = chapters[:TEST_LIMIT]
    # ---------------------------------------------------
    # FIX: add book_url so parse_chapter has the real url
    # ---------------------------------------------------
    result = {
        "title": scraper.book_title,
        "author": scraper.book_author,
        "description": scraper.book_description,
        "cover": scraper.cover_url,
        "book_url": url,
        "chapters": [
            {"num": ch.number, "title": ch.title, "url": ch.url} for ch in chapters
        ],
--- a/bookscraper/scraper/tasks/utils.py
+++ b/bookscraper/scraper/tasks/utils.py
@ -1,57 +0,0 @@
 # scraper/utils.py
 import re
 import os
 from pathlib import Path
 # ------------------------------------------------------------
 # Load replacements from text_replacements.txt (optional file)
 # ------------------------------------------------------------
 def load_replacements(filepath="text_replacements.txt") -> dict:
    """
    Load key=value style replacements.
    Empty or missing file → return {}.
    """
    path = Path(filepath)
    if not path.exists():
        return {}
    repl = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "=" in line:
                key, val = line.split("=", 1)
                repl[key.strip()] = val.strip()
    return repl
 # ------------------------------------------------------------
 # Clean extracted HTML text
 # ------------------------------------------------------------
 def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
    Normalizes whitespace, removes junk, and applies replacements.
    repl_dict is optional → falls back to {}.
    """
    if repl_dict is None:
        repl_dict = {}
    txt = raw
    # Normalize CRLF
    txt = txt.replace("\r", "")
    # Collapse multiple blank lines
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    # Apply replacements
    for key, val in repl_dict.items():
        txt = txt.replace(key, val)
    # Strip excessive whitespace at edges
    return txt.strip()
--- a/bookscraper/scraper/utils.py
+++ b/bookscraper/scraper/utils.py
@ -1,36 +1,67 @@
-# scraper/utils.py
+import os
 import re
 from pathlib import Path
-def load_replacements(path="text_replacements.txt") -> dict:
+# ------------------------------------------------------------
 # Load replacements from text_replacements.txt (optional file)
 # ------------------------------------------------------------
 def load_replacements(filepath="text_replacements.txt") -> dict:
    """
-    Load key=value replacements from a simple text file.
+    Load key=value style replacements.
-    Lines beginning with # are ignored.
+    Empty or missing file → return {}.
    Lines starting with '#' are ignored.
    """
-    fp = Path(path)
+    path = Path(filepath)
-    if not fp.exists():
+
    if not path.exists():
        return {}
    repl = {}
    for line in fp.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
-        if "=" in line:
+    with open(path, "r", encoding="utf-8") as f:
-            k, v = line.split("=", 1)
+        for line in f:
-            repl[k.strip()] = v.strip()
+            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if "=" in line:
                key, val = line.split("=", 1)
                repl[key.strip()] = val.strip()
    return repl
-def clean_text(raw: str, repl_dict: dict) -> str:
+# ------------------------------------------------------------
 # Clean extracted HTML text
 # ------------------------------------------------------------
 def clean_text(raw: str, repl_dict: dict = None) -> str:
    """
-    Cleans text using user-defined replacements.
+    Normalize whitespace, remove junk, apply replacements.
    repl_dict is optional → {} if none provided.
    """
-    txt = raw
+    if repl_dict is None:
        repl_dict = {}
    txt = raw.replace("\r", "")  # normalize CRLF
-    for k, v in repl_dict.items():
+    # Collapse 3+ blank lines → max 1 empty line
-        txt = txt.replace(k, v)
+    txt = re.sub(r"\n{3,}", "\n\n", txt)
    # Apply replacements
    for key, val in repl_dict.items():
        txt = txt.replace(key, val)
    return txt.strip()
 # ------------------------------------------------------------
 # Determine save path for a chapter (shared by download & save)
 # ------------------------------------------------------------
 def get_save_path(chapter_num: int, base_path: str) -> str:
    """
    Returns the filesystem path where this chapter should be saved.
    Formats the filename as 0001.txt, 0002.txt, ...
    """
    filename = f"{chapter_num:04d}.txt"
    return os.path.join(base_path, filename)
--- a/bookscraper/text_replacements.txt
+++ b/bookscraper/text_replacements.txt
@ -20,8 +20,10 @@
 返回飘天文学网首页=
 永久地址：www.piaotia.com=
 www.piaotia.com=
 www.piaotia.com
 piaotia.com=
 piaotian.com=
 飘天文学
 www.piaotian.com=
 www.piaotian.net=
@ -54,6 +56,7 @@ Copyright ©=
 本小说来自互联网资源，如果侵犯您的权益请联系我们=
 本站立场无关=
 均由网友发表或上传=
 感谢各位书友的支持，您的支持就是我们最大的动力
 # ---------- COMMON NOISE ----------
 广告=