You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
kmftools/bookscraper/scraper/tasks/scraping.py

98 lines
3.4 KiB

# ============================================================
# File: scraper/tasks/scraping.py
# Purpose: Scrape metadata + chapter list and initialise
# Redis progress tracking + launch download controller
# ============================================================
from celery_app import celery_app
from logbus.publisher import log
import os
import redis
from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort # no circular deps
from scraper.ui_log import reset_ui_logs # <-- NEW IMPORT
# Import-time trace: this line runs when the module is imported.
print(">>> [IMPORT] scraping.py loaded")
# Redis connection (same as Celery broker).
# The URL comes from the REDIS_BROKER environment variable and falls back to
# the default below when that variable is unset.
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
# decode_responses=True makes the client return str values instead of bytes.
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str):
    """Scrape a book's metadata and chapter list, then start its download run.

    Steps, in order:

    1. Clear the UI log buffer so the new run starts with an empty log.
    2. Run ``BookScraper`` against ``url``.
    3. If the ``DRY_RUN`` environment variable is ``"1"``, keep only the
       first ``TEST_LIMIT`` chapters (``TEST_LIMIT`` defaults to 5).
    4. Use the scraped title as ``book_id`` (``"UnknownBook"`` when the
       title is missing or empty).
    5. Clear the abort flag and reset the Redis progress counters and log
       list for that ``book_id``.
    6. Dispatch ``scraper.tasks.controller_tasks.launch_downloads`` on the
       ``controller`` queue.

    Args:
        url: Address of the book page to scrape.

    Returns:
        A dict with ``book_id``, ``title``, ``author`` and ``chapters``
        (the number of chapters handed to the download controller).

    Raises:
        ValueError: If ``DRY_RUN`` is enabled and ``TEST_LIMIT`` is not an
            integer.
    """
    # Start every run with an empty UI log buffer.
    reset_ui_logs()
    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # Book scrape
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # returns dict with metadata + chapters

    # `or []` also covers a "chapters" key that is present but None.
    chapters = result.get("chapters") or []
    full_count = len(chapters)

    # ------------------------------------------------------------
    # DRY RUN
    # ------------------------------------------------------------
    if os.getenv("DRY_RUN", "0") == "1":
        # Parse TEST_LIMIT only when it is actually used, so a malformed
        # value cannot break a normal (non-dry) run.
        test_limit = int(os.getenv("TEST_LIMIT", "5"))
        log(f"[SCRAPING] DRY_RUN: limiting chapters to {test_limit}")
        chapters = chapters[:test_limit]
        result["chapters"] = chapters

    chapter_count = len(chapters)
    log(f"[SCRAPING] Completed scrape: {chapter_count}/{full_count} chapters")

    # ------------------------------------------------------------
    # BOOK RUN ID (using title as ID)
    # ------------------------------------------------------------
    title = result.get("title") or "UnknownBook"
    book_id = title  # user requirement
    result["book_id"] = book_id
    log(f"[SCRAPING] Assigned book_id = '{book_id}'")

    # ------------------------------------------------------------
    # RESET ABORT + INITIALISE PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_id)

    # Send the whole reset as one pipeline: a single round trip, and
    # (redis-py pipelines are transactional by default) readers never see
    # the new total paired with a stale "done" counter or old log lines.
    logs_key = f"logs:{book_id}"
    with r.pipeline() as pipe:
        pipe.set(f"progress:{book_id}:total", chapter_count)
        pipe.set(f"progress:{book_id}:done", 0)
        pipe.delete(logs_key)  # clear old logs if any
        pipe.rpush(logs_key, f":: SCRAPING STARTED for {url}")
        pipe.rpush(logs_key, f":: Found {chapter_count} chapters")
        pipe.execute()

    # ------------------------------------------------------------
    # DISPATCH DOWNLOAD CONTROLLER
    # ------------------------------------------------------------
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[book_id, result],
        queue="controller",
    )
    log(f"[SCRAPING] Dispatched download controller for '{book_id}'")

    return {
        "book_id": book_id,
        "title": result.get("title"),
        "author": result.get("author"),
        "chapters": chapter_count,
    }