# ============================================================
# File: scraper/tasks/scraping.py
# Purpose: Scrape metadata + chapter list and initialise
#          Redis progress tracking + launch download controller
# ============================================================
# --- imports: stdlib / third-party / project, per PEP 8 -------------
import os

import redis

from celery_app import celery_app
from logbus.publisher import log
from scraper.abort import clear_abort  # no circular deps
from scraper.book_scraper import BookScraper
from scraper.sites import BookSite
from scraper.ui_log import reset_ui_logs

print(">>> [IMPORT] scraping.py loaded")

# Redis connection — same URL the Celery broker uses.
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)
@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str) -> dict:
    """Scrape book metadata + chapter list and prepare download tracking.

    Clears the UI log buffer, scrapes the book page, initialises the
    Redis progress/log keys for the run, and dispatches the download
    controller task on the ``controller`` queue.

    Args:
        url: Landing-page URL of the book to scrape.

    Returns:
        dict with ``book_id``, ``title``, ``author`` and the number of
        chapters queued for download.
    """
    # ------------------------------------------------------------
    # Clear UI log buffer at start of a new run
    # ------------------------------------------------------------
    reset_ui_logs()

    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # Book scrape
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # returns dict with metadata + chapters

    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ------------------------------------------------------------
    # DRY RUN: optionally truncate the chapter list for test runs
    # ------------------------------------------------------------
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    try:
        TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))
    except ValueError:
        # Fix: a malformed TEST_LIMIT env var used to crash the whole
        # task with ValueError; fall back to the documented default.
        TEST_LIMIT = 5

    if DRY_RUN:
        log(f"[SCRAPING] DRY_RUN: limiting chapters to {TEST_LIMIT}")
        chapters = chapters[:TEST_LIMIT]
        result["chapters"] = chapters

    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")

    # ------------------------------------------------------------
    # BOOK RUN ID (using title as ID — user requirement)
    # ------------------------------------------------------------
    # Fix: strip whitespace so a padded/blank title cannot yield an
    # empty or whitespace-only Redis key prefix.
    title = (result.get("title") or "").strip() or "UnknownBook"
    book_id = title  # the title *is* the run id

    result["book_id"] = book_id

    log(f"[SCRAPING] Assigned book_id = '{book_id}'")

    # ------------------------------------------------------------
    # RESET ABORT + INITIALISE PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_id)

    r.set(f"progress:{book_id}:total", len(chapters))
    r.set(f"progress:{book_id}:done", 0)
    r.delete(f"logs:{book_id}")  # clear old logs if any

    r.rpush(f"logs:{book_id}", f":: SCRAPING STARTED for {url}")
    r.rpush(f"logs:{book_id}", f":: Found {len(chapters)} chapters")

    # ------------------------------------------------------------
    # DISPATCH DOWNLOAD CONTROLLER
    # ------------------------------------------------------------
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[book_id, result],
        queue="controller",
    )

    log(f"[SCRAPING] Dispatched download controller for '{book_id}'")

    # Fix: report the resolved title (same fallback as book_id) instead
    # of the raw scraper value, which could be None while book_id is
    # "UnknownBook" — inconsistent for callers reading the result.
    return {
        "book_id": book_id,
        "title": title,
        "author": result.get("author"),
        "chapters": len(chapters),
    }