# ============================================================
# File: scraper/tasks/scraping.py
# Purpose:
#   Scrape ONLY metadata + chapter list.
#   Does NOT launch download controller anymore.
#   Controller decides when pipelines start.
# ============================================================

from celery_app import celery_app
from logbus.publisher import log
import os
import redis

from scraper.logger_decorators import logcall
from scraper.sites import BookSite
from scraper.book_scraper import BookScraper
from scraper.abort import clear_abort
from scraper.ui_log import reset_ui_logs

from scraper.services.init_service import InitService

print(">>> [IMPORT] scraping.py loaded")

# Redis connection (same DB as Celery broker)
REDIS_URL = os.getenv("REDIS_BROKER", "redis://redis:6379/0")
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)


@celery_app.task(
    bind=True,
    queue="scraping",
    ignore_result=False,
    name="scraper.tasks.scraping.start_scrape_book",
)
@logcall
def start_scrape_book(self, url: str):
    """
    Scrape metadata + the chapter list.

    DOES NOT start the download / pipeline controller;
    the controller_tasks.start_full_scrape() task calls this one.
    """
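
    # Minimal sketch of the intended call pattern on the controller side.
    # Assumption: `start_pipelines` is a placeholder for whatever
    # controller_tasks dispatches next; only start_full_scrape() is named
    # in this file.
    #
    #   from celery import chain
    #   chain(
    #       start_scrape_book.s(url),  # produces the result dict built below
    #       start_pipelines.s(),       # receives it as scrape_result
    #   ).apply_async()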

    # ------------------------------------------------------------
    # CLEAR UI LOG BUFFER
    # ------------------------------------------------------------
    reset_ui_logs()
    log(f"[SCRAPING] Start scraping for: {url}")

    # ------------------------------------------------------------
    # SCRAPE (old engine)
    # ------------------------------------------------------------
    site = BookSite()
    scraper = BookScraper(site, url)
    result = scraper.execute()  # → { title, author, chapters, cover_url, ... }

    chapters = result.get("chapters", [])
    full_count = len(chapters)

    # ------------------------------------------------------------
    # Compute unified book_idx
    # ------------------------------------------------------------
    book_idx = InitService.derive_book_id(url)
    result["book_idx"] = book_idx

    log(f"[SCRAPING] Assigned book_idx = {book_idx}")

    # ------------------------------------------------------------
    # DRY RUN TEST LIMIT
    # ------------------------------------------------------------
    DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
    TEST_LIMIT = int(os.getenv("TEST_LIMIT", "5"))

    if DRY_RUN:
        log(f"[SCRAPING] DRY RUN active → limiting chapters to {TEST_LIMIT}")
        result["chapters"] = chapters[:TEST_LIMIT]
    else:
        # Guarantee the key exists even if the scraper omitted it,
        # since result["chapters"] is read below.
        result["chapters"] = chapters

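    # Sketch: a dry run is toggled purely via the environment, e.g. a worker
    # started as `DRY_RUN=1 TEST_LIMIT=3 celery -A celery_app worker -Q scraping`.
    # (The exact worker invocation is an assumption; only the DRY_RUN and
    # TEST_LIMIT variable names are defined by this module.)
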
    # ------------------------------------------------------------
    # LOG RESULTS
    # ------------------------------------------------------------
    log(
        f"[SCRAPING] Completed scrape: "
        f"{len(result['chapters'])}/{full_count} chapters"
    )

    # ------------------------------------------------------------
    # RESET ABORT + INITIALIZE LEGACY PROGRESS
    # ------------------------------------------------------------
    clear_abort(book_idx)

    r.set(f"progress:{book_idx}:total", len(result["chapters"]))
    r.set(f"progress:{book_idx}:done", 0)

    r.delete(f"logs:{book_idx}")
    r.rpush(f"logs:{book_idx}", f":: SCRAPING STARTED for {url}")
    r.rpush(f"logs:{book_idx}", f":: Found {len(result['chapters'])} chapters")

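    # Sketch of the reader side (e.g. a UI poller on the same Redis DB);
    # hypothetical, not part of this module:
    #
    #   total = int(r.get(f"progress:{book_idx}:total") or 0)
    #   done = int(r.get(f"progress:{book_idx}:done") or 0)
    #   lines = r.lrange(f"logs:{book_idx}", 0, -1)
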
    # ------------------------------------------------------------
    # IMPORTANT: DO NOT DISPATCH any pipelines here.
    # The controller will receive scrape_result and continue.
    # ------------------------------------------------------------
    return result
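

# ------------------------------------------------------------
# Example (sketch): enqueueing this task by hand. Assumes a running
# broker and a worker consuming the "scraping" queue; the URL below
# is a placeholder, not a real target site.
# ------------------------------------------------------------
if __name__ == "__main__":
    async_result = start_scrape_book.delay("https://example.com/book/123")
    print(f">>> queued start_scrape_book: {async_result.id}")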