You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
53 lines
1.4 KiB
53 lines
1.4 KiB
# scraper/tasks/scraping.py
|
|
#
|
|
from celery_app import celery_app
|
|
from logbus.publisher import log
|
|
import os
|
|
|
|
from scraper.sites import BookSite
|
|
from scraper.book_scraper import BookScraper
|
|
from scraper.tasks.controller_tasks import launch_downloads
|
|
|
|
# Import-time marker: this runs at module level, so it prints when the module is imported.
print(">>> [IMPORT] scraping.py loaded")
|
|
|
|
|
|
@celery_app.task(bind=True, queue="scraping", ignore_result=False)
def start_scrape_book(self, url: str) -> dict:
    """Scrape a book's metadata and chapter list, then dispatch the downloads.

    Builds a ``BookScraper`` for ``url``, parses the book info, collects the
    chapter list and sends the resulting payload to the
    ``scraper.tasks.controller_tasks.launch_downloads`` task on the
    ``controller`` queue.

    Environment:
        DRY_RUN: when set to ``"1"``, only the first ``TEST_LIMIT`` chapters
            are kept.
        TEST_LIMIT: number of chapters kept in dry-run mode (default ``5``).
            Only read when ``DRY_RUN`` is enabled; negative values are
            treated as ``0``.

    Args:
        self: the bound task instance (``bind=True``); not used in the body.
        url: address of the book page handed to ``BookScraper``.

    Returns:
        A dict with the keys ``title``, ``author``, ``description``,
        ``cover`` and ``chapters``; each chapter is a dict with ``num``,
        ``title`` and ``url``.

    Raises:
        ValueError: in dry-run mode, if ``TEST_LIMIT`` is not an integer.
    """
    log(f"[SCRAPING] Start scraping for: {url}")

    site = BookSite()
    scraper = BookScraper(site, url)
    scraper.parse_book_info()

    chapters = scraper.get_chapter_list()
    full_count = len(chapters)

    dry_run = os.getenv("DRY_RUN", "0") == "1"
    if dry_run:
        # Parse the limit only when it is needed, so a malformed TEST_LIMIT
        # cannot break a regular (non dry-run) scrape.
        # Clamp at zero: a negative slice bound would drop chapters from the
        # end instead of keeping the first N.
        test_limit = max(0, int(os.getenv("TEST_LIMIT", "5")))
        log(f"[SCRAPING] DRY_RUN: limiting chapters to first {test_limit}")
        chapters = chapters[:test_limit]

    result = {
        "title": scraper.book_title,
        "author": scraper.book_author,
        "description": scraper.book_description,
        "cover": scraper.cover_url,
        "chapters": [
            {"num": ch.number, "title": ch.title, "url": ch.url} for ch in chapters
        ],
    }

    log(f"[SCRAPING] Completed scrape: {len(chapters)}/{full_count} chapters")

    # The controller task is addressed by name and routed to its own queue.
    celery_app.send_task(
        "scraper.tasks.controller_tasks.launch_downloads",
        args=[result],
        queue="controller",
    )

    return result
|