From 3ed85d08e328429eb6050fd3f53d1b89f387d45a Mon Sep 17 00:00:00 2001
From: "peter.fong"
Date: Sat, 29 Nov 2025 20:25:12 +0000
Subject: [PATCH] bookscraper single thread

---
 bookscraper/Dockerfile                  |  20 +
 bookscraper/README.md                   | 125 +++++
 bookscraper/app.py                      |  96 ++--
 bookscraper/docker-compose.yml          |  25 +
 .../output/合成召唤/piaotian/cover.jpg   | Bin 11043 -> 0 bytes
 bookscraper/scraper/book_scraper.py     | 430 +++++++++++-------
 bookscraper/scraper/logger.py           |  64 ++-
 bookscraper/scraper/sites.py            |   2 +-
 bookscraper/templates/index.html        |  40 +-
 9 files changed, 588 insertions(+), 214 deletions(-)
 create mode 100644 bookscraper/Dockerfile
 create mode 100644 bookscraper/README.md
 create mode 100644 bookscraper/docker-compose.yml
 delete mode 100644 bookscraper/output/合成召唤/piaotian/cover.jpg

diff --git a/bookscraper/Dockerfile b/bookscraper/Dockerfile
new file mode 100644
index 0000000..5c0c780
--- /dev/null
+++ b/bookscraper/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+# Pillow dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libjpeg62-turbo-dev zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the full app (as it currently is)
+COPY . .
+
+# Expose the Flask port
+EXPOSE 5000
+
+# Use app.py as the entrypoint
+CMD ["python", "app.py"]
diff --git a/bookscraper/README.md b/bookscraper/README.md
new file mode 100644
index 0000000..b3f96d4
--- /dev/null
+++ b/bookscraper/README.md
@@ -0,0 +1,125 @@
# 📚 BookScraper — Web UI + Docker + Live Log Streaming

BookScraper is a modern, fully automated scraper for Chinese webnovels
such as **Piaotian / Piaotia**.
The project combines a powerful scraping engine with a pleasant web interface.

---

# 🔍 What does this project do?

BookScraper consists of four main parts:

---

## 1. 🧠 BookScraper Engine (Python)

This is the core of the project.
The engine:

- Reads a book's basic information (title, author, cover)
- Finds all chapter links
- Downloads every chapter with:
  - a **retry system**
  - **anti-429 backoff**
    (waits `backoff * attempt + 1` seconds)
  - detection of empty chapters → automatic retry
- Applies text replacements (via `replacements.txt`)
- Stores chapters in order
- Automatically splits long books into volumes (`v1/`, `v2/`, `v3/`…)

The engine is **resilient against rate limiting** by Piaotian and similar sites
and uses a **throttle (`MAX_DOWNLOADS_PER_SEC`)** to avoid blocks.

---

## 2. 🌐 Flask Web Interface (UI)

The web interface provides:

- An input field for the book URL
- A button: **Run Scraper**
- Live feedback via **server-sent events (SSE)**

While scraping, real-time updates appear, such as:

```
[DEBUG] GET chapter 1123
[DEBUG] HTTP 429 → retry sleep 4.0s
[DEBUG] Saved chapter: output/xxx/01123_章名.txt
```

This makes it feel as if the scraper is working "live".

---

## 3. 📡 Live Logging (SSE)

The logger captures all BookScraper messages and streams them
via `/stream` to the web interface.
This makes it ideal for keeping an eye on a scrape without a console.

---

## 4. 🔧 Configuration via `.env`

To keep the project flexible, everything is configured through `.env`:

- Throttle (`MAX_DOWNLOADS_PER_SEC`)
- Debug mode (`FLASK_DEBUG`)
- DRY_RUN (only the first chapters)
- Volume size
- Host & port

The `.env` is loaded automatically by Docker Compose and by Flask.
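For reference, a minimal `.env` could look like this — the variable names are the ones read by `app.py` and `book_scraper.py` in this patch; the values are only suggested defaults:

```env
# Flask
FLASK_DEBUG=0
HOST=0.0.0.0
PORT=5000

# Scraper behaviour
DRY_RUN=1                # 1 = only fetch the first TEST_LIMIT chapters
TEST_LIMIT=10
MAX_DOWNLOADS_PER_SEC=1  # throttle for page requests
CHAPTER_DELAY=2          # extra pause (seconds) after each saved chapter
MAX_VOL_SIZE=200         # chapters per volume folder (v1/, v2/, …)
```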
---

# 📦 Project structure

```
bookscraper/
│
├── scraper/
│   ├── book_scraper.py   # the scraper engine
│   ├── logger.py         # SSE logger
│   ├── sites.py          # site configuration (selectors etc.)
│   ├── utils.py          # helpers
│   └── ...
│
├── templates/
│   └── index.html        # UI
│
├── output/               # book results
│
├── app.py                # Flask web server + endpoints
├── replacements.txt      # text replacements
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
└── .env
```

---

# ▶️ Running the project manually (WITHOUT Docker)

Make sure the dependencies are installed:

```bash
pip install -r requirements.txt
```

Start the Flask server:

```bash
python app.py
```

Then open:
👉 http://localhost:5000

---

# Docker build (without Compose)

Build and run manually:

```bash
docker build -t bookscraper .
docker run -p 5000:5000 --env-file .env bookscraper
```

To keep the scraped books outside the container, mount the output folder:

```bash
docker run \
    -p 5000:5000 \
    --env-file .env \
    -v $(pwd)/output:/app/output \
    bookscraper
```
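With the compose file added in this patch, the same thing works via Docker Compose (service name `bookscraper`, host port 5050 as mapped in `docker-compose.yml`):

```bash
docker compose up --build -d
docker compose logs -f bookscraper
```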

diff --git a/bookscraper/app.py b/bookscraper/app.py
index 4f6d9a6..ed983c6 100644
--- a/bookscraper/app.py
+++ b/bookscraper/app.py
@@ -1,53 +1,71 @@
-from flask import Flask, request, render_template_string
-from scraper.book_scraper import BookScraper
-from scraper.sites import BookSite
-import sys
+# app.py
+from flask import Flask, request, Response, render_template
+import time
+import queue
 import os
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from scraper.book_scraper import BookScraper
+from scraper.sites import BookSite
+from scraper.logger import add_listener, remove_listener, LOG_BUFFER
 
 app = Flask(__name__)
 
-# --- GET: show the form ---
-@app.route("/", methods=["GET"])
+@app.route("/")
 def index():
-    return render_template_string("""
-        <html>
-        <body>
-            <h2>BookScraper</h2>
-            <form method="POST">
-                <input type="text" name="url" size="80">
-                <button type="submit">Run Scraper</button>
-            </form>
-        </body>
-        </html>
-    """)
-
-
-# --- POST: run the scraper ---
-@app.route("/", methods=["POST"])
-def run_scraper():
-    url = request.form.get("url")
+    return render_template("index.html")
+
+# ----------------------------------------------------------
+# RUN SCRAPER
+# ----------------------------------------------------------
+
+@app.route("/run", methods=["POST"])
+def run_scraper():
+    data = request.json
     site = BookSite()
-    scraper = BookScraper(site, url)
+    scraper = BookScraper(site, data["url"])
     result = scraper.execute()
 
-    return render_template_string("""

-        <html>
-        <body>
-            <h2>Scrape result: {{title}}</h2>
-
-            <h3>Debug output:</h3>
-
-            <pre>
-{{debug}}
-            </pre>
-
-            <a href="/">Terug</a>
-        </body>
-        </html>
-    """, title=result["title"], debug=result["debug"])
+    return {
+        "title": result["title"],
+        "buffer": LOG_BUFFER.getvalue()
+    }
+
+# ----------------------------------------------------------
+# REALTIME LOG STREAM (SSE)
+# ----------------------------------------------------------
+
+@app.route("/stream")
+def stream():
+
+    def event_stream():
+        q = queue.Queue()
+
+        # push log lines from BookScraper to this SSE client
+        def listener(line):
+            q.put(line)
+
+        add_listener(listener)
+
+        try:
+            while True:
+                msg = q.get()  # blocks until a log line arrives
+                yield f"data: {msg}\n\n"
+        except GeneratorExit:
+            pass
+        finally:
+            remove_listener(listener)
+
+    return Response(event_stream(), mimetype="text/event-stream")
+
+
+# ----------------------------------------------------------
 
 if __name__ == "__main__":
-    app.run(debug=True)
+    debug = os.getenv("FLASK_DEBUG", "0") == "1"
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", "5000"))
+
+    app.run(debug=debug, host=host, port=port)
diff --git a/bookscraper/docker-compose.yml b/bookscraper/docker-compose.yml
new file mode 100644
index 0000000..ade0a70
--- /dev/null
+++ b/bookscraper/docker-compose.yml
@@ -0,0 +1,25 @@
+version: "3.9"
+
+services:
+  bookscraper:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: bookscraper
+    ports:
+      - "5050:5000"
+
+    # Mount everything the way it already works locally
+    volumes:
+      - .:/app                                # full project folder
+      - /Users/peter/Desktop/books:/app/output
+
+    # The existing .env is loaded automatically by Docker Compose
+    env_file:
+      - .env
+
+    # Make sure Flask does NOT go into debug mode (the code decides this)
+    environment:
+      FLASK_ENV: "production"
+
+    restart: unless-stopped
diff --git a/bookscraper/output/合成召唤/piaotian/cover.jpg b/bookscraper/output/合成召唤/piaotian/cover.jpg
deleted file mode 100644
index 733afb469ca947d06c8b260784213d2628e2c738..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11043
[11043 bytes of base85-encoded image data omitted]

diff --git a/bookscraper/scraper/book_scraper.py b/bookscraper/scraper/book_scraper.py
index 56b4f25..83b0348 100644
--- a/bookscraper/scraper/book_scraper.py
+++ b/bookscraper/scraper/book_scraper.py
@@ -6,18 +6,14 @@ from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 from PIL import Image
 from io import BytesIO
-from dotenv import load_dotenv
 
-from scraper.logger import setup_logger, LOG_BUFFER
+from scraper.logger import log_debug
 from scraper.utils import clean_text, load_replacements
 
-load_dotenv()
-logger = setup_logger()
-
 
 class Chapter:
-    def __init__(self, number, title, url):
-        self.number = number
+    def __init__(self, num, title, url):
+        self.number = num
         self.title = title
         self.url = url
         self.text = ""
@@ -34,88 +30,135 @@ class BookScraper:
         self.cover_url = ""
         self.chapters = []
 
-        self.chapter_base = None
         self.base_path = None
+        self.chapter_base = None
+
+        # ENV
+        self.DRY_RUN = os.getenv("DRY_RUN", "1") == "1"
+        self.TEST_LIMIT = int(os.getenv("TEST_LIMIT", "10"))
+        self.MAX_DL = float(os.getenv("MAX_DOWNLOADS_PER_SEC", "1"))
+        self.min_delay = 1.0 / self.MAX_DL if self.MAX_DL > 0 else 1.0
+        self._last_download_time = 0
+
+        # replacements.txt
+        fp = os.path.join(os.getcwd(), "replacements.txt")
+        extra = load_replacements(fp)
+        self.site.replacements.update(extra)
+
+        self.start_time = None
+        self.total_chapters = 0
+        self.volume_dirs = {}
+
+    # ------------------------------------------------------------
+    # RATE LIMITER
+    # ------------------------------------------------------------
 
-        # ENV settings
-        self.DRY_RUN = os.getenv("DRY_RUN", "0") == "1"
-        self.TEST_CHAPTER_LIMIT = int(os.getenv("TEST_CHAPTER_LIMIT", "10"))
-        self.MAX_VOL_SIZE = int(os.getenv("MAX_VOL_SIZE", "1500"))
-        self.MAX_DL_PER_SEC = int(os.getenv("MAX_DL_PER_SEC", "2"))
+    def throttle(self):
+        now = time.time()
+        elapsed = now - self._last_download_time
 
-        # Load text replacements
-        self.replacements = load_replacements("replacements.txt")
+        if elapsed < self.min_delay:
+            time.sleep(self.min_delay - elapsed)
 
-    # -----------------------------------------------------
+        self._last_download_time = time.time()
+
+    # ------------------------------------------------------------
     def execute(self):
-        LOG_BUFFER.seek(0)
-        LOG_BUFFER.truncate(0)
+        log_debug(f"Starting scraper for {self.url}")
 
-        logger.debug("Starting scraper for %s", self.url)
-        soup = self.get_document(self.url)
+        self.start_time = time.time()
+        soup = self.get_doc_with_retry(self.url)
 
         self.parse_title(soup)
         self.parse_author(soup)
         self.parse_description(soup)
         self.parse_cover(soup)
+
         self.prepare_output_folder()
 
         chapter_page = self.get_chapter_page(soup)
         self.parse_chapter_links(chapter_page)
+        self.prepare_volume_folders()
 
         if self.DRY_RUN:
-            logger.debug(
-                "DRY RUN → downloading only first %s chapters", self.TEST_CHAPTER_LIMIT)
-            self.get_some_chapters(self.TEST_CHAPTER_LIMIT)
+            self.download_some(self.TEST_LIMIT)
         else:
-            self.get_all_chapters()
-            self.split_into_volumes()
+            self.download_all()
+
+        return {"title": self.book_title}
+
+    # ------------------------------------------------------------
+    # HTTP GET WITH RETRIES + HARD 429 COOLDOWN WITH COUNTDOWN
+    # ------------------------------------------------------------
+    def get_doc_with_retry(self, url):
+        attempt = 1
+
+        while True:
+            self.throttle()
+            log_debug(f"GET {url} (attempt {attempt})")
+
+            try:
+                resp = requests.get(
+                    url,
+                    headers={"User-Agent": "Mozilla/5.0"},
+                    timeout=10,
+                )
+            except Exception as e:
+                log_debug(f"Network error {e} → retry in {attempt + 1}s")
+                time.sleep(attempt + 1)
+                attempt += 1
+                continue
 
-        return {
-            "title": self.book_title,
-            "debug": LOG_BUFFER.getvalue()
-        }
+            code = resp.status_code
+            log_debug(f"HTTP {code} for {url}")
+
+            # 429 → hard cooldown with countdown
+            if code == 429:
+                cooldown = 60
+                log_debug(f"429 detected — cooldown {cooldown}s")
+                for i in range(cooldown, 0, -1):
+                    log_debug(f"429 cooldown… {i}s remaining")
+                    time.sleep(1)
+                attempt += 1
+                continue
 
-    # -----------------------------------------------------
-    # NETWORK
-    # -----------------------------------------------------
-    def get_document(self, url):
-        logger.debug("GET %s", url)
-        time.sleep(1 / max(1, self.MAX_DL_PER_SEC))
+            # recoverable
+            if code in (403, 500):
+                wait = min(5 * attempt, 30)
+                log_debug(f"HTTP {code} → retry in {wait}s")
+                time.sleep(wait)
+                attempt += 1
+                continue
 
-        resp = requests.get(
-            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
-        resp.encoding = self.site.encoding
+            if code == 200:
+                resp.encoding = self.site.encoding
+                return BeautifulSoup(resp.text, "lxml")
 
-        logger.debug("HTTP %s for %s", resp.status_code, url)
-        return BeautifulSoup(resp.text, "lxml")
+            # unexpected
+            wait = attempt + 1
+            log_debug(f"Unexpected HTTP {code} → sleep {wait}s")
+            time.sleep(wait)
+            attempt += 1
 
-    # -----------------------------------------------------
-    # BASIC PARSERS (piaotia structure)
-    # -----------------------------------------------------
+    # ------------------------------------------------------------
     def parse_title(self, soup):
         h1 = soup.find("h1")
-        if h1:
-            self.book_title = h1.get_text(strip=True)
-        else:
-            self.book_title = "UnknownTitle"
-        logger.debug("Book title: %s", self.book_title)
+        self.book_title = h1.get_text(strip=True) if h1 else "UnknownTitle"
+        log_debug(f"Book title = {self.book_title}")
 
     def parse_author(self, soup):
-        td = soup.find("td", string=lambda t: t and "作" in t and "者" in t)
-        if td:
-            raw = td.get_text(strip=True)
-            if ":" in raw:
-                self.book_author = raw.split(":", 1)[1].strip()
-            else:
-                self.book_author = "UnknownAuthor"
-        else:
-            self.book_author = "UnknownAuthor"
"UnknownAuthor" - logger.debug("Book author: %s", self.book_author) + td = soup.find("td", string=lambda t: t and "作" in t) + self.book_author = ( + td.get_text(strip=True).split(":")[1] + if td and ":" in td.get_text() + else "UnknownAuthor" + ) + log_debug(f"Book author = {self.book_author}") def parse_description(self, soup): span = soup.find("span", string=lambda t: t and "内容简介" in t) if not span: + log_debug("No description found") self.book_description = "" return @@ -123,113 +166,210 @@ class BookScraper: for sib in span.next_siblings: if getattr(sib, "name", None) == "span": break - txt = sib.get_text(strip=True) if not isinstance( - sib, str) else sib.strip() - if txt: - parts.append(txt) + text = ( + sib.get_text(strip=True) + if hasattr(sib, "get_text") + else str(sib).strip() + ) + if text: + parts.append(text) self.book_description = "\n".join(parts) - logger.debug("Description parsed (%s chars)", - len(self.book_description)) + log_debug(f"Description length = {len(self.book_description)}") + # ------------------------------------------------------------ def parse_cover(self, soup): - selector = ( - "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table " - "> tr:nth-of-type(4) > td:nth-of-type(1) > table > tr:nth-of-type(1) " - "> td:nth-of-type(2) > a:nth-of-type(1) > img" - ) - img = soup.select_one(selector) - if img: - self.cover_url = urljoin(self.site.root, img.get("src")) - else: - logger.debug("Cover not found!") - logger.debug("Cover URL = %s", self.cover_url) + cover = soup.find( + "img", src=lambda v: v and "files/article/image" in v) + if not cover: + log_debug("Cover not found") + return - # ----------------------------------------------------- + self.cover_url = urljoin(self.site.root, cover.get("src")) + log_debug(f"Cover URL = {self.cover_url}") + + # ------------------------------------------------------------ def prepare_output_folder(self): - output_root = os.getenv("OUTPUT_DIR", "./output") - self.base_path = Path(output_root) / self.book_title / self.site.name + self.base_path = Path("output") / self.book_title / self.site.name self.base_path.mkdir(parents=True, exist_ok=True) - logger.debug("Output directory: %s", self.base_path) if self.cover_url: - self.save_image(self.cover_url, self.base_path / "cover.jpg") + self.download_cover() + + def download_cover(self): + log_debug(f"Downloading cover: {self.cover_url}") - def save_image(self, url, path): - logger.debug("Downloading cover: %s", url) resp = requests.get( - url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) - if resp.status_code == 200: + self.cover_url, + headers={"User-Agent": "Mozilla/5.0"}, + timeout=10, + ) + + if resp.status_code != 200: + return + + if "html" in resp.headers.get("Content-Type", ""): + return + + try: img = Image.open(BytesIO(resp.content)) - img.save(path) - logger.debug("Cover saved to %s", path) + except: + return + + img.save(self.base_path / "cover.jpg") + log_debug("Cover saved") - # ----------------------------------------------------- - # CHAPTER PAGE - # ----------------------------------------------------- + # ------------------------------------------------------------ def get_chapter_page(self, soup): node = soup.select_one( - "html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table") - link = node.select_one("a") - href = link.get("href") - chapter_url = urljoin(self.site.root, href) - - parsed = urlparse(chapter_url) - base = parsed.path.rsplit("/", 1)[0] + "/" - self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{base}" + 
"html > body > div:nth-of-type(6) > div:nth-of-type(2) > div > table" + ) + href = node.select_one("a").get("href") + url = urljoin(self.site.root, href) - logger.debug("Chapter index URL = %s", chapter_url) - logger.debug("CHAPTER_BASE = %s", self.chapter_base) + parsed = urlparse(url) + bp = parsed.path.rsplit("/", 1)[0] + "/" + self.chapter_base = f"{parsed.scheme}://{parsed.netloc}{bp}" - return self.get_document(chapter_url) + return self.get_doc_with_retry(url) + # ------------------------------------------------------------ def parse_chapter_links(self, soup): - container = soup.select_one("div.centent") - links = container.select("ul li a[href]") + cont = soup.select_one(self.site.chapter_list_selector) + items = cont.select("ul li a[href]") - for i, a in enumerate(links, 1): + self.chapters = [] + idx = 1 + for a in items: href = a.get("href") if not href.endswith(".html"): continue - - abs_url = urljoin(self.chapter_base, href) title = a.get_text(strip=True) - self.chapters.append(Chapter(i, title, abs_url)) + full = urljoin(self.chapter_base, href) + self.chapters.append(Chapter(idx, title, full)) + idx += 1 + + self.total_chapters = len(self.chapters) + log_debug(f"Found {self.total_chapters} chapters") + + # ------------------------------------------------------------ + def prepare_volume_folders(self): + max_size = int(os.getenv("MAX_VOL_SIZE", "200")) + num_vols = (self.total_chapters + max_size - 1) // max_size - logger.debug("Total chapters: %s", len(self.chapters)) + for v in range(1, num_vols + 1): + d = self.base_path / f"v{v}" + d.mkdir(parents=True, exist_ok=True) + self.volume_dirs[v] = d - # ----------------------------------------------------- - # DOWNLOAD CHAPTERS - # ----------------------------------------------------- - def get_all_chapters(self): + # ------------------------------------------------------------ + def download_all(self): for ch in self.chapters: - ch.text = self.fetch_chapter(ch) - logger.debug("CH %s length = %s", ch.number, len(ch.text)) + self.download_chapter(ch) - def get_some_chapters(self, limit): + def download_some(self, limit): for ch in self.chapters[:limit]: - ch.text = self.fetch_chapter(ch) - filename = self.base_path / f"{ch.number:05d}_{ch.title}.txt" - filename.write_text(ch.text, encoding="utf-8") - logger.debug("Saved test chapter: %s", filename) + self.download_chapter(ch) + + # ------------------------------------------------------------ + def download_chapter(self, ch): + # Determine volume + filename + max_size = int(os.getenv("MAX_VOL_SIZE", "200")) + volume = ((ch.number - 1) // max_size) + 1 + vdir = self.volume_dirs.get(volume, self.base_path) + + expected_name = f"{ch.number:05d}_{ch.title}.txt" + fname = vdir / expected_name + expected_full_path = str(fname.resolve()) + + # STRICT SKIP CHECK + if fname.exists() and fname.is_file(): + actual_size = fname.stat().st_size + + # correct name? 
+            if fname.name == expected_name:
+                expected_dir = str(vdir.resolve())
+                actual_dir = str(fname.parent.resolve())
+
+                if expected_dir == actual_dir:
+                    if actual_size > 300:
+                        log_debug(
+                            f"Skip chapter {ch.number}/{self.total_chapters}: already exists\n"
+                            f"  Path: {expected_full_path}\n"
+                            f"  Size: {actual_size} bytes"
+                        )
+                        return
+                    else:
+                        log_debug(
+                            f"Existing file too small ({actual_size} bytes), redownloading: {expected_full_path}"
+                        )
+                else:
+                    log_debug(
+                        f"Directory mismatch for chapter {ch.number}, redownloading"
+                    )
+            else:
+                log_debug(
+                    f"Filename mismatch for chapter {ch.number}, redownloading\n"
+                    f"  Expected: {expected_name}\n"
+                    f"  Found: {fname.name}"
+                )
+
+        # PROGRESS INFO
+        percent = (ch.number / self.total_chapters) * 100
+        elapsed = time.time() - self.start_time
+        avg_time = elapsed / max(ch.number - 1, 1)
+        remaining = self.total_chapters - ch.number
+        eta_seconds = max(0, remaining * avg_time)
+
+        eta_min = int(eta_seconds // 60)
+        eta_sec = int(eta_seconds % 60)
+
+        log_debug(
+            f"Fetching chapter {ch.number}/{self.total_chapters} "
+            f"({percent:.2f}%, ETA {eta_min}m {eta_sec}s): "
+            f"{ch.title}"
+        )
+
+        # RETRY EMPTY CONTENT
+        attempt = 1
+        while True:
+            soup = self.get_doc_with_retry(ch.url)
+            text = self.parse_chapter_text(soup)
 
-    def fetch_chapter(self, ch):
-        soup = self.get_document(ch.url)
-        text = self.parse_chapter_text(soup)
-        return clean_text(text, self.replacements)
+            if text.strip():
+                ch.text = text
+                break
+
+            wait = min(10 + attempt, 30)
+            log_debug(f"Empty chapter → retry in {wait}s")
+            time.sleep(wait)
+            attempt += 1
+
+        fname.write_text(ch.text, encoding="utf-8")
+        log_debug(f"Saved chapter to v{volume}: {fname}")
 
+        chapter_delay = float(os.getenv("CHAPTER_DELAY", "2"))
+        log_debug(f"Throttling {chapter_delay}s before next chapter")
+        time.sleep(chapter_delay)
+
+    # ------------------------------------------------------------
     def parse_chapter_text(self, soup):
         body = soup.body
+        if not body:
+            return ""
+
         h1 = body.find("h1")
+        if not h1:
+            return ""
 
         parts = []
         collecting = False
 
         for sib in h1.next_siblings:
-            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
-                break
             if getattr(sib, "get", None) and sib.get("class") == ["toplink"]:
                 continue
+            if getattr(sib, "get", None) and sib.get("class") == ["bottomlink"]:
+                break
 
             if getattr(sib, "name", None) in ["script", "style"]:
                 continue
@@ -238,32 +378,14 @@ class BookScraper:
                 collecting = True
                 continue
 
-            txt = sib.strip() if isinstance(sib, str) else sib.get_text("\n", strip=True)
-            if txt:
-                parts.append(txt)
-
-        return "\n".join(parts).strip()
-
-    # -----------------------------------------------------
-    # SPLIT VOLUMES
-    # -----------------------------------------------------
-    def split_into_volumes(self):
-        logger.debug(
-            "Splitting into volumes (max %s chapters per volume)", self.MAX_VOL_SIZE)
-
-        chapters = len(self.chapters)
-        volume = 1
-        index = 0
-
-        while index < chapters:
-            chunk = self.chapters[index:index + self.MAX_VOL_SIZE]
-            volume_dir = self.base_path / f"v{volume}"
-            volume_dir.mkdir(exist_ok=True)
-
-            for ch in chunk:
-                filename = volume_dir / f"{ch.number:05d}_{ch.title}.txt"
-                filename.write_text(ch.text, encoding="utf-8")
-
-            logger.debug("Volume %s saved (%s chapters)", volume, len(chunk))
-            volume += 1
-            index += self.MAX_VOL_SIZE
+            text = (
+                sib.get_text("\n", strip=True)
+                if hasattr(sib, "get_text")
+                else str(sib).strip()
+            )
+            if text:
+                parts.append(text)
+
+        raw = "\n".join(parts)
+        raw = clean_text(raw, self.site.replacements)
+        return raw.strip()
diff --git a/bookscraper/scraper/logger.py b/bookscraper/scraper/logger.py
index f70d0d5..e0f28f1 100644
--- a/bookscraper/scraper/logger.py
+++ b/bookscraper/scraper/logger.py
@@ -2,26 +2,72 @@
 import logging
 from io import StringIO
 
-# In-memory buffer returned to web UI
+# In-memory buffer (for the final result)
 LOG_BUFFER = StringIO()
 
+# List of listener callbacks (SSE clients)
+LISTENERS = []
+
+
+def add_listener(callback):
+    """Register an SSE listener callback."""
+    LISTENERS.append(callback)
+
+
+def remove_listener(callback):
+    """Remove an SSE listener (on disconnect)."""
+    if callback in LISTENERS:
+        LISTENERS.remove(callback)
+
+
+def broadcast(line):
+    """Send a log line to all listeners."""
+    for cb in LISTENERS[:]:
+        try:
+            cb(line)
+        except Exception:
+            LISTENERS.remove(cb)
+
 
 def setup_logger():
+    """Create a logger that writes to the console, the buffer and SSE."""
     logger = logging.getLogger("bookscraper")
     logger.setLevel(logging.DEBUG)
-    logger.handlers = []  # prevent duplicate handlers on reload
+    logger.handlers = []
 
-    # Console handler
+    # formatter
+    fmt = logging.Formatter("[%(levelname)s] %(message)s")
+
+    # console handler
     ch = logging.StreamHandler()
     ch.setLevel(logging.DEBUG)
-    ch.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+    ch.setFormatter(fmt)
+
+    # buffer handler
+    bh = logging.StreamHandler(LOG_BUFFER)
+    bh.setLevel(logging.DEBUG)
+    bh.setFormatter(fmt)
 
-    # Buffer handler for returning to UI
-    mh = logging.StreamHandler(LOG_BUFFER)
-    mh.setLevel(logging.DEBUG)
-    mh.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+    # SSE handler
+    class SSEHandler(logging.Handler):
+        def emit(self, record):
+            msg = self.format(record)
+            broadcast(msg)
+
+    sh = SSEHandler()
+    sh.setLevel(logging.DEBUG)
+    sh.setFormatter(fmt)
 
     logger.addHandler(ch)
-    logger.addHandler(mh)
+    logger.addHandler(bh)
+    logger.addHandler(sh)
 
     return logger
+
+
+# Global logger instance
+LOGGER = setup_logger()
+
+
+def log_debug(msg):
+    LOGGER.debug(msg)
diff --git a/bookscraper/scraper/sites.py b/bookscraper/scraper/sites.py
index 89d3451..51023dc 100644
--- a/bookscraper/scraper/sites.py
+++ b/bookscraper/scraper/sites.py
@@ -3,7 +3,7 @@ class BookSite:
         self.name = "piaotian"
         self.root = "https://www.ptwxz.com"
         self.chapter_list_selector = "div.centent"
-        self.encoding = "gb2312"
+        self.encoding = "GB18030"
         self.replacements = {
             "  ": "\n",
             "手机用户请访问http://m.piaotian.net": "",
diff --git a/bookscraper/templates/index.html b/bookscraper/templates/index.html
index 03526d9..7cb4612 100644
--- a/bookscraper/templates/index.html
+++ b/bookscraper/templates/index.html
@@ -1,22 +1,40 @@
 <html>
 <head>
-    <title>Book Scraper</title>
+    <title>BookScraper</title>
 </head>
 <body>

-    <h1>Book Scraper</h1>
+    <h1>BookScraper</h1>
 
-    {% if error %}
-    <p style="color: red;">{{ error }}</p>
-    {% endif %}
-
-    <form method="POST">
-        <input type="text" name="url" size="80">
-        <button type="submit">Run Scraper</button>
-    </form>
+    <input type="text" id="url" size="80" placeholder="Book URL">
+    <button id="run">Run Scraper</button>
+
+    <h3>Realtime log:</h3>
+    <pre id="log"></pre>
+
+    <script>
+        // append incoming SSE log lines to the <pre>
+        const log = document.getElementById("log");
+        const es = new EventSource("/stream");
+        es.onmessage = (e) => {
+            log.textContent += e.data + "\n";
+        };
+
+        // POST the URL as JSON to /run
+        document.getElementById("run").onclick = () => {
+            fetch("/run", {
+                method: "POST",
+                headers: { "Content-Type": "application/json" },
+                body: JSON.stringify({ url: document.getElementById("url").value })
+            });
+        };
+    </script>
 </body>
 </html>
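-- 
The new SSE endpoint can also be watched outside the browser with curl's
unbuffered mode (assuming a local run on the default port; use 5050 with the
Compose port mapping):

    curl -N http://localhost:5000/stream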