Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import HTMLParser
import codecs
import datetime
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider project that crawls a novel site, category by category.

    Callback chain:

    * ``on_start`` queues every category list page.
    * ``list_Caterg`` follows each book link found on a list page.
    * ``list_Caterg_detail`` scrapes the book metadata, downloads the cover
      and records the book in the ``BookFile`` table.
    * ``index_page`` opens the first chapter listed in the catalogue.
    * ``detail_page`` scrapes one chapter, stores it in MySQL, posts it to the
      Locoy publishing endpoint and follows the "next chapter" link.
    """

    # Local root directory under which downloaded cover images are saved.
    P_dir = '/Tools/Debug/'

    # Timeout (seconds) for the direct ``requests`` calls made by this class.
    request_timeout = 30

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 30,
    }

    # Connection settings shared by every insert helper (placeholders: fill
    # in the real user name, password and database name before running).
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name used when publishing.
    cater_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through ``save``.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): the last pair looks like it was meant as a regex for
    # leading blank lines, but it has always been used with str.replace, so
    # it only matches that literal character sequence. Kept unchanged.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal (old, new) substitutions applied, in order, to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these punctuation characters are deleted from scraped text.
    # NOTE(review): under Python 2 this is a byte-string pattern, so the
    # non-ASCII characters may not match unicode text as intended - confirm.
    _PUNCT_RE = re.compile(r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def _execute_insert(self, sql, params):
        """Run one parameterized INSERT in its own connection.

        Values are passed separately from the SQL text so the driver escapes
        them; scraped text containing quotes can no longer break (or inject
        into) the statement.

        Returns the cursor's ``lastrowid`` on success and ``None`` when the
        statement fails. Failures are logged and rolled back, not raised, as
        before. Connection errors from ``pymysql.connect`` still propagate.
        """
        db = pymysql.connect(**self.db_config)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                rowid = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return rowid
        except Exception as err:
            # Best-effort storage: a failed insert must not abort the crawl.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            # The connection used to be leaked on every call.
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local
        cover path and crawl timestamp)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book
        id, chapter id and crawl timestamp)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one chapter, with its book metadata, into ``BookConte`` and
        print the id of the new row."""
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        print(self._execute_insert(sql, params))

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; characters that GBK cannot
        represent are dropped. Returns the ``requests`` response object.
        Network errors (including the timeout) propagate to the caller.
        """
        if sys.version_info[0] == 2:
            # NOTE(review): process-wide Python 2 hack kept so existing
            # deployments behave as before; it changes the default codec for
            # the whole process. Guarded because ``reload`` and
            # ``setdefaultencoding`` are unavailable on Python 3.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from hanging the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.request_timeout)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first list page to crawl
        self.total_num = 200   # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue list pages ``page_num``..``total_num`` of every category.

        The page counter is a local variable: advancing ``self.page_num``
        (as the code used to) left it past ``total_num`` after the first run,
        so later scheduled runs on the same instance queued nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self.cater_names.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category list page, passing
        the category name along."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _word_count(info_text):
        """Extract the word-count field from the book info line.

        Mirrors the former ``text.split(' ')[1].split('|')[0]`` but returns
        an empty string instead of raising ``IndexError`` when the line has
        no second space-separated field.
        """
        fields = info_text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Scrape a book page: metadata, cover image and catalogue link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Built before the cover loop so a book without a cover image still
        # reaches the catalogue (it used to fail with UnboundLocalError).
        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": "",
        }
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir>/<name>/<name>.<ext>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = (name + "." + extension) if extension else name
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: remove these two lines to skip the local download.
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            Datos["img"] = img
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, saved):
        """Return a fresh dict holding the book metadata keys of ``saved``.

        Raises ``KeyError`` when a key is missing, like the former inline
        dict literals did.
        """
        return dict((key, saved[key]) for key in self._META_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the catalogue page, open the first listed chapter; the rest
        of the book is reached through each chapter's "next" link."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _clean_chapter(self, text):
        """Apply the site-name substitutions, strip punctuation runs and
        turn blank lines into ``<br>`` for chapter text."""
        for old, new in self._CONTENT_REPLACEMENTS:
            text = text.replace(old, new)
        text = self._PUNCT_RE.sub(u'', text)
        return text.replace("\n\n", "<br>")

    def _clean_intro(self, text):
        """Apply the synopsis substitutions and strip punctuation runs."""
        for old, new in self._INTRO_REPLACEMENTS:
            text = text.replace(old, new)
        return self._PUNCT_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, store and publish it, then follow the link to
        the next chapter. Returns the scraped fields as the task result."""
        meta = self._book_meta(response.save)
        Cater_Name = meta['Cater_Name']          # category
        Book_author = meta['Book_author']        # author
        Book_Synopsis = meta['Book_Synopsis']    # latest update
        Book_Palabras = meta['Book_Palabras']    # word count
        # Was ``response.save['img'],`` - the stray comma built a 1-tuple.
        Book_img = meta['img']                   # cover image URL

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Bookurl = response.url                                    # chapter URL
        Booktitle = response.doc('.article-title').text()         # chapter title
        BookID = response.doc('.readset-r span').text()           # book id
        # NOTE(review): the original comment calls this the book status
        # (serialised vs. finished), but the value is a concatenation of the
        # chapter title, latest update, word count and raw synopsis.
        abover = Booktitle + meta['Book_Synopsis'] + meta['Book_Palabras'] + meta['Book_Introduction']
        Book_Date = str(datetime.datetime.now())                  # crawl time

        BookConte = self._clean_chapter(response.doc('.article-con').text())
        print(BookConte)
        Book_Introduction = self._clean_intro(meta['Book_Introduction'])
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
                          BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        ``P_dir`` is accepted for backward compatibility and is not used.
        Returns ``True`` when the file was written and ``False`` when the
        server did not answer 200 (nothing is written then). It used to
        return ``None`` always, so the caller's success branch never ran.
        Network errors propagate to the caller.
        """
        imag = requests.get(Book_img, timeout=self.request_timeout)
        if imag.status_code != 200:
            print('download failed (%s): %s' % (imag.status_code, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        # ``with`` closes the handle even if the write fails.
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """pyspider callback: save the fetched body under the directory and
        file name carried in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator the old ``imgDir + file_name``
        # dropped when ``imgDir`` had no trailing slash.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if it
        does not exist."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    @staticmethod
    def _url_basename(url):
        """Return the last path segment of ``url`` without query/fragment."""
        return url.split("?")[0].split("#")[0].split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of ``url`` (text after the last dot of
        the file name), or ``''`` when the file name has no dot."""
        basename = self._url_basename(url)
        if "." not in basename:
            return ""
        return basename.split(".")[-1]

    def getname(self, url):
        """Return the file name of ``url`` up to its first dot."""
        return self._url_basename(url).split(".")[0]
复制代码 & `. ~6 {( ^8 H6 p7 ?, `
- }" A$ g8 U4 ~4 F
|