Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site chapter by chapter.

    Crawl flow: ``on_start`` queues the category list pages,
    ``list_Caterg`` follows each book link, ``list_Caterg_detail`` reads the
    book metadata and downloads the cover, ``index_page`` opens the first
    chapter and ``detail_page`` stores each chapter and then follows the
    "next chapter" link.

    Every chapter is written to MySQL (``add_question`` / ``add_comment``)
    and posted to a Locoy-style publishing endpoint (``add_locoy``).
    """

    # The original class body declared these names ``global``, which makes
    # them module-level variables; that is kept so that code outside the
    # class which reads ``P_dir`` / ``Datos`` keeps working.
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    global Datos
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Timeout (seconds) for the direct ``requests`` calls made by this class;
    # mirrors the crawl timeout above so a stalled peer cannot hang a task.
    HTTP_TIMEOUT = 300

    # MySQL connection settings shared by all insert helpers.  The values are
    # placeholders and must be replaced with real credentials.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name used when publishing.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through ``save``.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied to chapter text, in order.
    # NOTE(review): r'^\\n\\n' looks like a regex but is used with
    # ``str.replace``, i.e. as the literal characters ``^\\n\\n`` -- confirm
    # whether stripping leading blank lines was intended.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal (old, new) substitutions applied to the book introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these characters are removed from chapter text and introduction.
    STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first list page to crawl
        self.total_num = 200   # last list page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _run_insert(self, sql, params):
        """Execute one parameterized INSERT and return ``cursor.lastrowid``.

        Values are passed separately from the SQL text so the driver escapes
        them; scraped text containing quotes can no longer break or inject
        into the statement.  A failure is reported with ``print`` and rolled
        back (best-effort, like the original), in which case ``None`` is
        returned.  The connection is always closed.
        """
        db = pymysql.connect(**self.db_config)
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def _book_meta(self, saved):
        """Return a fresh dict with the book metadata keys taken from *saved*."""
        return dict((key, saved[key]) for key in self._META_KEYS)

    def _to_gbk(self, value):
        """Encode *value* as GBK bytes, dropping characters GBK cannot express.

        Byte strings are first decoded as UTF-8, so no process-wide default
        encoding has to be changed for the conversion to work.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'ignore')
        return value.encode('gbk', 'ignore')

    def _apply_replacements(self, text, replacements):
        """Apply each literal ``(old, new)`` pair of *replacements* to *text* in order."""
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    def _strip_punctuation(self, text):
        """Remove every run of characters matched by ``STRIP_PATTERN`` from *text*."""
        pattern = self.STRIP_PATTERN
        if isinstance(pattern, bytes):
            # On Python 2 the pattern literal is a byte string; decode it (the
            # file declares UTF-8) so the non-ASCII characters in the class
            # match real code points of the unicode text.
            pattern = pattern.decode('utf-8')
        return re.sub(pattern, u'', text)

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the new row id."""
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._run_insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        All text fields are sent GBK-encoded; ``Book_img`` is sent unchanged.
        The original also switched the interpreter-wide default encoding to
        GBK via ``reload(sys)``; that global side effect is dropped because
        each field is encoded explicitly here.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name (placeholder)
            'my_p': '密码',  # back-office password (placeholder)
            'subject_669977_net': self._to_gbk(Bookname),
            'caid': self._to_gbk(Cater_Name),
            'title_669977_net': self._to_gbk(Booktitle),
            'article': self._to_gbk(BookConte),
            'author': self._to_gbk(Book_author),
            'ready_1': self._to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': self._to_gbk(Book_Introduction),
            'abover': self._to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self.HTTP_TIMEOUT)
        print(res.text)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Queue list pages ``page_num``..``total_num`` of every category.

        Local loop variables are used instead of mutating ``self.page_num``;
        the handler instance may be reused between scheduled runs, and a
        mutated counter would leave every later run with nothing to queue.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg,
                           save=self.CATEGORY_NAMES.get(cater_id, ''))

    def list_Caterg(self, response):
        """Follow every absolute book link of a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read book metadata, download the cover, record the book, open its catalogue.

        ``response.save`` carries the category name chosen in ``on_start``.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the
        # first '|'; fall back to '' when the text has no such token.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a book page without a cover image does not fail with
        # an unbound name further down.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: saved as <P_dir><name>/<name>.<extension>
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional database publishing hook; remove the call if not needed.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed on a book's catalogue page."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then queue the next chapter.

        Returns the chapter record as a dict, or ``None`` when the fetch
        failed.
        """
        # ``catch_status_code_error`` routes failed fetches here as well;
        # skip them so error pages are not stored as chapters.
        if not response.ok:
            print("Skipping %s (status %s)" % (response.url, response.status_code))
            return None

        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']          # category
        Book_author = saved['Book_author']        # author
        Book_Synopsis = saved['Book_Synopsis']    # latest update
        Book_Palabras = saved['Book_Palabras']    # word count
        Bookurl = response.url                    # chapter URL
        Booktitle = response.doc('.article-title').text()   # chapter title
        BookID = response.doc('.readset-r span').text()     # book id
        raw_content = response.doc('.article-con').text()   # chapter text
        # Status field: chapter title + latest update + word count + the
        # unmodified introduction.
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # scrape time

        BookConte = self._strip_punctuation(
            self._apply_replacements(raw_content, self.CONTENT_REPLACEMENTS))
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = self._strip_punctuation(
            self._apply_replacements(saved['Book_Introduction'], self.INTRO_REPLACEMENTS))

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which turned the cover URL
        # into a 1-tuple.
        Book_img = saved['img']

        # Optional database publishing hooks; remove the calls if not needed.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional HTTP publishing hook; remove the call if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download *Book_img* to ``imgDir/file_name``; return True on success.

        The image is fetched before the file is opened, so a failed request
        leaves no empty file behind.  A request failure is printed and
        reported as ``False``.  ``P_dir`` is unused and kept only for
        signature compatibility.
        """
        try:
            reply = requests.get(Book_img, timeout=self.HTTP_TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as err:
            print("Failed to download %s: %s" % (Book_img, err))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback that writes a fetched image using the paths stored in ``response.save``."""
        imgDir = response.save["imgDir"]
        # os.path.join supplies the separator when imgDir has no trailing
        # slash, matching how download() builds its path.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write *content* (bytes) to *path*, creating *imgDir* first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' of *url*, ignoring any query string or fragment."""
        bare = url.split("?", 1)[0].split("#", 1)[0]
        return bare.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of *url* up to its first '.', ignoring query/fragment."""
        bare = url.split("?", 1)[0].split("#", 1)[0]
        return bare.split("/")[-1].split(".")[0]
复制代码
0 E4 I" J+ i0 X+ B $ L$ }/ l* |9 T8 {
|