# Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow: ``on_start`` queues the category listing pages, ``list_Caterg``
    queues each book, ``list_Caterg_detail`` scrapes the book metadata and
    cover, ``index_page`` queues the first chapter and ``detail_page``
    stores/publishes a chapter and then follows the link to the next one.

    Every chapter is written to MySQL (``add_question`` / ``add_comment``)
    and posted to a Locoy publishing endpoint (``add_locoy``).
    """

    # P_dir and Datos are bound as module-level globals (not class
    # attributes), exactly as before, so that the methods below and any
    # external code can keep referring to them by their bare names.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings shared by all add_* database methods
    # (placeholders: fill in the real user / password / schema).
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Category slug used in the listing URL -> category name that is stored.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the book metadata that travel from one callback to the next
    # through pyspider's ``save`` mechanism.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Characters removed from the scraped chapter text and introduction.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    # (old, new) literal substitutions applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # (old, new) literal substitutions applied, in order, to the introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    def _execute_insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        The values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can therefore neither
        break nor inject into the query.  The connection is always closed.

        Returns the cursor's ``lastrowid`` on success.  On failure the error
        is printed, the transaction is rolled back and ``None`` is returned.
        """
        with closing(pymysql.connect(**self._DB_SETTINGS)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, params)
                    row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                # Best effort, as before: a failed insert must not abort
                # the crawl task.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book, remote cover URL, local cover path)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (the chapter index entry)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter, with its book metadata, into ``BookConte``.

        Prints the statement before running it and the new row id after a
        successful insert.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        print(sql)
        row_id = self._execute_insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction,
            Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
            BookConte, Titleid, abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        All text fields are sent GBK-encoded; characters that cannot be
        represented in GBK are dropped.
        """
        # Python 2 only.  NOTE(review): process-wide default-encoding hack,
        # presumably needed so the GBK byte strings below can be mixed with
        # unicode values -- confirm before removing.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # admin username
            'my_p': '密码',  # admin password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        The page counter is a local variable: advancing ``self.page_num``
        itself would leave it past ``total_num`` after the first run, so
        later scheduled runs on the same handler instance would queue
        nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for category_id in self.CaterIds:
                self.CaterId = category_id
                # Slugs missing from the table are stored with an empty name.
                Cater_Name = self._CATEGORY_NAMES.get(category_id, '')
                print(category_id)
                url = self.base_url1 + str(category_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue every book linked from a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, fetch its cover and queue the chapter catalogue.

        A book without a matching cover image is still crawled (with an
        empty ``img``); only the ``BookFile`` row is skipped in that case.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The value is the second space-separated token, up to the first '|'.
        # Fall back to an empty string instead of raising when the page does
        # not have that shape.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Download the cover(s); if several match, the last one wins.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: stores the cover on the local disk.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        if img:
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Return a copy of the book metadata carried in ``response.save``."""
        return {key: save[key] for key in self._SAVE_KEYS}

    def _clean_text(self, text, replacements):
        """Apply the literal ``replacements`` in order, then drop the stripped characters."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book's catalogue page.

        Only the first list entry is queued here; ``detail_page`` queues
        further chapter pages itself.
        """
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then queue the following page.

        Returns the scraped chapter as a dict (pyspider's task result).
        """
        Datos = self._book_meta(response.save)

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = Datos['Cater_Name']        # category
        Book_author = Datos['Book_author']      # author
        Book_Synopsis = Datos['Book_Synopsis']  # latest update
        Book_Palabras = Datos['Book_Palabras']  # word count
        # Plain value: a stray trailing comma used to turn this into a 1-tuple.
        Book_img = Datos['img']                 # cover image URL
        Bookurl = response.url                  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()    # book id
        # Status text: chapter title + latest update + word count + raw intro.
        abover = Booktitle + Datos['Book_Synopsis'] + Datos['Book_Palabras'] + Datos['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter body
        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(Datos['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook (HTTP POST).
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Presumably the "next chapter" button -- TODO confirm against the site.
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` into ``imgDir/file_name``.

        Returns ``True`` once the file has been written, so callers can test
        the result; request and I/O errors propagate to the caller.
        ``P_dir`` is unused and kept only for signature compatibility.
        """
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        # Fetch first so a failed request does not leave an empty file behind.
        imag = requests.get(Book_img)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores a fetched image using the names in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator only when imgDir lacks a trailing one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the file extension of ``url`` without the dot.

        Query string and fragment are ignored; an empty string is returned
        when the last path segment has no extension.
        """
        filename = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in filename:
            return ""
        return filename.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first dot."""
        name = url.split("/")[-1].split(".")[0]
        return name