Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# pyspider convention: the star import supplies BaseHandler, every, config
# and catch_status_code_error. Kept first so later imports win on clashes.
from pyspider.libs.base_handler import *

import codecs
import contextlib
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests


# Root directory under which downloaded cover images are stored locally.
P_dir = '/Tools/Debug/'
# Published as a module-level name by the original script; the callbacks
# below only ever build local dictionaries.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site.

    Flow: ``on_start`` schedules the category list pages, ``list_Caterg``
    follows every book link, ``list_Caterg_detail`` collects the book
    metadata and cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores/publishes a chapter and follows the next one.
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # MySQL connection settings. The user, password and database values are
    # the placeholders shipped with the script and must be filled in.
    _db_settings = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name used when publishing.
    _cater_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the metadata dictionary handed from callback to callback.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    # NOTE(review): r'^\\n\\n' is used with str.replace, so it is matched as
    # that literal character sequence, not as a regular expression. It
    # looks like it was meant to target leading blank lines -- confirm
    # before changing it.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Characters removed from chapter text and synopsis after the literal
    # substitutions above.
    # NOTE(review): under Python 2 this is a byte-string pattern while the
    # scraped text is presumably unicode, so the non-ASCII characters in
    # the second class may not match as intended -- confirm before
    # changing the literal.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def __init__(self):
        """Set the site URL parts and the category/page range to crawl."""
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first list page to schedule
        self.total_num = 200   # last list page to schedule (inclusive)

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    def _insert(self, sql, params):
        """Execute one parameterized INSERT on a fresh connection.

        The values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can no longer break
        the statement. The connection is always closed.

        Returns the new row id, or None when the statement failed (the
        error is printed and the transaction rolled back).
        """
        with contextlib.closing(pymysql.connect(**self._db_settings)) as db:
            try:
                cursor = db.cursor()
                cursor.execute(sql, params)
                db.commit()
                return cursor.lastrowid
            except Exception as err:
                # Best effort, as in the original: report and carry on.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    @classmethod
    def _copy_meta(cls, save):
        """Return a new dict holding the shared metadata keys of ``save``."""
        return {key: save[key] for key in cls._META_KEYS}

    @staticmethod
    def _apply_replacements(text, pairs):
        """Apply each literal (old, new) replacement of ``pairs`` in order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    # ------------------------------------------------------------------
    # publishing
    # ------------------------------------------------------------------
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into the BookFile table (book, cover URL, local cover path)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into the BookTitle table (one chapter title of a book)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one row into the BookConte table (one chapter with its book metadata).

        Prints the id of the inserted row when the insert succeeded.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are sent GBK-encoded; characters that cannot be encoded
        are dropped. ``Book_img`` is sent as given.
        """
        if sys.version_info[0] == 2:
            # HACK kept from the original script: switches the interpreter
            # wide default encoding to GBK. This affects every implicit
            # str/unicode conversion in the process.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name (placeholder)
            'my_p': '密码',  # back-office password (placeholder)
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # crawl callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category list page from ``page_num`` to ``total_num``.

        Uses local loop variables so that repeated runs on the same handler
        instance schedule the same range again instead of finding the page
        counter already exhausted.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._cater_names.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link on a category list page, passing the category name along."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read a book's metadata, download its cover, record it and open its catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the
        # first '|'; fall back to '' when the text has no such token.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image is still recorded.
        img = ''
        Locaimg = ''
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover download: stored as <P_dir><name>/<name>.<extension>.
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: comment out these two lines to skip the local download.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        meta = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing interface for use by other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=meta)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed in the catalogue.

        Only the first list item is followed here; ``detail_page`` then
        walks on through its own "next" link.
        """
        meta = self._copy_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean, store and publish one chapter, then follow the next chapter link.

        Returns a dict with the chapter and book fields as the task result.
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Labelled "book status (serialising or finished)" in the original;
        # it is the concatenation of these four fields.
        abover = Booktitle + Book_Synopsis + Book_Palabras + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Chapter text: literal substitutions, strip punctuation, then turn
        # blank lines into HTML line breaks.
        content = self._apply_replacements(
            response.doc('.article-con').text(), self._CONTENT_REPLACEMENTS)
        content = re.sub(self._STRIP_PATTERN, u'', content)
        BookConte = content.replace("\n\n", "<br>")
        print(BookConte)

        # Synopsis: literal substitutions, then strip punctuation.
        intro = self._apply_replacements(save['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Book_Introduction = re.sub(self._STRIP_PATTERN, u'', intro)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain value: the original had a trailing comma here that turned
        # the cover URL into a one-element tuple.
        Book_img = save['img']  # cover image URL

        # Optional: database publishing interface for use by other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing interface; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        meta = self._copy_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # files
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir``/``file_name``.

        ``P_dir`` is accepted for compatibility and not used. The image is
        fetched before the file is opened, so a failed request leaves no
        empty file behind, and the file is always closed. Request and I/O
        errors propagate to the caller.

        Returns True once the file has been written.
        """
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        imag = requests.get(Book_img)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores a fetched image using the names in ``response.save``.

        NOTE(review): the target path is ``imgDir + file_name`` with no
        separator added, unlike ``download``; presumably ``imgDir`` is
        expected to end with '/' -- verify against the caller.
        """
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        self.save_img(response.content, imgDir, imgDir + file_name)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        return url.split("/")[-1].split(".")[0]
复制代码