#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo
#
# Python + pyspider crawler for a novel site: stores the results in a
# database, publishes them through the Locoy (火车头) interface and downloads
# resources to local disk. Custom crawlers can also be written.

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes its chapters.

    Task flow:
      on_start            -> queues the category listing pages
      list_Caterg         -> queues every book linked from a listing page
      list_Caterg_detail  -> stores book metadata, downloads the cover and
                             queues the book's chapter index
      index_page          -> queues the first chapter of the book
      detail_page         -> cleans, stores and publishes one chapter, then
                             follows the "next chapter" link
    """

    # Root directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'

    # Timeout (seconds) for the direct ``requests`` calls made by this class.
    REQUEST_TIMEOUT = 60

    # Keyword arguments for pymysql.connect(); the values are placeholders
    # that have to be replaced with real credentials.
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Category slug used in the source site's URLs -> category name that is
    # stored and published.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata handed from task to task through ``save``.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered (old, new) literal substitutions applied with str.replace().
    # NOTE(review): the r'^\\n\\n' entries are literal text for str.replace(),
    # not regular expressions, so they only match those exact characters --
    # confirm the intent.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of punctuation/symbols that are removed from scraped text.
    # NOTE(review): under Python 2 this is a byte-string literal, so the
    # full-width characters in the second class are presumably compared
    # byte-wise and do not match whole characters of unicode text -- confirm
    # the intent before turning this into a unicode pattern.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def _insert_row(self, table, columns, values):
        """Insert one row into ``table`` and return its row id.

        ``table`` and ``columns`` are constants supplied by this class; the
        scraped ``values`` are always passed as query parameters so that
        quotes inside titles or chapter text cannot break or alter the SQL.

        Returns the new row id, or None when the statement failed (the error
        is printed and the transaction rolled back). Connection errors
        propagate to the caller.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        # closing() guarantees the connection is released on every path.
        with closing(pymysql.connect(**self.DB_CONFIG)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, values)
                    row_id = cursor.lastrowid
                db.commit()
            except pymysql.Error as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None
        return row_id

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store one book's cover record (remote URL and local path) in BookFile."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store one chapter-title record in BookTitle."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one full chapter record in BookConte and print its row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; characters that cannot be encoded
        are dropped. Returns the ``requests`` response object.
        """
        # NOTE(review): process-wide default-encoding hack kept from the
        # original; every text field below is already encoded explicitly, so
        # it is probably unnecessary -- confirm before removing.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name (placeholder)
            'my_p': '密码',  # back-end password (placeholder)
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data, timeout=self.REQUEST_TIMEOUT)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def _book_meta(self, save):
        """Return a fresh dict holding only the book metadata keys of ``save``."""
        return {key: save[key] for key in self._META_KEYS}

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page of every configured category.

        Loop state is kept in local variables: advancing ``self.page_num``
        itself would leave it past ``self.total_num`` after the first run, so
        a later scheduled run on the same instance would queue nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                Cater_Name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue each book linked from a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, store its cover record and queue its chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The second space-separated part, up to the first '|', is stored as
        # Book_Palabras; fall back to '' when the header has no such part.
        header_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = header_parts[1].split('|')[0] if len(header_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the code below working when the page has no cover.
        img = ''
        Locaimg = ''
        for node in response.doc('.bigpic > img[src^="http"]').items():
            img = node.attr.src
            print(img)
            # Cover download: <P_dir>/<name>/<name>.<extension>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: saves the cover image to local disk.
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional step: database publishing interface for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's chapter index."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _clean_text(self, text, replacements):
        """Apply the literal ``replacements`` in order, then strip symbol runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return re.sub(self._STRIP_PATTERN, u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape, clean, store and publish one chapter, then follow the next link.

        Returns the chapter record as a dict (the task result).
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Stored in the "abover" (status) column: title + synopsis +
        # word count + raw introduction, concatenated.
        abover = Booktitle + save['Book_Synopsis'] + save['Book_Palabras'] + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._clean_text(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(save['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: a trailing comma here would turn it into a 1-tuple.
        Book_img = save['img']  # cover image URL

        # Optional steps: database publishing interfaces for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional step: publish through the Locoy HTTP interface.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def _ensure_dir(self, path):
        """Create ``path`` if missing, tolerating a concurrent creation."""
        if os.path.isdir(path):
            return
        try:
            os.makedirs(path)
        except OSError:
            # Another worker may have created it in the meantime.
            if not os.path.isdir(path):
                raise

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir + "/" + file_name``.

        ``P_dir`` is accepted for compatibility and not used. The image is
        fetched before the file is opened so that a failed request does not
        leave an empty file behind.

        Returns True on success; on an HTTP or file-system error the error is
        printed and False is returned.
        """
        try:
            reply = requests.get(Book_img, timeout=self.REQUEST_TIMEOUT)
            reply.raise_for_status()
            self._ensure_dir(imgDir)
            with open(imgDir + "/" + file_name, 'wb') as out:
                out.write(reply.content)
        except (requests.RequestException, EnvironmentError) as err:
            print("Error %s downloading %s" % (err, Book_img))
            return False
        return True

    def save_imgs(self, response):
        """Save a fetched image; ``response.save`` supplies "imgDir" and "file_name"."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator when imgDir lacks a trailing slash.
        self.save_img(response.content, imgDir, os.path.join(imgDir, file_name))

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        self._ensure_dir(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last "." in ``url`` (the whole url if none)."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first "."."""
        return url.split("/")[-1].split(".")[0]
# (end of listing)