Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

from pyspider.libs.base_handler import *
import pymysql
import random
import datetime
import urllib2,HTMLParser,re
import os
import sys
import re
import codecs
import requests
import json

# Root directory for cover images saved locally while crawling; each cover
# goes into a sub-directory named after the image file.
P_dir = '/Tools/Debug/'
# Kept as a module-level name for backward compatibility only: the handler
# methods build their own local dictionaries and never read this one.
Datos = {}


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow: ``on_start`` schedules the category list pages, ``list_Caterg``
    follows every book link, ``list_Caterg_detail`` stores the book record
    and its cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores each chapter, publishes it over HTTP and then
    follows the link to the next chapter.
    """

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by the three insert methods (placeholders
    # to be filled in by whoever deploys the script).
    _db_settings = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # URL slug of each category -> category name stored with the book.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys handed from one callback to the next through ``save``.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # (old, new) pairs applied in order with str.replace to chapter text.
    # NOTE(review): the last marker is matched literally (caret and
    # backslashes included), not as a regular expression - confirm intent.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Same idea for the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of punctuation removed from chapter text and introduction.
    _PUNCTUATION_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    # Under Python 2 the literal above is a byte string, so its non-ASCII
    # punctuation would be compared byte by byte and never match the unicode
    # text taken from the page; decode it so both sides are text.
    if isinstance(_PUNCTUATION_PATTERN, bytes):
        _PUNCTUATION_PATTERN = _PUNCTUATION_PATTERN.decode('utf-8')
    _PUNCTUATION_RE = re.compile(_PUNCTUATION_PATTERN)

    # ------------------------------------------------------------------
    # database publishing
    # ------------------------------------------------------------------
    def _insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Values are passed separately from the statement so quotes inside
        scraped text cannot break or alter the SQL. Errors raised while
        executing are printed and rolled back (best effort, as before); the
        connection is always closed.

        Returns the new row id, or None when the insert failed.
        """
        db = pymysql.connect(**self._db_settings)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                row_id = cursor.lastrowid
            finally:
                cursor.close()
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into BookFile (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into BookTitle (book name, chapter title, ids, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into BookConte and print the new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
            Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the publishing endpoint and return the response.

        Every text field is encoded to GBK explicitly (characters GBK cannot
        represent are dropped), so no process-wide default-encoding override
        is needed. ``Book_img`` is sent unchanged.
        """
        def gbk(text):
            return text.encode('gbk', 'ignore')

        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': gbk(Bookname),
            'caid': gbk(Cater_Name),
            'title_669977_net': gbk(Booktitle),
            'article': gbk(BookConte),
            'author': gbk(Book_author),
            'ready_1': gbk(Book_Palabras),
            'thumb': Book_img,
            'content': gbk(Book_Introduction),
            'abover': gbk(abover),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to request
        self.total_num = 200  # last list page to request

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule list pages ``page_num``..``total_num`` of every category.

        Loop variables are local: the original advanced ``self.page_num``
        itself, so a later run on the same handler instance would find the
        range already exhausted and schedule nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id)
                if cater_name is None:
                    print('no category name configured for %s, skipped' % cater_id)
                    continue
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _word_count(text):
        """Return what sits between the first space and the next '|', or ''."""
        parts = text.split(' ')
        if len(parts) < 2:
            return ''
        return parts[1].split('|')[0]

    def _carry_over(self, saved):
        """Copy the fields that travel between callbacks out of ``saved``."""
        return dict((key, saved[key]) for key in self._SAVE_KEYS)

    def list_Caterg_detail(self, response):
        """Read a book page, save its cover and record, then open its catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Fallbacks so a page without a cover still produces a book record
        # instead of failing on undefined names.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # download the cover
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # optional: keeps a local copy of the cover
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # optional: database publishing, convenient for other systems
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the catalogue page, open the first chapter only.

        Later chapters are reached through the next-chapter link that
        ``detail_page`` follows.
        """
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @classmethod
    def _scrub(cls, text, replacements):
        """Apply the (old, new) replacements in order, then drop punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return cls._PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then follow the next-chapter link.

        Returns the chapter as a dictionary (the task result).
        """
        # NOTE(review): because of @catch_status_code_error this callback
        # also runs for error responses and would then store empty fields;
        # confirm whether such responses should be skipped.
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # status text (serial or finished), built from title and book fields
        abover = (response.doc('.article-title').text() + saved['Book_Synopsis']
                  + saved['Book_Palabras'] + saved['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a stray trailing comma here that wrapped the URL
        # in a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # optional: database publishing, convenient for other systems
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # optional: publish through the HTTP endpoint
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_over(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir``/``file_name``.

        The image is requested before the file is opened, so a failed
        request no longer leaves an empty file behind. ``P_dir`` is accepted
        for compatibility and not used.

        Returns True once the file is written and False when the server
        answered with an error status (the original always returned None, so
        the caller's success branch could never run). Network and file
        errors propagate as before.
        """
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            print('download failed with status %s: %s' % (imag.status_code, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb+') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: write the response body under the saved directory and name."""
        imgDir = response.save["imgDir"]
        # join instead of plain concatenation, which glued the file name
        # onto the directory name when imgDir had no trailing slash
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the extension of the last path segment of ``url``.

        Query string and fragment are ignored; '' is returned when the
        segment contains no dot.
        """
        basename = url.split("?", 1)[0].split("#", 1)[0].split("/")[-1]
        if "." not in basename:
            return ""
        return basename.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first dot."""
        return url.split("/")[-1].split(".")[0]
复制代码