Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# The star import stays first (as in the original) so that the explicit
# imports below take precedence over any names it happens to export.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes each chapter.

    Callback chain: ``on_start`` -> ``list_Caterg`` -> ``list_Caterg_detail``
    -> ``index_page`` -> ``detail_page`` (which re-queues itself for the
    "next chapter" link).  For every chapter the handler inserts rows into
    MySQL (``add_question`` / ``add_comment``) and posts the chapter to a
    Locoy publishing endpoint (``add_locoy``); book covers are downloaded
    below ``P_dir`` and recorded with ``add_BookFile``.

    NOTE(review): this block was reconstructed from a paste that had lost all
    indentation; the nesting of loops/conditionals follows the only reading
    under which every name is defined before use -- confirm against the
    original script if it is available.
    """

    # ``global`` inside a class body binds these names at *module* level,
    # exactly as the original did; methods read ``P_dir`` as a module global.
    global P_dir, Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by every insert (placeholders: DB user name,
    # password and database name must be filled in by the operator).
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name used on the publishing side.
    # NOTE(review): 'kongbu' maps to '游戏' and both 'yanqing' and 'nvsheng'
    # map to '都市'; kept as in the original -- confirm this is intentional.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through ``save``.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered literal (str.replace) substitutions applied to chapter text.
    # NOTE(review): r'^\\n\\n' is used with str.replace, so it is a literal
    # caret/backslash sequence, not a regex for leading blank lines; it is
    # kept byte-for-byte because the intended behaviour is not recoverable.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Ordered literal substitutions applied to the book introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Punctuation runs removed from chapter text and introduction.
    # NOTE(review): under Python 2 this is a byte-string pattern containing
    # non-ASCII characters -- confirm it matches unicode page text as intended.
    _PUNCT_RE = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    )

    def _execute_insert(self, sql, params, verbose=False):
        """Run one parameterized INSERT on a fresh connection.

        The values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can no longer break (or
        inject into) the SQL.  Errors from ``execute``/``commit`` are printed
        and rolled back (best effort, as before); the connection is always
        closed.

        :param sql: statement with bare ``%s`` placeholders.
        :param params: tuple of values, one per placeholder.
        :param verbose: when true, print the statement before executing it and
            ``cursor.lastrowid`` afterwards.
        """
        db = pymysql.connect(**self.db_config)
        try:
            cursor = db.cursor()
            if verbose:
                print(sql)
            cursor.execute(sql, params)
            if verbose:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte``.

        Prints the statement and the new row id, as the original did (the
        printed statement is now the placeholder form, without the values).
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        self._execute_insert(sql, params, verbose=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        All text fields are sent GBK-encoded (unencodable characters are
        dropped); ``Book_img`` is sent as given.  The response text and raw
        content are printed.
        """
        # Python 2 only: switches the interpreter-wide default encoding to
        # GBK.  Kept from the original.
        # NOTE(review): this is a process-global side effect -- confirm it is
        # still required.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue listing pages ``page_num``..``total_num`` of every category.

        The category's display name travels with the task via ``save``.
        Loop state is kept in locals so that calling ``on_start`` again on the
        same instance schedules the same pages instead of nothing (the
        original advanced ``self.page_num`` past ``total_num``).
        """
        # The original published the current category name as a module
        # global; retained for backward compatibility.
        global Cater_Name
        Cater_Name = []
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Unknown slugs keep the previous name, as the if-chain did.
                Cater_Name = self.CATEGORY_NAMES.get(cater_id, Cater_Name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Queue every absolute book link found in ``.pic-list`` on a listing page."""
        Cater_Name = response.save
        for link in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(link.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, store its cover, and queue its chapter index.

        For each cover image found: download it below ``P_dir``, record the
        book with ``add_BookFile``, and queue the catalogue link(s) with the
        book metadata in ``save``.
        NOTE(review): everything after the cover lookup is placed inside the
        image loop (a page without a cover queues nothing); the original
        nesting was lost -- confirm.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())
        for cover in response.doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            # Cover download: <P_dir>/<name>/<name>.<extension>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: save the cover locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            Datos = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Optional step: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _carry_meta(self, save):
        """Return a fresh dict holding only the book-metadata keys of ``save``."""
        return {key: save[key] for key in self._META_KEYS}

    def _scrub(self, text, replacements):
        """Apply the literal ``replacements`` in order, then strip punctuation runs."""
        for old, new in replacements:
            text = text.replace(old, new)
        return self._PUNCT_RE.sub(u'', text)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of the book's chapter list, forwarding the book metadata."""
        Datos = self._carry_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, queue the next chapter, return the record.

        The chapter text and the introduction are scrubbed with the
        replacement tables and the punctuation pattern; in the chapter text
        blank-line breaks then become ``<br>``.  The chapter is inserted into
        MySQL and posted to Locoy before the next-chapter link is queued.

        :returns: dict with the scraped chapter fields.
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        # Status string: chapter title + latest update + word count + raw intro.
        abover = Booktitle + Book_Synopsis + Book_Palabras + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._scrub(raw_content, self.CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(save['Book_Introduction'], self.INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Cover URL; the original's trailing comma turned this into a 1-tuple.
        Book_img = save['img']

        # Optional steps: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional step: publish through the Locoy endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file or an open handle behind.  Request
        exceptions propagate to the caller, as before.

        :param P_dir: unused; kept for interface compatibility.
        :returns: ``True`` when the file was written, ``False`` when the
            server answered with an error status (nothing is written then).
        """
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback variant: write ``response.content`` to the path described in ``save``.

        NOTE(review): the path is ``imgDir + file_name`` with no separator
        (unlike ``download``); presumably ``imgDir`` is expected to end with
        "/" -- verify against the caller, which is not in this file.
        """
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        self.save_img(response.content, imgDir, imgDir + file_name)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if it is missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last "." in ``url`` (the whole URL if it has none)."""
        return url.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first "."."""
        return url.rsplit("/", 1)[-1].split(".", 1)[0]
复制代码 - `" c2 q2 `) S/ k. N" b+ p q. A
$ Y; S; c" @* ?, L |