Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
- #!/usr/bin/env python: G2 H( k7 E1 ]
- # -*- encoding: utf-8 -*-
, Q, ]0 P0 ]- t: r! i, Y - # Created on 2019-05-05 21:43:11% b$ H* C3 ?# R
- # Project: XiaoShuo
- b; E9 b( W+ h - 4 x& P" _5 T! r7 C. ~
- from pyspider.libs.base_handler import *
- x. \; [8 V1 t - import pymysql
' `, K; Y: X% b# n! M( c( x: p# a - import random8 c8 ?; P6 s0 v& i* {7 d; \
- import datetime
+ ?% O. o" \3 l, E8 o - import urllib2,HTMLParser,re
$ o! w0 p+ w9 I' }3 M - import os6 \0 Y/ @; ^/ C4 @, w
- import sys
8 i. q* g1 C8 d3 |( a - import re% I. \" V7 C: ^ g+ W; q5 B& c
- import codecs
$ M5 p& ]. r7 j$ ] - import requests
^6 S1 \0 ]7 ^: }7 v8 ^ - import json
6 e3 {1 _; O/ m9 Y4 K3 b( k4 k - 9 `, o0 G1 w" a! m1 }' \$ B( s
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow of callbacks, as wired by the ``self.crawl`` calls below:
    ``on_start`` -> ``list_Caterg`` -> ``list_Caterg_detail`` ->
    ``index_page`` -> ``detail_page`` (which re-schedules itself for the
    link matched by its "articlebtn" selector).

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to an HTTP publishing endpoint (``add_locoy``); cover images are
    downloaded below ``P_dir`` and recorded with ``add_BookFile``.
    """

    # P_dir and Datos are declared global inside the class body, so the
    # assignments below create *module-level* names (not class attributes).
    # list_Caterg_detail reads P_dir as a module global, so this is kept.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local root directory for downloaded cover images
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Category slug used in the listing URL -> category name handed to the
    # downstream callbacks. The values are reproduced exactly as they were in
    # the original if-chain (including the two slugs that share one name).
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys copied from one callback's ``response.save`` to the next.
    _BOOK_META_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # ------------------------------------------------------------------
    # MySQL helpers
    # ------------------------------------------------------------------
    def _db_connect(self):
        """Open a new MySQL connection with the configured credentials."""
        return pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")

    def _db_insert(self, sql, params):
        """Run one parameterized INSERT and return ``cursor.lastrowid``.

        Values are passed separately from the statement so the driver does
        the quoting; scraped text containing quotes can no longer break (or
        inject into) the SQL. Failures are reported and rolled back, matching
        the original best-effort behaviour, and ``None`` is returned. The
        connection is always closed.
        """
        db = self._db_connect()
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one cover-image record into the ``BookFile`` table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._db_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._db_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter (with its book metadata) into ``BookConte``.

        Prints the new row id on success, as the original did.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        row_id = self._db_insert(sql, params)
        if row_id is not None:
            print(row_id)

    # ------------------------------------------------------------------
    # HTTP publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the publishing endpoint and return the response.

        Text fields are encoded to GBK (unencodable characters are dropped)
        before being sent as form data.
        """
        # NOTE(review): Python 2-only hack kept from the original; it changes
        # the interpreter-wide default encoding. Confirm it is still needed
        # now that every field is encoded explicitly below.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint URL
        locoy_data = {
            'my_u': '用户名',  # back-end user name
            'my_p': '密码',  # back-end password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps an unresponsive endpoint from blocking the task
        # indefinitely.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to request
        self.total_num = 200   # last listing page to request (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every listing page of every category.

        Uses local loop variables: the original advanced ``self.page_num``
        itself, so calling ``on_start`` again on the same instance found the
        counter already past ``total_num`` and scheduled nothing.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Follow every book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book's landing page, save its cover, follow its catalog link."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'. Falls back to ''
        # instead of raising IndexError when the text has no space.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())

        # Default so the metadata below can still be built when the page has
        # no matching cover image (the original hit an unbound name there).
        img = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into P_dir/<name>/<name>.<ext>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Database publishing hook; remove if not needed.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _copy_book_meta(self, save):
        """Return a new dict holding the book metadata keys from ``save``."""
        return dict((key, save[key]) for key in self._BOOK_META_KEYS)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the catalog page, follow only the first chapter link.

        Later chapters are reached from ``detail_page``, which follows the
        link matched by its own selector.
        """
        Datos = self._copy_book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, store and publish it, then follow the next link.

        Returns a dict with the chapter and book fields as the task result.
        """
        # Source tokens and what they are replaced with.
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        # NOTE(review): used with str.replace (not re), so this matches the
        # literal characters ^\\n\\n only -- confirm the intent.
        NewRe3 = r'^\\n\\n'
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        # NOTE(review): non-unicode pattern containing non-ASCII characters;
        # kept byte-for-byte -- confirm it strips what is intended.
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC4 = u''
        ReC5 = u'文学网'
        ReC6 = r'<BR>'

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Introduction1 = response.save['Book_Introduction']  # introduction
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        BookConte1 = response.doc('.article-con').text()  # chapter text
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Literal replacements are applied in the original order, then the
        # regex strip, then blank lines become <br>.
        BookConte = BookConte1
        for old, new in ((NewRe1, ReC1), (NewRe2, ReC2), (NewRe5, ReC5),
                         (NewRe6, ReC2), (NewRe7, ReC2), (NewRe3, ReC6)):
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(NewRe4, ReC4, BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = Book_Introduction1
        for old, new in ((NewRe1, ReC1), (NewRe2, ReC2), (NewRe3, ReC3)):
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(NewRe4, ReC4, Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which silently turned the
        # cover URL into a 1-tuple.
        Book_img = response.save['img']  # cover image URL

        # Database publishing hooks; remove if not needed.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # HTTP publishing hook; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._copy_book_meta(response.save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns True when the image was written, False when the server
        answered with a non-OK status (nothing is written in that case).
        Network and file errors propagate to the caller. ``P_dir`` is accepted
        for interface compatibility and is not used.
        """
        # Fetch before touching the filesystem so a failed request does not
        # leave an empty file behind.
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb+') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores ``response.content`` using the path in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator only when imgDir lacks one, so the
        # file lands inside the directory that save_img creates.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        extension = url.split(".")[-1]
        return extension

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码
1 u& v1 E) C1 T% _2 u# q' p 2 ~8 x# X! `; b8 f: K
|