This article introduces the Scrapy framework through two worked examples: douban, a text-scraping project, and douban_imgs, an image-downloading project. The details follow.
Example 1: douban
Directory tree
douban
├── douban
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── bookspider.py
│   │   ├── douban_comment_spider.py
│   │   └── doumailspider.py
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
└── scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
bookspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/top250']

    def parse(self, response):
        # request the first page
        yield scrapy.Request(response.url, callback=self.parse_next)

        # request the remaining pages
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book
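One way to run this spider is from the project root (the directory containing scrapy.cfg) with Scrapy's command-line tool; the output file name below is only an illustration:

scrapy crawl douban-book -o douban_books.json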
douban_comment_spider.py
# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = ['https://www.douban.com/']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',      # your Douban account email
        'form_password': '你的密码',   # your password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',               # the submit button's value ("Log in")
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha is shown, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login)]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not deduplicate this request

    def parse_next_page(self, response):
        print response.status
        try:
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print "Next page"
            print next_url
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except:
            print "Next page Error"
            return

    def parse_comment_url(self, response):
        print response.status
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print comment_title
            print comment_url
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        print response.status
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print "data_type: " + data_type
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))

            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print comment
            yield comment
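Note that this spider is written for Python 2 (print statements, raw_input, and the urlparse module). If you want to adapt it to Python 3, the rough equivalents of those constructs are sketched below; this is only an adaptation hint, not part of the original example. Also keep in mind that under Python 3 response.body is bytes, so the membership test would need b'captcha_image' or response.text.

# Python 3 rough equivalents for the Python 2 constructs used above (a sketch).
from urllib.parse import urlparse, parse_qs

# hypothetical captcha link, just to demonstrate the query parsing
link = 'https://www.douban.com/misc/captcha?id=abc123:en&size=s'
captcha_id = parse_qs(urlparse(link).query, keep_blank_values=True)['id']
print(captcha_id)  # ['abc123:en'] -- print is a function in Python 3
# raw_input() is spelled input() in Python 3:
# captcha_solution = input('captcha-solution:')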
doumailspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = ['https://www.douban.com/']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',      # your Douban account email
        'form_password': '你的密码',   # your password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',               # the submit button's value ("Log in")
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha is shown, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login)]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        print response.status
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print mail
            yield mail
__init__.py
(this file contains no code)
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()              # book title
    price = scrapy.Field()             # price
    edition_year = scrapy.Field()      # year of publication
    publisher = scrapy.Field()         # publisher
    ratings = scrapy.Field()           # rating
    author = scrapy.Field()            # author
    content = scrapy.Field()


class DoubanMailItem(scrapy.Item):
    sender_time = scrapy.Field()       # time the mail was sent
    sender_from = scrapy.Field()       # sender
    url = scrapy.Field()               # URL of the doumail detail page
    title = scrapy.Field()             # doumail subject


class DoubanMovieCommentItem(scrapy.Item):
    useful_num = scrapy.Field()        # number of "useful" votes
    no_help_num = scrapy.Field()       # number of "not helpful" votes
    people = scrapy.Field()            # reviewer
    people_url = scrapy.Field()        # reviewer's page
    star = scrapy.Field()              # rating
    comment = scrapy.Field()           # review text
    title = scrapy.Field()             # review title
    comment_page_url = scrapy.Field()  # URL of the current page
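Scrapy items behave like dictionaries, which is how the spiders and pipelines above read and write their fields. A minimal sketch with made-up values:

from douban.items import DoubanBookItem

book = DoubanBookItem()
book['name'] = '小王子'   # sample values for illustration only
book['content'] = '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
book['ratings'] = '9.0'
data = dict(book)        # an item can be converted to a plain dict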
pipelines.py
# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    def process_item(self, item, spider):
        info = item['content'].split(' / ')  # e.g. [法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元
        item['name'] = item['name']
        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    def process_item(self, item, spider):
        item['title'] = item['title'].replace(' ', '').replace('\\n', '')
        return item


class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        return item
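To make the splitting logic in DoubanBookPipeline concrete, this is what it does to the sample content string from the comment above (a standalone sketch, no Scrapy needed):

content = '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
info = content.split(' / ')
price = info[-1]           # '22.00元'
edition_year = info[-2]    # '2003-8'
publisher = info[-3]       # '人民文学出版社'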
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
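Note that ITEM_PIPELINES above enables only the movie-comment pipeline, so to run the book or mail spiders you have to edit this dict. One alternative, not used in the original project, is Scrapy's per-spider custom_settings class attribute; a sketch:

import scrapy

class BookSpider(scrapy.Spider):
    name = 'douban-book'
    # per-spider override of the project-wide ITEM_PIPELINES (illustrative only)
    custom_settings = {
        'ITEM_PIPELINES': {'douban.pipelines.DoubanBookPipeline': 300},
    }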
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban.settings

[deploy]
#url = http://localhost:6800/
project = douban
Example 2: douban_imgs
Directory tree
douban_imgs
├── douban_imgs
│   ├── spiders
│   │   ├── __init__.py
│   │   └── download_douban.py
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── run_spider.py
│   └── settings.py
└── scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
download_douban.py
# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        self.allowed_domains = ['douban.com']
        self.start_urls = ['http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url
        # call the parent class constructor
        # super(download_douban, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
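Because __init__ accepts a url argument holding the album id, a different album can be crawled without editing the code by passing a spider argument on the command line; the id below is just the default from the code:

scrapy crawl download_douban -a url=1638835355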
__init__.py
(this file contains no code)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Item, Field


class DoubanImgsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log


class DoubanImgsPipeline(object):
    def process_item(self, item, spider):
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
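For reference, the results argument that ImagesPipeline passes to item_completed is a list of (success, info) tuples; when success is True the info dict carries the stored path, which is what the list comprehension above collects. A rough illustration of the shape (values made up; in real runs a failed entry carries a Twisted Failure rather than a plain exception):

# hypothetical 'results' value, shape illustration only
results = [
    (True, {'url': 'https://img3.doubanio.com/view/photo/m/public/p2370443040.jpg',
            'path': 'full/3afec3b4765f8f0a96ff09c4a2c5f5cf23f28d9b.jpg',  # relative to IMAGES_STORE
            'checksum': 'b9628c4ab9b595f72f280b90c4fd093d'}),
    (False, Exception('download error')),
]
image_paths = [x['path'] for ok, x in results if ok]  # keeps only the successful path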
run_spider.py
from scrapy import cmdline

cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))
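run_spider.py simply invokes the scrapy crawl command. An equivalent in-process alternative (a sketch, not part of the original project, assuming the package layout shown in the directory tree above) is Scrapy's CrawlerProcess API:

# run the spider in-process instead of via cmdline.execute
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from douban_imgs.spiders.download_douban import download_douban

process = CrawlerProcess(get_project_settings())
process.crawl(download_douban)
process.start()  # blocks until the crawl finishes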
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_imgs'

SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}

IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'

IMAGES_EXPIRES = 90

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban_imgs.settings

[deploy]
#url = http://localhost:6800/
project = douban_imgs
Summary
That concludes this complete Scrapy crawler walkthrough; I hope it is helpful. Interested readers can browse the other related topics on this site, and if anything here is lacking, please point it out in the comments. Thanks for your support!
Original article: http://blog.csdn.net/nnnnnnnnnnnny/article/details/54426779