Basic modules
A Python crawler (web spider) fetches pages from a website and then parses them to extract data.
The basic examples here use the urllib, urllib2, and re modules.
Basic usage, by example:
(1) Make a basic GET request and fetch the page HTML
#!coding=utf-8
import urllib
import urllib2

url = 'http://www.baidu.com/'

# build the request
request = urllib2.Request(url)
try:
    # send the request and get the response
    response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
    if hasattr(e, 'reason'):
        print e.reason

# read the response body
html = response.read()
# read the response headers
headers = response.info()
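A small extra sketch, not in the original example: the same modules can also build a GET request with query-string parameters and a custom User-Agent header (the Baidu search URL and the wd parameter are only illustrative):

import urllib
import urllib2

# build a query string and append it to the URL
params = urllib.urlencode({'wd': 'python'})
get_url = 'http://www.baidu.com/s?' + params
# send a custom User-Agent so the request looks like a browser
request = urllib2.Request(get_url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib2.urlopen(request)
print response.getcode()   # HTTP status code
html = response.read()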
(2) Submit a form (POST request)
#!coding=utf-8
import urllib2
import urllib

post_url = ''
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)
response = urllib2.urlopen(request)
html = response.read()
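As a small follow-up (an addition, assuming post_url above points at a real form endpoint), the response object can be inspected before reading the body:

# assumes `response` from the form submission above
print response.getcode()                           # HTTP status code
print response.geturl()                            # final URL after any redirects
print response.info().getheader('Content-Type')    # one of the response headers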
(3) Crawl a Baidu Tieba thread and extract the post content with a regular expression
#!coding=utf-8
import urllib2
import re

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
myPage = urllib2.urlopen(url).read().decode('gbk')
# match each post body in the page
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

f = open('baidu.txt', 'a+')

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

i = 0
for item in items:
    i += 1
    print i
    # strip the simple HTML line breaks and flatten the text onto one line
    text = item.replace('<br>', '')
    text = text.replace('\n', ' ') + '\n'
    print text
    f.write(text)
f.close()
(4) Simulate logging in to the 163 mailbox and download mail content
#coding:utf-8
'''
Simulate logging in to a 163 mailbox and download mail content
'''
import urllib
import urllib2
import cookielib
import re
import time
import json


class Email163:
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self, user, pwd):
        '''
        Log in
        '''
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1
        })
        # note: the login URL differs between webmail versions
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid=' + user + '&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        #print res
        patt = re.compile('sid=([^"]+)', re.I)
        patt = patt.search(res)

        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            #print self.sid
            print '%s Login Successful.....' % (uname)
        else:
            print '%s Login failed....' % (uname)

    def getInBox(self):
        '''
        Fetch the inbox mail list
        '''
        print '\nGet mail lists.....\n'
        sid = self.sid
        url = self.mailBaseUrl + '/jy3/list/list.do?sid=' + sid + '&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # extract the mail list
        mailList = []
        patt = re.compile('<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)<\/a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)<\/a>', re.I | re.S)
        patt = patt.findall(res)
        if patt is None:
            return mailList

        for i in patt:
            line = {
                'from': i[1].decode('utf8'),
                'url': self.mailBaseUrl + i[0],
                'subject': i[2].decode('utf8')
            }
            mailList.append(line)
        return mailList

    def getMailMsg(self, url):
        '''
        Download the mail content
        '''
        content = ''
        print '\n Download.....%s\n' % (url)
        res = urllib2.urlopen(url).read()

        patt = re.compile('contentURL:"([^"]+)"', re.I)
        patt = patt.search(res)
        if patt is None:
            return content
        url = '%s%s' % (self.mailBaseUrl, patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content


'''
Demo
'''
# initialize
mail163 = Email163()
# log in
mail163.login('lpe234@163.com', '944898186')
time.sleep(2)
# fetch the inbox
elist = mail163.getInBox()
# fetch mail content
for i in elist:
    print 'Subject: %s From: %s Content:\n%s' % (i['subject'].encode('utf8'), i['from'].encode('utf8'), mail163.getMailMsg(i['url']).encode('utf8'))
(5) Cases that require logging in
# 1. Handling cookies
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()

# 2. Using a proxy together with cookies
# (proxy_support is not defined in the original snippet; a typical definition would be
#  proxy_support = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'}))
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)

# 3. Handling forms
import urllib
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,  # fk is assumed to be obtained elsewhere (left undefined here, as in the original)
    'login_submit': '登录'
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata
)
result = urllib2.urlopen(req).read()

# 4. Pretending to be a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers
)

# 5. Getting around anti-hotlinking checks: some sites verify the Referer header
headers = {
    'Referer': 'http://www.cnbeta.com/articles'
}
(6) Multithreading
from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the total number of concurrent threads
# JOBS is the number of tasks
q = Queue()
NUM = 2
JOBS = 10

# the actual handler, responsible for processing a single task
def do_something_using(arguments):
    print arguments

# the worker thread: keeps pulling tasks from the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()

# fork NUM threads to wait on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# put JOBS tasks into the queue
for i in range(JOBS):
    q.put(i)

# wait for all JOBS to finish
q.join()
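To connect this back to crawling, here is a minimal sketch (an addition; the URL list is a placeholder) that plugs urllib2 page fetching into the worker/queue pattern above:

import urllib2
from threading import Thread
from Queue import Queue

q = Queue()
NUM = 4                                  # number of worker threads
urls = ['http://www.baidu.com/'] * 10    # placeholder task list

def fetch(url):
    # download a single page and report its size
    try:
        html = urllib2.urlopen(url, timeout=10).read()
        print '%s -> %d bytes' % (url, len(html))
    except urllib2.URLError, e:
        print '%s failed: %s' % (url, e)

def working():
    while True:
        url = q.get()
        fetch(url)
        q.task_done()

for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

for url in urls:
    q.put(url)

q.join()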
The Scrapy framework
Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python, used to crawl websites and extract structured data from their pages. It has a wide range of uses, including data mining, monitoring, and automated testing.
I have only just started learning the framework, so it is hard to judge it fairly; my first impression is that it feels a bit like Java, in that it pulls in quite a few supporting modules.
(I) Create a Scrapy project
# run: scrapy startproject scrapy_test   (creates the Scrapy project)
scrapy_test
├── scrapy.cfg
└── scrapy_test
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        └── __init__.py
(II) Layout of the generated project
scrapy.cfg: the project configuration file
items.py: defines the data structures to extract
pipelines.py: pipeline definitions, used for further processing of the extracted items, such as saving them (a minimal sketch follows this list)
settings.py: the crawler settings file
spiders: the directory that holds the spiders
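As mentioned in the pipelines.py entry above, a pipeline is just a class with a process_item() method. The following is only a minimal sketch of what such a pipelines.py might look like (the class name and the items.jl output file are my own choices, not part of the generated project); it writes each item out as one JSON line:

import json

class JsonWriterPipeline(object):
    def __init__(self):
        # append mode, so repeated runs keep adding lines
        self.f = open('items.jl', 'ab')

    def process_item(self, item, spider):
        # serialize the item's fields and write one JSON object per line
        self.f.write(json.dumps(dict(item)) + '\n')
        return item

To take effect, the pipeline class still has to be registered under ITEM_PIPELINES in settings.py (the exact syntax depends on the Scrapy version).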
(III) Dependencies
The dependencies are a bit of a hassle.
# install the python-dev package
apt-get install python-dev

# twisted, w3lib, six, queuelib, cssselect, libxslt
pip install w3lib
pip install twisted
pip install lxml
apt-get install libxml2-dev libxslt-dev
apt-get install python-lxml
pip install cssselect
pip install pyOpenSSL
sudo pip install service_identity

# once these are installed, a project can be created with: scrapy startproject test
(IV) A crawling example.
(1) Create the Scrapy project
dizzy@dizzy-pc:~/Python/spit$ scrapy startproject itzhaopin

New Scrapy project 'itzhaopin' created in:
    /home/dizzy/Python/spit/itzhaopin

You can start your first spider with:
    cd itzhaopin
    scrapy genspider example example.com
dizzy@dizzy-pc:~/Python/spit$
dizzy@dizzy-pc:~/Python/spit$ cd itzhaopin
dizzy@dizzy-pc:~/Python/spit/itzhaopin$ tree
.
├── itzhaopin
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       └── __init__.py
└── scrapy.cfg

# scrapy.cfg: the project configuration file
# items.py: defines the data structures to extract
# pipelines.py: pipeline definitions, used for further processing of the extracted items, such as saving them
# settings.py: the crawler settings file
# spiders: the directory that holds the spiders
(2) Define the data structure to extract in items.py
from scrapy.item import Item, Field

# define the data we want to extract
class TencentItem(Item):
    name = Field()           # job title
    catalog = Field()        # job category
    workLocation = Field()   # work location
    recruitNumber = Field()  # number of openings
    detailLink = Field()     # link to the job details page
    publishTime = Field()    # publish date
(3) Implement the Spider class
- A spider is a Python class that inherits from scrapy.contrib.spiders.CrawlSpider (the example below uses the simpler BaseSpider) and has three members that must be defined.
- name: the spider's identifier.
- start_urls: a list of URLs the spider starts crawling from.
- parse(): a method called to parse the page content once the pages in start_urls have been downloaded; it should return either the next pages to crawl or a list of items.
Create a new spider, tencent_spider.py, under the spiders directory:
#coding=utf-8
from scrapy.spider import BaseSpider


class DmozSpider(BaseSpider):
    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        filename = response.url.split('/')[-2]
        # save the raw page body to a file named after the URL segment
        open(filename, 'wb').write(response.body)
This one is simpler. Run the spider with: scrapy crawl dmoz. A sketch of a parse() that returns items instead of raw HTML follows below.
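As a sketch of the item-returning style of parse() described above (an addition, not from the original post: DmozItem, its fields, and the XPath expressions are purely illustrative, and HtmlXPathSelector is the selector API of the older Scrapy releases this article targets):

#coding=utf-8
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field


class DmozItem(Item):
    # illustrative fields for a directory entry
    title = Field()
    link = Field()


class DmozItemSpider(BaseSpider):
    name = 'dmoz_items'
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # walk the links in the directory listing (the XPath is illustrative)
        for site in hxs.select('//ul/li'):
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            yield item

It runs the same way: scrapy crawl dmoz_items.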