This post walks through a Python crawler example, focusing on the crawler's architecture and the key modules that make it up: the URL manager, the HTML downloader, and the HTML parser.
Simple crawler architecture
The scheduler (the program entry point) drives four cooperating modules: a URL manager that tracks which pages are pending and which have already been crawled, an HTML downloader that fetches pages, an HTML parser that extracts data and new links, and an HTML outputer that writes the collected results to a file.
Entry-point function (crawler scheduler)
```python
# coding: utf-8
import datetime
from maya_spider import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    # initialization
    def __init__(self):
        self.urls = url_manager.UrlManager()                # URL manager
        self.downloader = html_downloader.HtmlDownloader()  # HTML downloader
        self.parser = html_parser.HtmlParser()              # HTML parser
        self.outputer = html_outputer.HtmlOutputer()        # HTML outputer

    # crawler scheduling loop
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_content = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 10:  # stop after 10 pages; raise this limit to crawl more
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        self.outputer.output_html()

if __name__ == '__main__':
    # set the crawl entry point
    root_url = 'http://baike.baidu.com/view/21087.htm'
    # start time
    print('Timing started..............')
    start_time = datetime.datetime.now()
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
    # end time
    end_time = datetime.datetime.now()
    print('Total time: %ds' % (end_time - start_time).seconds)
```
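The import line implies a package layout along these lines (an assumption based on the module names; the original post does not show the tree):

```
maya_spider/
    __init__.py
    url_manager.py       # UrlManager
    html_downloader.py   # HtmlDownloader
    html_parser.py       # HtmlParser
    html_outputer.py     # HtmlOutputer
spider_main.py           # the entry point shown above
```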
URL manager
```python
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
```
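A quick way to see the de-duplication behaviour is to exercise the manager by hand (a hypothetical standalone usage sketch, assuming UrlManager is importable on its own):

```python
manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/item/Python')
manager.add_new_url('http://baike.baidu.com/item/Python')  # duplicate, silently ignored
print(manager.has_new_url())  # True: one URL is pending
url = manager.get_new_url()   # moves the URL from new_urls to old_urls
manager.add_new_url(url)      # ignored again: it is now in old_urls
print(manager.has_new_url())  # False
```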
HTML downloader
```python
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Pretend to be a browser; CSDN (and many sites) reject the default urllib User-Agent
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        # Build the request
        req = urllib.request.Request(url, headers=headers)
        # Fetch the page
        response = urllib.request.urlopen(req)
        # In Python 3, read() returns bytes, not str; decode() converts it
        return response.read().decode()
```
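The download above has no timeout and assumes the page decodes with the default UTF-8. A slightly more defensive variant might look like this (a sketch, not part of the original project; `download_safe` is a hypothetical helper):

```python
import urllib.request
from urllib.error import URLError

def download_safe(url, timeout=10):
    # Hypothetical defensive variant: adds a timeout and decodes
    # using the charset the server declares, falling back to UTF-8.
    if url is None:
        return None
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            return response.read().decode(charset, errors='replace')
    except URLError as exc:
        print('download failed: %s' % exc)
        return None
```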
HTML parser
```python
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links used to look like /view/123.htm; current pages use /item/...
        links = soup.find_all('a', href=re.compile(r'/item/.*?'))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # extract the title and summary
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1> holds the title
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        print(str(title_node.get_text()))
        res_data['title'] = str(title_node.get_text())
        # <div class="lemma-summary" label-module="lemmaSummary"> holds the summary
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_content):
        if page_url is None or html_content is None:
            return None
        # html_content is already a decoded str, so no from_encoding argument is needed
        soup = BeautifulSoup(html_content, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
```
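As a smoke test, the parser can be run against a hand-written fragment that mimics the Baidu Baike markup it expects (the snippet and URLs below are made up for illustration):

```python
parser = HtmlParser()
sample = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
          '<div class="lemma-summary">A programming language.</div>'
          '<a href="/item/Guido">Guido</a>')
urls, data = parser.parse('http://baike.baidu.com/item/Python', sample)
print(urls)           # {'http://baike.baidu.com/item/Guido'}
print(data['title'])  # Python
```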
HTML outputer
```python
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('maya.html', 'w', encoding='utf-8')
        fout.write("<head><meta http-equiv='content-type' content='text/html;charset=utf-8'></head>")
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table border="1">')
        # <th width="5%">url</th>
        fout.write('''<tr style="color:red" width="90%">
            <th>theme</th>
            <th width="80%">content</th>
        </tr>''')
        for data in self.datas:
            fout.write('<tr>\n')
            # fout.write('\t<td>%s</td>' % data['url'])
            fout.write('\t<td align="center"><a href=\'%s\'>%s</a></td>' % (data['url'], data['title']))
            fout.write('\t<td>%s</td>\n' % data['summary'])
            fout.write('</tr>\n')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
```
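For a quick standalone check, the outputer can be fed a hand-built record (a sketch; the dictionary values are made up):

```python
outputer = HtmlOutputer()
outputer.collect_data({'url': 'http://baike.baidu.com/item/Python',
                       'title': 'Python',
                       'summary': 'A programming language.'})
outputer.output_html()  # writes maya.html in the current directory
```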
Running results
Running spider_main crawls ten Baidu Baike entries starting from the Python page and writes them to maya.html as a table of linked titles and summaries.
Appendix: full code
That's all for this article. We hope it helps with your studies, and we hope you will continue to support 服务器之家.
Original article: https://blog.csdn.net/sunhuaqiang1/article/details/66472363