简介
今天试着用python做了一个抓取网页内容,并生成word文档的功能,功能很简单,做一下记录以备以后用到。
生成word用到了第三方组件python-docx,所以先进行第三方组件的安装。由于windows下安装的python默认不带setuptools这个模块,所以要先安装setuptools这个模块。
安装
1、在python官网上找到 https://bootstrap.pypa.io/ez_setup.py ,把代码保存到本地并执行: python ez_setup.py
2、下载python-docx (https://pypi.python.org/pypi/python-docx/0.7.4),下载完成后解压并进入到 XXX\python-docx-0.7.4 安装python-docx : python setup.py install
这样python-docx就安装成功了,可以用它来操作word文档了,word文档的生成参考的这里https://python-docx.readthedocs.org/en/latest/index.html
html解析用到的是sgmllib里的SGMLParser url内容的获取用到的是urllib、urllib2
实现代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
# -*- coding: cp936 -*- from sgmllib import SGMLParser import os import sys import urllib import urllib2 from docx import Document from docx.shared import Inches import time ##获取要解析的url class GetUrl(SGMLParser): def __init__( self ): SGMLParser.__init__( self ) self .start = False self .urlArr = [] def start_div( self ,attr): for name,value in attr: if value = = "ChairmanCont Bureau" : #页面js中的固定值 self .start = True def end_div( self ): self .start = False def start_a( self ,attr): if self .start: for name,value in attr: self .urlArr.append(value) def getUrlArr( self ): return self .urlArr ##解析上面获取的url,获取有用数据 class getManInfo(SGMLParser): def __init__( self ): SGMLParser.__init__( self ) self .start = False self .p = False self .dl = False self .manInfo = [] self .subInfo = [] def start_div( self ,attr): for name,value in attr: if value = = "SpeakerInfo" : #页面js中的固定值 self .start = True def end_div( self ): self .start = False def start_p( self ,attr): if self .dl: self .p = True def end_p( self ): self .p = False def start_img( self ,attr): if self .dl: for name,value in attr: self .subInfo.append(value) def handle_data( self ,data): if self .p: self .subInfo.append(data.decode( 'utf-8' )) def start_dl( self ,attr): if self .start: self .dl = True def end_dl( self ): self .manInfo.append( self .subInfo) self .subInfo = [] self .dl = False def getManInfo( self ): return self .manInfo urlSource = "http://www.XXX" sourceData = urllib2.urlopen(urlSource).read() startTime = time.clock() ##get urls getUrl = GetUrl() getUrl.feed(sourceData) urlArr = getUrl.getUrlArr() getUrl.close() print "get url use:" + str ((time.clock() - startTime)) startTime = time.clock() ##get maninfos manInfos = getManInfo() for url in urlArr: #one url one person data = urllib2.urlopen(url).read() manInfos.feed(data) infos = manInfos.getManInfo() manInfos.close() print "get maninfos use:" + str ((time.clock() - startTime)) startTime = time.clock() #word saveFile = os.getcwd() + "\\xxx.docx" doc = 
Document() ##word title doc.add_heading( "HEAD" .decode( 'gbk' ), 0 ) p = doc.add_paragraph( "HEADCONTENT:" .decode( 'gbk' )) ##write info for infoArr in infos: i = 0 for info in infoArr: if i = = 0 : ##img url arr1 = info.split( '.' ) suffix = arr1[ len (arr1) - 1 ] arr2 = info.split( '/' ) preffix = arr2[ len (arr2) - 2 ] imgFile = os.getcwd() + "\\imgs\\"+preffix+" ." + suffix if not os.path.exists(os.getcwd() + "\\imgs" ): os.mkdir(os.getcwd() + "\\imgs" ) imgData = urllib2.urlopen(info).read() try : f = open (imgFile, 'wb' ) f.write(imgData) f.close() doc.add_picture(imgFile,width = Inches( 1.25 )) os.remove(imgFile) except Exception as err: print (err) elif i = = 1 : doc.add_heading(info + ":" ,level = 1 ) else : doc.add_paragraph(info,style = 'ListBullet' ) i = i + 1 doc.save(saveFile) print "word use:" + str ((time.clock() - startTime)) |
总结
以上就是本文关于python解析html提取数据,并生成word文档实例解析的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!
原文链接:http://blog.csdn.net/how8586/article/details/39399217