简介
今天试着用python做了一个抓取网页内容,并生成word文档的功能,功能很简单,做一下记录以备以后用到。
生成word用到了第三方组件python-docx,所以先进行第三方组件的安装。由于windows下安装的python默认不带setuptools这个模块,所以要先安装setuptools这个模块。
安装
1、在python官网上找到 https://bootstrap.pypa.io/ez_setup.py ,把代码保存到本地并执行: python ez_setup.py
2、下载python-docx (https://pypi.python.org/pypi/python-docx/0.7.4),下载完成后解压并进入到 XXX\python-docx-0.7.4 安装python-docx : python setup.py install
这样python-docx就安装成功了,可以用它来操作word文档了,word文档的生成参考的这里https://python-docx.readthedocs.org/en/latest/index.html
html解析用到的是sgmllib里的SGMLParser url内容的获取用到的是urllib、urllib2
实现代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
# -*- coding: cp936 -*- from sgmllib import SGMLParser import os import sys import urllib import urllib2 from docx import Document from docx.shared import Inches import time ##获取要解析的url class GetUrl(SGMLParser): def __init__( self ): SGMLParser.__init__( self ) self .start = False self .urlArr = [] def start_div( self ,attr): for name,value in attr: if value = = "ChairmanCont Bureau" : #页面js中的固定值 self .start = True def end_div( self ): self .start = False def start_a( self ,attr): if self .start: for name,value in attr: self .urlArr.append(value) def getUrlArr( self ): return self .urlArr ##解析上面获取的url,获取有用数据 class getManInfo(SGMLParser): def __init__( self ): SGMLParser.__init__( self ) self .start = False self .p = False self .dl = False self .manInfo = [] self .subInfo = [] def start_div( self ,attr): for name,value in attr: if value = = "SpeakerInfo" : #页面js中的固定值 self .start = True def end_div( self ): self .start = False def start_p( self ,attr): if self .dl: self .p = True def end_p( self ): self .p = False def start_img( self ,attr): if self .dl: for name,value in attr: self .subInfo.append(value) def handle_data( self ,data): if self .p: self .subInfo.append(data.decode( 'utf-8' )) def start_dl( self ,attr): if self .start: self .dl = True def end_dl( self ): self .manInfo.append( self .subInfo) self .subInfo = [] self .dl = False def getManInfo( self ): return self .manInfo urlSource = "http://www.XXX" sourceData = urllib2.urlopen(urlSource).read() startTime = time.clock() ##get urls getUrl = GetUrl() getUrl.feed(sourceData) urlArr = getUrl.getUrlArr() getUrl.close() print "get url use:" + str ((time.clock() - startTime)) startTime = time.clock() ##get maninfos manInfos = getManInfo() for url in urlArr: #one url one person data = urllib2.urlopen(url).read() manInfos.feed(data) infos = manInfos.getManInfo() manInfos.close() print "get maninfos use:" + str ((time.clock() - startTime)) startTime = time.clock() #word saveFile = os.getcwd() + "\\xxx.docx" doc = 
Document() ##word title doc.add_heading( "HEAD" .decode( 'gbk' ), 0 ) p = doc.add_paragraph( "HEADCONTENT:" .decode( 'gbk' )) ##write info for infoArr in infos: i = 0 for info in infoArr: if i = = 0 : ##img url arr1 = info.split( '.' ) suffix = arr1[ len (arr1) - 1 ] arr2 = info.split( '/' ) preffix = arr2[ len (arr2) - 2 ] imgFile = os.getcwd() + "\\imgs\\"+preffix+" ." + suffix if not os.path.exists(os.getcwd() + "\\imgs" ): os.mkdir(os.getcwd() + "\\imgs" ) imgData = urllib2.urlopen(info).read() try : f = open (imgFile, 'wb' ) f.write(imgData) f.close() doc.add_picture(imgFile,width = Inches( 1.25 )) os.remove(imgFile) except Exception as err: print (err) elif i = = 1 : doc.add_heading(info + ":" ,level = 1 ) else : doc.add_paragraph(info,style = 'ListBullet' ) i = i + 1 doc.save(saveFile) print "word use:" + str ((time.clock() - startTime)) |
总结
以上就是本文关于python解析html提取数据,并生成word文档实例解析的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!
原文链接:http://blog.csdn.net/how8586/article/details/39399217