脚本之家,脚本语言编程技术及教程分享平台!
分类导航

Python|VBS|Ruby|Lua|perl|VBA|Golang|PowerShell|Erlang|autoit|Dos|bat|

服务器之家 - 脚本之家 - Python - Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地

Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地

2021-01-17 00:10 quzishen Python

这篇文章主要为大家详细介绍了Python抓取聚划算商品分析页面获取商品信息,并以XML格式保存到本地的方法,具有一定的参考价值,感兴趣的小伙伴们可以参考一下

本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码,供大家参考,具体内容如下

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/user/bin/python
# NOTE(review): shebang path looks like a typo -- should probably be /usr/bin/python
# -*- coding: gbk -*-
#Spider.py
 
import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup
 
## Workaround so that printing Chinese characters to the console does not
## raise encoding errors (Python 2 only: re-set the default str encoding).
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################
 
## Debug switch: when enabled, HTTP request header info and debug logs are shown.
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Switch controlling whether the fetched page source is printed.
showSrcCode = False
## Compression scheme requested from / expected of the server.
ZIP_TYPE = "gzip"
 
# Base name of the output XML file; reassigned per-city in __main__.
fileName = "auctions"
# Output directory for the saved XML files.
location = "d://spiderData/"
 
## HTTP request headers sent with every fetch.
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE}
#####################################################
#####################################################
 
 
#############class SpiderConfig #####################
class SpiderConfig:
 """
  A single crawl target: a short identifying name plus the page URL to fetch.
 """
 def __init__(self, name, url):
  # Keep both values on the instance; the handler reads .url and
  # __main__ reads .name to pick the output file.
  self.name, self.url = name, url
#####################################################
 
##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
 """
  Value object for one scraped auction item.

  All fields are plain strings defaulting to "" at class level, so the
  XML writer can read every attribute even when a parse regex failed.
 """
 title = ""
 url = ""
 img = ""
 price = ""
 # Bug fix: parse() assigns .link only when its link regex matches, but
 # save() unconditionally reads auc.link -- without this default an item
 # whose link was not found raised AttributeError during save.
 link = ""
 
 def __init__(self):
  pass
 
#####################################################
 
########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
 # Installed into the opener in SpiderHandler.spider(); instead of letting
 # urllib2 raise on HTTP errors, it logs the failing URL/status and returns
 # the error object as the response so the caller can continue.
 def http_error_default(self, req, fp, code, msg, hdrs):
  """
   Default error handler for the spider: wrap the failure in an
   HTTPError, tag it with status and url, log it, and return it
   (rather than raising) so spidering proceeds.
  """
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
  result.status = code
  result.url = req.get_full_url()
 
  print "<", result.url, "Exception code :", result.status, ">"
 
  return result
#####################################################
 
#############class SpiderHandler#####################
class SpiderHandler:
 """
  Spider workflow: fetch a page (spider), extract auction items from its
  HTML (parse), sort them by price (sortAuctionList) and write them to an
  XML file (save). Relies on the module-level config globals
  (headerConfig, ZIP_TYPE, DEBUG, showSrcCode, fileName, location).
 """
 
 def spider(self, spiderConfig):
  # Fetch spiderConfig.url, transparently un-gzip the body if needed,
  # then hand the HTML to parse(). All errors are caught and logged.
  try:
   request = urllib2.Request(spiderConfig.url)
 
   ## configure request header from the module-level headerConfig
   for key,val in headerConfig.items():
    request.add_header(key, val)
 
   ## build opener with the non-raising error handler
   opener = urllib2.build_opener(SpiderDefaultErrorHandler())
 
   ## open request
   openRequest = opener.open(request)
 
   ## read data
   spiderData = openRequest.read()
 
   ## close
   opener.close()
 
   # Nothing fetched -> nothing to parse.
   if 0 == len(spiderData):
    return
 
   # Server honoured our Accept-encoding: gzip -> decompress first.
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"):
    spiderData = SpiderHandler.gzipData(self, spiderData)
 
   # Optionally dump the raw page source when debugging.
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
    print spiderData
 
   # parse html
   SpiderHandler.parse(self, spiderData)
 
  except Exception,x:
   print "spider process Exception:", x
 
 
 
 def parse(self, spiderData):
  """
   Parse the fetched HTML: find every auction block, regex out link,
   title, price and image URL, then sort by price and save as XML.
  """
 
  if httplib.HTTPConnection.debuglevel == DEBUG:
   charsetAnalyze = chardet.detect(spiderData)
   print "analyze spider data encode :",charsetAnalyze["encoding"]
 
  print "执行解析", fileName
 
  soup = BeautifulSoup(spiderData)
  encode = soup.originalEncoding
 
  # Helper to re-encode extracted unicode back into the page's charset.
  encoding = lambda x : x.encode(encode)
 
  if httplib.HTTPConnection.debuglevel == DEBUG:
   print "识别到编码:", encode
   title = soup.head.title.string
   print encoding(title)
 
  # Each product card lives in <div class="main-box avil">; stringify each
  # matched tag so the per-item regexes below can run on plain markup.
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"})
  auctions = ["%s" % s for s in spiderContents]
 
  # NOTE(review): dead check -- a list comprehension is never None;
  # an empty page yields [] and the loop below simply does nothing.
  if auctions is None:
   return
 
  auctionList = []
 
  for auc in auctions:
   auctionDomain = SpiderAuctionDomain()
   # parse auction link (item_id captured from the life_home.htm href)
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc)
   if links is not None :
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))
 
   #parse auction title (text immediately before </a></h2>)
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
   if titles is not None:
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))
 
   #parse auction price (numeric text inside the J_juPrices element)
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
   if price is not None:
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])
 
   #parse image url (first img src in the card)
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
   if imgs is not None:
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])
 
   auctionList.append(auctionDomain)
 
  print "成功解析商品信息:"
  for a in auctionList:
   print "--->",a.title
 
  # sort auction list (ascending by price)
  auctionList = SpiderHandler.sortAuctionList(self, auctionList)
 
  # save in file (writes <fileName>.xml under the module-level location)
  SpiderHandler.save(self, auctionList)
 
  print "解析完成"
 
  pass
 
 def sortAuctionList(self, auctionList):
  """
   Bubble sort, ordering items by price (ascending, compared as float).
   Sorts in place and also returns the list.
  """
  length = len(auctionList)
  if length < 2:
   return auctionList
  else:
   for i in range(length-1):
    for j in range(length - i -1):
     if float(auctionList[j].price) > float(auctionList[j+1].price):
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j]
  return auctionList
  pass
 
 def save(self, auctionList):
  # Serialize the auction list as pretty-printed XML into
  # <location>/<fileName>.xml, creating the output directory if missing.
  if auctionList is not None:
   doc = Document()
 
   auctions = doc.createElement("auctions")
   doc.appendChild(auctions)
 
   for auc in auctionList:
    auction = doc.createElement("auction")
    auctions.appendChild(auction)
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
    # NOTE(review): auc.link is only set when the link regex matched;
    # SpiderAuctionDomain declares no default, so this can raise
    # AttributeError for items without a parsed link.
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link)
 
   if False == os.path.exists(location):
    os.mkdir(location)
 
   file = open(location+fileName+".xml", 'w')
   file.write(doc.toprettyxml())
   file.close()
 
   if httplib.HTTPConnection.debuglevel == DEBUG:
    print doc.toprettyxml()
 
 def generateXML(self, doc, f, name, txt):
  # Append <name>txt</name> as a child element of node f.
  c = doc.createElement(name)
  f.appendChild(c)
  c.appendChild(doc.createTextNode(txt))
 
 def gzipData(self, spiderData):
  """
   Decompress a gzip-encoded response body; empty input is returned as-is.
  """
  if 0 == len(spiderData):
   return spiderData
  spiderDataStream = StringIO.StringIO(spiderData)
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
  return spiderData
#####################################################
 
if __name__ == "__main__":
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒")
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou",
      "hangzhou":"http://ju.taobao.com/hangzhou",
      "shanghai":"http://ju.taobao.com/shanghai",
      "beijing":"http://ju.taobao.com/beijing",
      "chengdu":"http://ju.taobao.com/chengdu"}
 
 configList = []
 for k,v in needSpiderUrl.items():
  spiderConfig = SpiderConfig(k, v)
  configList.append(spiderConfig)
 
 spiderHandler = SpiderHandler()
 
 print "爬虫执行开始时间:",nowtime()
 for spiderConfig in configList:
  fileName = spiderConfig.name
  spiderHandler.spider(spiderConfig)
 
 print "爬虫执行完毕时间:",nowtime()

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

原文链接:http://blog.csdn.net/quzishen/article/details/6859133

延伸 · 阅读

精彩推荐