This post shares the full code for a Python spider that crawls 聚划算 (Juhuasuan) product pages, extracts the product information, and saves it to a file, for your reference. The details are as follows:
#!/usr/bin/python
# -*- coding: gbk -*-
# Spider.py
import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Work around encoding errors when printing Chinese characters to the console
reload(sys)
sys.setdefaultencoding("utf8")

#####################################################
## Debug switch; when enabled, HTTP request headers and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the crawled page source
showSrcCode = False
## Compression scheme
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d://spiderData/"

## request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-encoding": ZIP_TYPE}
#####################################################

#############class SpiderConfig #####################
class SpiderConfig:
    """
        configuration for spider name and url
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
    """
        Store information about auctions spidered by python
    """
    title = ""
    link = ""
    img = ""
    price = ""

    def __init__(self):
        pass
#####################################################

########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
            default error process handler for spider
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()
        print "<", result.url, "Exception code :", result.status, ">"
        return result
#####################################################

#############class SpiderHandler#####################
class SpiderHandler:
    """
        spider handler
    """

    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)

            ## configure request headers
            for key, val in headerConfig.items():
                request.add_header(key, val)

            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())

            ## open request
            openRequest = opener.open(request)

            ## read data
            spiderData = openRequest.read()

            ## close
            opener.close()

            if 0 == len(spiderData):
                return

            ## the response body is gzip-compressed, so inflate it first
            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = self.gzipData(spiderData)

            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData

            # parse html
            self.parse(spiderData)

        except Exception, x:
            print "spider process Exception:", x

    def parse(self, spiderData):
        """
            parse html content
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode :", charsetAnalyze["encoding"]

        print "执行解析", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding

        encoding = lambda x: x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "识别到编码:", encode
            title = soup.head.title.string
            print encoding(title)

        ## each product sits in a <div class="main-box avil"> container
        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]

        if not auctions:
            return

        auctionList = []

        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()

            # parse auction link
            links = re.search(re.compile(r'<a href=[\"\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"\']', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + " ".join(["%s" % s for s in links.groups() if len(s) > 0]))

            # parse auction title
            titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding(" ".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            # parse auction price
            price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = " ".join(["%s" % p for p in price.groups() if len(p) > 0])

            # parse image url
            imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = " ".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "成功解析商品信息:"
        for a in auctionList:
            print "--->", a.title

        # sort auction list
        auctionList = self.sortAuctionList(auctionList)

        # save in file
        self.save(auctionList)

        print "解析完成"

    def sortAuctionList(self, auctionList):
        """
            bubble sort the auctions by price, ascending
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        else:
            for i in range(length - 1):
                for j in range(length - i - 1):
                    if float(auctionList[j].price) > float(auctionList[j + 1].price):
                        auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
            return auctionList

    def save(self, auctionList):
        """
            write the auctions into a pretty-printed XML file
        """
        if auctionList is not None:
            doc = Document()

            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)

            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)

                self.generateXML(doc, auction, "title", auc.title)
                self.generateXML(doc, auction, "price", auc.price)
                self.generateXML(doc, auction, "img", auc.img)
                self.generateXML(doc, auction, "link", auc.link)

            if not os.path.exists(location):
                os.mkdir(location)

            outFile = open(location + fileName + ".xml", 'w')
            outFile.write(doc.toprettyxml())
            outFile.close()

            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, parent, name, txt):
        c = doc.createElement(name)
        parent.appendChild(c)
        c.appendChild(doc.createTextNode(txt))

    def gzipData(self, spiderData):
        """
            get data from gzip
        """
        if 0 == len(spiderData):
            return spiderData

        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################

if __name__ == "__main__":
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y年%m月%d日 %H时%M分%S秒")

    needSpiderUrl = {
        "suzhou": "http://ju.taobao.com/suzhou",
        "hangzhou": "http://ju.taobao.com/hangzhou",
        "shanghai": "http://ju.taobao.com/shanghai",
        "beijing": "http://ju.taobao.com/beijing",
        "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()

    print "爬虫执行开始时间:", nowtime()
    for spiderConfig in configList:
        fileName = spiderConfig.name
        spiderHandler.spider(spiderConfig)

    print "爬虫执行完毕时间:", nowtime()
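Note that the script above is Python 2 code (urllib2, httplib, StringIO, print statements) built on the legacy BeautifulSoup 3 package, so it will not run on a modern Python 3 installation, and the Juhuasuan city pages it targets have changed since it was written. Purely as an illustrative sketch of the same fetch-parse-save flow in Python 3, the fragment below assumes the third-party requests and beautifulsoup4 packages are installed, and carries over the "main-box avil" container class from the original markup, which the live site may no longer serve:

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same flow: fetch a page, pick product
# blocks out of the HTML, and save them to an XML file. The URL and the
# "main-box avil" class are carried over from the original script and
# are assumptions about markup that may no longer exist.
import os
import requests
from bs4 import BeautifulSoup
from xml.dom.minidom import Document

URL = "http://ju.taobao.com/hangzhou"   # one of the original city pages
LOCATION = "spiderData"                 # output directory

def fetch(url):
    # requests transparently handles gzip decompression and charset
    # detection, replacing the urllib2/httplib/chardet/StringIO machinery
    resp = requests.get(url, headers={"User-Agent": "taobao-yanyuan.qzs"}, timeout=10)
    resp.raise_for_status()
    return resp.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    items = []
    # same container class the original walked through with regexes
    for box in soup.find_all("div", class_="main-box avil"):
        link = box.find("a", href=True)
        img = box.find("img", src=True)
        items.append({
            "title": link.get_text(strip=True) if link else "",
            "link": link["href"] if link else "",
            "img": img["src"] if img else "",
        })
    return items

def save(items, name):
    # same minidom output format as the original save() method
    doc = Document()
    root = doc.createElement("auctions")
    doc.appendChild(root)
    for item in items:
        auction = doc.createElement("auction")
        root.appendChild(auction)
        for key, value in item.items():
            node = doc.createElement(key)
            node.appendChild(doc.createTextNode(value))
            auction.appendChild(node)
    os.makedirs(LOCATION, exist_ok=True)
    with open(os.path.join(LOCATION, name + ".xml"), "w", encoding="utf-8") as f:
        f.write(doc.toprettyxml())

if __name__ == "__main__":
    save(parse(fetch(URL)), "hangzhou")

Because the HTTP library already deals with compression and encoding, all of the remaining work is the parsing itself; extracting fields with a real HTML parser rather than regexes also survives small markup changes better.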
That is all for this article. We hope it helps with your learning, and we hope you will continue to support 服务器之家.
Original article: http://blog.csdn.net/quzishen/article/details/6859133