This post shares the full code for a Python spider that crawls 聚划算 (Juhuasuan) product pages, extracts the product information, and saves it to a file, for your reference. The details are as follows:
#!/usr/bin/python
# -*- coding: gbk -*-
# Spider.py
import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Work around encoding errors when printing Chinese characters to the console
reload(sys)
sys.setdefaultencoding("utf8")

#####################################################
## Debug switch; when enabled, HTTP request headers and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the crawled page source
showSrcCode = False
## Compression scheme
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d://spiderData/"

## request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-encoding": ZIP_TYPE}
#####################################################

#############class SpiderConfig #####################
class SpiderConfig:
    """
        configuration for spider name and url
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
    """
        Store information about auctions spidered by python
    """
    title = ""
    link = ""
    img = ""
    price = ""

    def __init__(self):
        pass
#####################################################

########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
            default error process handler for spider
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()
        print "<", result.url, "Exception code :", result.status, ">"
        return result
#####################################################

#############class SpiderHandler#####################
class SpiderHandler:
    """
        spider handler
    """

    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)

            ## configure request headers
            for key, val in headerConfig.items():
                request.add_header(key, val)

            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())

            ## open request
            openRequest = opener.open(request)

            ## read data
            spiderData = openRequest.read()

            ## close
            opener.close()

            if 0 == len(spiderData):
                return

            ## the response body is gzip-compressed, so inflate it first
            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = self.gzipData(spiderData)

            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData

            # parse html
            self.parse(spiderData)

        except Exception, x:
            print "spider process Exception:", x

    def parse(self, spiderData):
        """
            parse html content
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode :", charsetAnalyze["encoding"]

        print "执行解析", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding

        encoding = lambda x: x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "识别到编码:", encode
            title = soup.head.title.string
            print encoding(title)

        ## each product sits in a <div class="main-box avil"> container
        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]

        if not auctions:
            return

        auctionList = []

        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()

            # parse auction link
            links = re.search(re.compile(r'<a href=[\"\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"\']', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + " ".join(["%s" % s for s in links.groups() if len(s) > 0]))

            # parse auction title
            titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding(" ".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            # parse auction price
            price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = " ".join(["%s" % p for p in price.groups() if len(p) > 0])

            # parse image url
            imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = " ".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "成功解析商品信息:"
        for a in auctionList:
            print "--->", a.title

        # sort auction list
        auctionList = self.sortAuctionList(auctionList)

        # save in file
        self.save(auctionList)

        print "解析完成"

    def sortAuctionList(self, auctionList):
        """
            bubble sort the auctions by price, ascending
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        else:
            for i in range(length - 1):
                for j in range(length - i - 1):
                    if float(auctionList[j].price) > float(auctionList[j + 1].price):
                        auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
            return auctionList

    def save(self, auctionList):
        """
            write the auctions into a pretty-printed XML file
        """
        if auctionList is not None:
            doc = Document()

            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)

            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)

                self.generateXML(doc, auction, "title", auc.title)
                self.generateXML(doc, auction, "price", auc.price)
                self.generateXML(doc, auction, "img", auc.img)
                self.generateXML(doc, auction, "link", auc.link)

            if not os.path.exists(location):
                os.mkdir(location)

            outFile = open(location + fileName + ".xml", 'w')
            outFile.write(doc.toprettyxml())
            outFile.close()

            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, parent, name, txt):
        c = doc.createElement(name)
        parent.appendChild(c)
        c.appendChild(doc.createTextNode(txt))

    def gzipData(self, spiderData):
        """
            get data from gzip
        """
        if 0 == len(spiderData):
            return spiderData

        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################

if __name__ == "__main__":
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y年%m月%d日 %H时%M分%S秒")

    needSpiderUrl = {
        "suzhou": "http://ju.taobao.com/suzhou",
        "hangzhou": "http://ju.taobao.com/hangzhou",
        "shanghai": "http://ju.taobao.com/shanghai",
        "beijing": "http://ju.taobao.com/beijing",
        "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()

    print "爬虫执行开始时间:", nowtime()
    for spiderConfig in configList:
        fileName = spiderConfig.name
        spiderHandler.spider(spiderConfig)

    print "爬虫执行完毕时间:", nowtime()
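Note that the script above is Python 2 code (urllib2, httplib, StringIO, print statements) built on the legacy BeautifulSoup 3 package, so it will not run on a modern Python 3 installation, and the Juhuasuan city pages it targets have changed since it was written. Purely as an illustrative sketch of the same fetch-parse-save flow in Python 3, the fragment below assumes the third-party requests and beautifulsoup4 packages are installed, and carries over the "main-box avil" container class from the original markup, which the live site may no longer serve:

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same flow: fetch a page, pick product
# blocks out of the HTML, and save them to an XML file. The URL and the
# "main-box avil" class are carried over from the original script and
# are assumptions about markup that may no longer exist.
import os
import requests
from bs4 import BeautifulSoup
from xml.dom.minidom import Document

URL = "http://ju.taobao.com/hangzhou"   # one of the original city pages
LOCATION = "spiderData"                 # output directory

def fetch(url):
    # requests transparently handles gzip decompression and charset
    # detection, replacing the urllib2/httplib/chardet/StringIO machinery
    resp = requests.get(url, headers={"User-Agent": "taobao-yanyuan.qzs"}, timeout=10)
    resp.raise_for_status()
    return resp.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    items = []
    # same container class the original walked through with regexes
    for box in soup.find_all("div", class_="main-box avil"):
        link = box.find("a", href=True)
        img = box.find("img", src=True)
        items.append({
            "title": link.get_text(strip=True) if link else "",
            "link": link["href"] if link else "",
            "img": img["src"] if img else "",
        })
    return items

def save(items, name):
    # same minidom output format as the original save() method
    doc = Document()
    root = doc.createElement("auctions")
    doc.appendChild(root)
    for item in items:
        auction = doc.createElement("auction")
        root.appendChild(auction)
        for key, value in item.items():
            node = doc.createElement(key)
            node.appendChild(doc.createTextNode(value))
            auction.appendChild(node)
    os.makedirs(LOCATION, exist_ok=True)
    with open(os.path.join(LOCATION, name + ".xml"), "w", encoding="utf-8") as f:
        f.write(doc.toprettyxml())

if __name__ == "__main__":
    save(parse(fetch(URL)), "hangzhou")

Because the HTTP library already deals with compression and encoding, all of the remaining work is the parsing itself; extracting fields with a real HTML parser rather than regexes also survives small markup changes better.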
That is all for this article. We hope it helps with your learning, and we hope you will continue to support 服务器之家.
Original article: http://blog.csdn.net/quzishen/article/details/6859133