This article shares working code for scraping Tmall product details and transaction records with Python, for your reference. The details are as follows.
1. Setting up the Python environment
This post uses Python 2.7.
Modules involved: spynner, scrapy, bs4 (BeautifulSoup), pymssql
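Before running the crawler it is worth checking that everything imports cleanly. The snippet below is a minimal sanity check, assuming the packages were installed with pip under Python 2.7 (spynner additionally needs PyQt/PySide with QtWebKit, which may have to be installed separately); it is only a sketch, not part of the original post.

#coding:utf-8
# Environment sanity check (a sketch; the pip package names are assumptions).
import sys
print sys.version                      # should report 2.7.x

import spynner                          # headless QtWebKit browser used for page loading and clicks
import pymssql                          # SQL Server driver
from scrapy.selector import Selector    # XPath extraction from page HTML
from bs4 import BeautifulSoup           # HTML parsing of the deal-record table

print "All required modules imported successfully."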
2. The Tmall data to capture
For each product the script collects the detail-page fields (title, price, promotional price, postage, stock, favourite count, rating count, monthly sales and the attribute list) together with the transaction records (buyer, style, quantity, deal date and deal time).
3. Data-scraping workflow
For every product URL read from the ProductURLs table, the script loads the detail page with spynner, extracts the product fields by XPath, clicks through to the description tab to collect the attribute list, then opens the transaction-record tab, parses each page of records with BeautifulSoup, and follows the "next page" link until it disappears. All results are written to a SQL Server database via pymssql.
4. Source code
#coding:utf-8
import spynner
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import random
import pymssql

#------------------------ connect to the database ------------------------#
server = "localhost"
user = "sa"
password = "123456"
conn = pymssql.connect(server, user, password, "TmallData")
if conn:
    print "DataBase connecting successfully!"
else:
    print "DataBase connecting error!"
cursor = conn.cursor()

#---------------------- web page helper functions ----------------------#
def py_click_element(browser, pos):
    # Click an element on the page.
    # pos example: 'a[href="#description"]'
    browser.click(pos)
    browser.wait(random.randint(3, 10))
    return browser

def py_click_xpath(browser, xpath):
    # Resolve the href behind an XPath and click the matching anchor.
    xpath = xpath + '/@href'
    inner_href = Selector(text=browser.html).xpath(xpath).extract()
    pos = 'a[href="' + str(inner_href[0]) + '"]'
    browser = py_click_element(browser, pos)
    return browser

def py_webpage_load(browser, url):
    browser.load(url, load_timeout=60)
    browser.wait(10)
    return browser

def py_check_element(browser, xpath):
    # Look up an element by XPath; return True if it exists, otherwise False.
    if Selector(text=browser.html).xpath(xpath).extract() != []:
        return True
    else:
        return False

def py_extract_xpath(browser, xpath):
    if py_check_element(browser, xpath):
        return Selector(text=browser.html).xpath(xpath).extract()[0]
    else:
        return "none"

def py_extract_xpaths(browser, xpaths):
    # Extract several pieces of page content in one batch.
    length = len(xpaths)
    results = [0] * length
    for i in range(length):
        results[i] = py_extract_xpath(browser, xpaths[i])
    return results

#---------------------- database helper functions ----------------------#
#---------------------- data extraction functions ----------------------#
def py_getDealReord(doc):
    # Parse one page of the transaction-record table into
    # [buyer, style, quantity, deal date, deal time] rows.
    soup = BeautifulSoup(doc, 'lxml')
    tr = soup.find_all('tr')
    total_dealRecord = [([0] * 5) for i in range(len(tr))]
    i = -1
    for this_tr in tr:
        i = i + 1
        td_user = this_tr.find_all('td', attrs={'class': "cell-align-l buyer"})
        for this_td in td_user:
            total_dealRecord[i][0] = this_td.getText().strip(' ')
        td_style = this_tr.find_all('td', attrs={'class': "cell-align-l style"})
        for this_td in td_style:
            total_dealRecord[i][1] = this_td.getText(',').strip(' ')
        td_quantity = this_tr.find_all('td', attrs={'class': "quantity"})
        for this_td in td_quantity:
            total_dealRecord[i][2] = this_td.getText().strip(' ')
        td_dealtime = this_tr.find_all('td', attrs={'class': "dealtime"})
        for this_td in td_dealtime:
            total_dealRecord[i][3] = this_td.find('p', attrs={'class': "date"}).getText()
            total_dealRecord[i][4] = this_td.find('p', attrs={'class': "time"}).getText()
    return total_dealRecord

#-------------------- fetch all product links to crawl --------------------#
cursor.execute("""
select * from ProductURLs where BrandName='NB'
""")

# Error log for URLs that fail to load (opened in append mode so it is writable).
err_log = open("H:\\Eclipse\\TmallCrawling\\HTMLParse\\errLog.txt", 'a')

InProductInfo = cursor.fetchall()
browser = spynner.Browser()
for temp_InProductInfo in InProductInfo:

    url = 'https:' + temp_InProductInfo[2]

    BrandName = temp_InProductInfo[0]
    ProductType = temp_InProductInfo[1]
    print BrandName, '\t', ProductType, '\t', url

    #url = 'https://detail.tmall.com/item.htm?id=524425656711&rn=77636d6db8dea5e30060976fdaf9768d&abbucket=19'

    try:
        browser = py_webpage_load(browser, url)
    except:
        print "Loading webpage failed."
        err_log.write(url)
        err_log.write('\n')
        continue

    # Product-level fields: promo price, price, title, postage, stock,
    # favourite count, rating count, monthly sales.
    xpaths = ['//*[@id="J_PromoPrice"]/dd/div/span/text()',
              '//*[@id="J_StrPriceModBox"]/dd/span/text()',
              '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()',
              '//*[@id="J_PostageToggleCont"]/p/span/text()',
              '//*[@id="J_EmStock"]/text()',
              '//*[@id="J_CollectCount"]/text()',
              '//*[@id="J_ItemRates"]/div/span[2]/text()',
              '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()']
    out_ProductInfo = py_extract_xpaths(browser, xpaths)

    # Open the description tab and collect the product attribute list.
    browser = py_click_element(browser, 'a[href="#description"]')
    ProductProperty = py_extract_xpath(browser, '//*[@id="J_AttrUL"]')
    soup = BeautifulSoup(ProductProperty, 'lxml')
    li = soup.find_all('li')
    prop = ''
    for this_li in li:
        prop = prop + this_li.getText() + '\\'
    prop = prop[0:len(prop) - 1]
    out_ProductProperty = prop
    print out_ProductProperty

    cursor.execute("""
    Insert into py_ProductInfo values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """, (BrandName, ProductType, url,
          out_ProductInfo[2], out_ProductInfo[1],
          out_ProductInfo[0], out_ProductInfo[7],
          out_ProductInfo[1], out_ProductInfo[3],
          out_ProductInfo[4], out_ProductInfo[5],
          out_ProductProperty))
    conn.commit()

    # Open the transaction-record tab and scrape the first page of records.
    Deal_PageCount = 0
    browser = py_click_element(browser, 'a[href="#J_DealRecord"]')
    #browser.browse(True)
    DealRecord = py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
    out_DealRecord = py_getDealReord(DealRecord)
    for temp_DealRecord in out_DealRecord:
        if str(temp_DealRecord[4]) == '0':
            continue
        cursor.execute("""
        Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
        """, (url, temp_DealRecord[0], temp_DealRecord[1],
              temp_DealRecord[2], temp_DealRecord[3],
              temp_DealRecord[4]))
        conn.commit()
    Deal_PageCount = Deal_PageCount + 1
    print "Page ", Deal_PageCount

    # Walk the numbered pagination links of the record list.
    for i in range(6):
        if (i == 0) or (i == 2):
            continue
        xpath = '//*[@id="J_showBuyerList"]/div/div/a[' + str(i) + ']'
        if py_check_element(browser, xpath):
            browser = py_click_xpath(browser, xpath)
            DealRecord = py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
            out_DealRecord = py_getDealReord(DealRecord)
            for temp_DealRecord in out_DealRecord:
                if str(temp_DealRecord[4]) == '0':
                    continue
                cursor.execute("""
                Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
                """, (url, temp_DealRecord[0], temp_DealRecord[1],
                      temp_DealRecord[2], temp_DealRecord[3],
                      temp_DealRecord[4]))
                conn.commit()
            Deal_PageCount = Deal_PageCount + 1
            print "Page ", Deal_PageCount

    # Keep clicking the "next page" link (the sixth anchor) while it exists.
    while py_check_element(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]'):
        browser = py_click_xpath(browser, '//*[@id="J_showBuyerList"]/div/div/a[6]')
        DealRecord = py_extract_xpath(browser, '//*[@id="J_showBuyerList"]/table/tbody')
        out_DealRecord = py_getDealReord(DealRecord)
        for temp_DealRecord in out_DealRecord:
            if str(temp_DealRecord[4]) == '0':
                continue
            cursor.execute("""
            Insert into DealRecord values(%s,%s,%s,%s,%s,%s)
            """, (url, temp_DealRecord[0], temp_DealRecord[1],
                  temp_DealRecord[2], temp_DealRecord[3],
                  temp_DealRecord[4]))
            conn.commit()
        Deal_PageCount = Deal_PageCount + 1
        print "Page ", Deal_PageCount
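The script assumes that the ProductURLs, py_ProductInfo and DealRecord tables already exist in the TmallData database; the original post does not give their definitions. The following is a minimal sketch, with hypothetical column names and types inferred only from the order of the bound parameters in the INSERT statements above, offered as one possible starting point rather than the author's actual schema.

#coding:utf-8
# Hypothetical table definitions -- column names and types are assumptions
# inferred from the INSERT statements above, not the author's real schema.
import pymssql

conn = pymssql.connect("localhost", "sa", "123456", "TmallData")
cursor = conn.cursor()

tables = [
    """CREATE TABLE ProductURLs (
        BrandName NVARCHAR(50), ProductType NVARCHAR(50), URL NVARCHAR(500))""",
    """CREATE TABLE py_ProductInfo (
        BrandName NVARCHAR(50), ProductType NVARCHAR(50), URL NVARCHAR(500),
        Title NVARCHAR(200), Price NVARCHAR(50), PromoPrice NVARCHAR(50),
        MonthlySales NVARCHAR(50), Price2 NVARCHAR(50), Postage NVARCHAR(100),
        Stock NVARCHAR(50), CollectCount NVARCHAR(50), ProductProperty NVARCHAR(MAX))""",
    """CREATE TABLE DealRecord (
        URL NVARCHAR(500), Buyer NVARCHAR(100), Style NVARCHAR(300),
        Quantity NVARCHAR(20), DealDate NVARCHAR(20), DealTime NVARCHAR(20))""",
]
for ddl in tables:
    cursor.execute(ddl)
conn.commit()
conn.close()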
That is all for this article. I hope it helps with your study, and I also hope you will continue to support 服务器之家.
Original article: http://blog.csdn.net/u014606206/article/details/50307197