京东商品详细的请求处理,是先显示html,然后再ajax请求处理显示价格。
1.可以运行js,并解析之后得到的html
2.模拟js请求,得到价格
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
# -*- coding: utf-8 -*- """ 根据京东url地址,获取商品价格 京东请求处理过程,先显示html页面,然后通过ajax get请求获取相应的商品价格 1.商品的具体数据在html中的格式,如下(示例) # product: { # skuid: 1310118868, # name: '\u9999\u5f71\u77ed\u88d9\u4e24\u4ef6\u5957\u88c5\u5973\u0032\u0030\u0031\u0034\u51ac\u88c5\u65b0\u6b3e\u97e9\u7248\u957f\u8896\u0054\u6064\u4e0a\u8863\u8377\u53f6\u8fb9\u534a\u8eab\u88d9\u6f6e\u0020\u85cf\u9752\u0020\u004d', # skuidkey:'7781F505B71CE37A3AFBADA119D3587F', # href: 'http://item.jd.com/1310118868.html', # src: 'jfs/t385/197/414081450/336886/3070537b/541be890N2995990c.jpg', # cat: [1315,1343,1355], # brand: 18247, # nBrand: 18247, # tips: false, # type: 2, # venderId:38824, # shopId:'36786', # TJ:'0', # specialAttrs:["is7ToReturn-1"], # videoPath:'', # HM:'0' # } 2.ajax请求代码如下: # // 获得数字价格 # var getPriceNum = function(skus, $wrap, perfix, callback) { # skus = typeof skus === 'string' ? [skus]: skus; # $wrap = $wrap || $('body'); # perfix = perfix || 'J-p-'; # $.ajax({ # url: 'http://p.3.cn/prices/mgets?skuIds=J_' + skus.join(',J_') + '&type=1', # dataType: 'jsonp', # success: function (r) { # if (!r && !r.length) { # return false; # } # for (var i = 0; i < r.length; i++) { # var sku = r[i].id.replace('J_', ''); # var price = parseFloat(r[i].p, 10); # # if (price > 0) { # $wrap.find('.'+ perfix + sku).html('¥' + r[i].p + ''); # } else { # $wrap.find('.'+ perfix + sku).html('暂无报价'); # } # # if ( typeof callback === 'function' ) { # callback(sku, price, r); # } # } # } # }); # }; """ import urllib import json import re class JdPrice( object ): """ 对获取京东商品价格进行简单封装 """ def __init__( self , url): self .url = url self ._response = urllib.urlopen( self .url) self .html = self ._response.read() def get_product( self ): """ 获取html中,商品的描述(未对数据进行详细处理,粗略的返回str类型) :return: """ product_re = re. compile (r 'compatible: true,(.*?)};' , re.S) product_info = re.findall(product_re, self .html)[ 0 ] return product_info def get_product_skuid( self ): """ 通过获取的商品信息,获取商品的skuid :return: """ product_info = self .get_product() skuid_re = re. compile (r 'skuid: (.*?),' ) skuid = re.findall(skuid_re, product_info)[ 0 ] return skuid def get_product_name( self ): pass def get_product_price( self ): """ 根据商品的skuid信息,请求获得商品price :return: """ price = None skuid = self .get_product_skuid() url = 'http://p.3.cn/prices/mgets?skuIds=J_' + skuid + '&type=1' price_json = json.load(urllib.urlopen(url))[ 0 ] if price_json[ 'p' ]: price = price_json[ 'p' ] return price # 测试代码 if __name__ = = '__main__' : url = 'http://item.jd.com/1310118868.html' url = 'http://item.jd.com/1044773.html' jp = JdPrice(url) print jp.get_product_price() # htm.decode('gb2312', 'ignore').encode('utf-8') # f = open('jjs.html', 'w') # f.write(htm) # f.close() |
再给大家分享一个京东价格的爬虫:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
fromcreepyimportCrawler fromBeautifulSoupimportBeautifulSoup importurllib2 importjson classMyCrawler(Crawler): defprocess_document( self ,doc): ifdoc.status = = 200 : print [ % d] % s % (doc.status,doc.url) try : soup = BeautifulSoup(doc.text.decode(gb18030).encode(utf - 8 )) exceptExceptionase: printe soup = BeautifulSoup(doc.text) printsoup.find( id = "product-intro" ).div.h1.text url_id = urllib2.unquote(doc.url).decode(utf8).split( / )[ - 1 ].split(.)[ 0 ] f = urllib2.urlopen(http: / / p. 3.cn / prices / get?skuid = J_ + url_id,timeout = 5 ) price = json.loads(f.read()) f.close() printprice[ 0 ][p] else : pass crawler = MyCrawler() crawler.set_follow_mode(Crawler.F_SAME_HOST) crawler.set_concurrency_level( 16 ) crawler.add_url_filter(.(jpg|jpeg|gif|png|js|css|swf)$) crawler.crawl(http: / / item.jd.com / 982040.html ) |