代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File   : HtmlParser.py
# @Author : 赵路仓
# @Date   : 2020/3/17
# @Desc   : Scrape one page of JD.com search results for "ps4" and print
#           each listing's name, price, comment link text and URL.
# @Contact: 398333404@qq.com
import json
from lxml import etree
import requests
from bs4 import BeautifulSoup

# NOTE(review): `json` and `BeautifulSoup` are not used below; kept so any
# unseen callers of this module are unaffected — confirm before removing.

# Landing URL of the search (kept for reference; `page()` builds its own URL).
url = "https://search.jd.com/Search?keyword=ps4&enc=utf-8&wq=ps4&pvid=cf0158c8664442799c1146a461478c9c"

# Request headers copied from a real browser session so JD serves the
# normal search markup instead of an anti-bot page.
head = {
    'authority': 'search.jd.com',
    'method': 'GET',
    'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
    'scheme': 'https',
    'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}


def page(page):
    """Fetch one JD search-result page for "ps4" and print every item.

    :param page: page index for the query string; accepts str or int
                 (converted with ``str``, which generalizes the original
                 string-concatenation-only behavior).
    :raises requests.RequestException: on network failure or timeout.
    """
    print("开始")
    # Build the paged search URL; str() lets callers pass ints as well.
    url = ("https://search.jd.com/Search?keyword=ps4&enc=utf-8&qrst=1&rt=1"
           "&stop=1&vt=1&wq=ps4&page=" + str(page) + "&s=181&click=0")
    r = requests.get(url, timeout=3, headers=head)
    # Use the detected encoding so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    # Each product is an <li class="gl-item ..."> node in the result list.
    _element = etree.HTML(r.text)
    datas = _element.xpath('//li[contains(@class,"gl-item")]')
    for data in datas:
        # Relative XPaths against the <li>; each returns a list of strings.
        p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
        p_comment = data.xpath('div/div[5]/strong/a/text()')
        p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()')
        p_href = data.xpath('div/div[@class="p-name p-name-type-2"]/a/@href')
        # Join fragments (em-highlighted names arrive as several pieces).
        name = ' '.join(p_name)
        price = ' '.join(p_price)
        href = ' '.join(p_href)
        print(name, price, p_comment, href)


if __name__ == "__main__":
    page("5")
爬取结果
以上就是python 爬虫爬取某东ps4售卖情况的详细内容,更多关于python 爬虫的资料请关注服务器之家其它相关文章!
原文链接:https://www.cnblogs.com/zlc364624/p/12874090.html