1. Fetching the data
import requests

def drg(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head)
        r.raise_for_status()  # Raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return "An exception occurred"

url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))
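drg() returns the raw HTML of the lookup page as text, so extracting specific fields still needs a parser. Below is a minimal sketch, assuming lxml is installed, that simply pulls out the page title; the XPath for the actual phone-number details is not given here and would have to be adapted to the current structure of the ip138 page.

from lxml import etree

html = drg(url)                      # reuse the url defined above
doc = etree.HTML(html)               # build an element tree from the HTML text
title = doc.xpath("//title/text()")  # illustrative only: grab the page title
print(title[0] if title else "no title found")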
2. Parsing the data
import requests

def login():
    try:
        # URL used for the login request (credentials passed as query parameters)
        urllogin = "http://www.cqooc.com/user/login?username=12608199000635&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F"
        s = requests.session()
        r = s.post(urllogin, data=Form, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return s
    except Exception as error:
        print(error)

def get_html(s, url):
    try:
        r = s.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

if __name__ == "__main__":
    # User-Agent sent with the logged-in requests
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    }
    # Replace these values with your own account's form data
    Form = {
        "username": "12608199000635",
        "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
        "nonce": "6BA36BBB1F623279",
        "cnonce": "8257070573EFE28F"
    }
    lin = login()
    # URL of the personal-center page
    url = "http://www.cqooc.com/my/learn"
    html = get_html(lin, url)
    print(html)
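The page returned by get_html() can then be parsed like any other HTML document. The sketch below is only illustrative: the //div[@class='course-name'] selector is an assumed placeholder, since the real element classes depend on the layout of the cqooc.com personal-center page.

from lxml import etree

doc = etree.HTML(html)  # html comes from get_html(lin, url) above
# Hypothetical selector: replace 'course-name' with the class actually used on the page
for name in doc.xpath("//div[@class='course-name']/text()"):
    print(name.strip())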
3. Saving the data to CSV and to a database
Saving to CSV
import requests
from lxml import etree
import csv

# Fetch the page
def get_html(url, time=30):
    try:
        r = requests.get(url, timeout=time)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

def parser(html):  # Parsing function
    doc = etree.HTML(html)  # Convert the HTML text into an element tree
    out_list = []  # List holding the parsed rows
    # Two-step lookup: first each list item, then the fields inside it
    for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
        row_data = [
            row.xpath("h4/a/text()")[0],                       # Title
            row.xpath("p[@class='author']/a/text()")[0],       # Author
            row.xpath("p[2]/text()")[0].strip(),               # Introduction
            row.xpath("p[@class='update']/span/text()")[0]     # Update date
        ]
        out_list.append(row_data)  # Append each parsed row to the output list
    return out_list

def save_csv(item, path):  # Write the list data to a file; utf-8 avoids garbled characters
    with open(path, "a+", newline='', encoding="utf-8") as f:  # Create a utf-8 encoded file
        csv_write = csv.writer(f)   # Create the writer object
        csv_write.writerows(item)   # Write all rows at once

if __name__ == "__main__":
    for i in range(1, 6):
        url = "https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
        html = get_html(url)                 # Fetch the page
        out_list = parser(html)              # Parse the page into list data
        save_csv(out_list, "d:\\book.csv")   # Save the data
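Note that save_csv() only appends data rows, so the resulting file has no column names. If a header row is wanted, one option (a small sketch, not part of the original script) is to write it once before the page loop starts:

import csv

header = ["Title", "Author", "Introduction", "Update date"]  # matches the order of row_data above
with open("d:\\book.csv", "w", newline='', encoding="utf-8") as f:
    csv.writer(f).writerow(header)  # written once; the loop then appends rows in "a+" mode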
Saving to a database
import pymysql
import requests
from lxml import etree

def get_html(url, time=3000):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"}
        r = requests.get(url, timeout=time, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as err:
        print(err)

result = []

def parse_html(html):
    html = etree.HTML(html)
    for row in html.xpath('//*[@id="content"]/div/div[1]/ul/li'):
        Naame = row.xpath("div[2]/h2/a/text()")[0].strip()          # Book title
        score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()  # Rating
        fields = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")  # Info line split into fields
        price = fields[0]
        content = fields[1]
        a = fields[2]
        b = fields[-1]
        detail = [Naame, score, price, content, a, b]
        result.append(detail)

def join_all(sql_insert, vals, **dbinfo):
    try:
        connet = pymysql.connect(**dbinfo)
        cursor = connet.cursor()
        cursor.executemany(sql_insert, vals)  # Insert all rows in one call
        connet.commit()
        print('Rows inserted successfully!')
    except Exception as err:
        print(err)
        connet.rollback()
    cursor.close()

if __name__ == "__main__":
    parms = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "passwd": "123456",
        "db": "db",
        "charset": "utf8"
    }
    for page in range(1, 16):
        url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(str(page))
        html = get_html(url)
        parse_html(html)
    sql_insert = "INSERT INTO db(Naame,score,price,content,a,b) VALUES(%s,%s,%s,%s,%s,%s)"
    join_all(sql_insert, result, **parms)
    print(result)
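join_all() expects a table named db (inside the database db) whose columns match the INSERT statement. The original post does not show the schema, so the definition below is only an assumption with generic VARCHAR columns; adjust the types to the data you actually store.

import pymysql

# Assumed schema: column names follow the INSERT statement above, types are guesses
create_sql = """
CREATE TABLE IF NOT EXISTS db (
    Naame   VARCHAR(255),
    score   VARCHAR(32),
    price   VARCHAR(64),
    content VARCHAR(64),
    a       VARCHAR(64),
    b       VARCHAR(64)
)"""

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="123456", db="db", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute(create_sql)  # create the table once before running the crawler
conn.commit()
conn.close()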
Summary
That's all for this article. I hope it has been helpful, and I hope you will keep an eye on 服务器之家 for more content!
Original article: https://blog.csdn.net/qq_50951790/article/details/120643441