安卓“最美应用”页面爬虫。爬虫本身很简单,涉及的东西倒挺多的:
文件操作
正则表达式
字符串替换等等
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
"""Scrape the "hot" Android app list on zuimeia.com into a text file.

The hot-list page is fetched, every app detail page linked from it is
downloaded, and the title, category tags, description and direct download
URL found by the regular expressions below are appended to OUTPUT_PATH as
one record per app.
"""
import re

# Site root; the relative hrefs scraped from the hot list are appended to it.
url = "http://zuimeia.com"
HOT_LIST_URL = 'http://zuimeia.com/community/app/hot/?platform=2'
OUTPUT_PATH = 'c:/wqa.txt'
RECORD_SEPARATOR = '\n\n\n-----------------------------------------------------------------------------------------------------------------------------\n\n\n'

# Link to an app detail page on the hot-list page.
pattern = re.compile(r'<a class="community-app-cover-wrapper" href="(.*?)" target="_blank">')

_TITLE_RE = re.compile(r'"app-title"><h1>(.*?)</h1>')
# NOTE(review): the "?" before "platform" is unescaped, so it makes the "/"
# optional instead of matching a literal "?"; the lazy ".*?" absorbs the real
# "?" so the pattern still matches. Kept byte-for-byte pending a check
# against live pages.
_CATEGORY_RE = re.compile(r'<a class="app-tag" href="/community/app/category/title/.*?/?platform=2">(.*?)</a>')
# NOTE(review): "." does not match newlines here (no re.S), so a description
# spanning several lines would not be captured - confirm against live pages.
_DESCRIBE_RE = re.compile(r'<div id="article_content">(.*?)<div class="community-image-wrapper">')
_DOWNLOAD_RE = re.compile(r'<a class="download-button direct hidden" href="(.*?)"')

# Tags that srtReplace turns into line breaks.
_BREAKING_TAGS = (
    '<p>', '<br>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<h7>',
    '<strong>', '</p>', '<br/>', '</h1>', '</h2>', '</h3>', '</h4>', '</h5>',
    '</h6>', '</h7>', '</strong>', '<b>', '</b>',
)


def _fetchText(pageUrl):
    """Download *pageUrl* and return its body decoded to ``str``."""
    # Imported here so the pure text helpers stay importable (and testable)
    # on machines where the third-party package is not installed.
    import requests

    response = requests.get(pageUrl)
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        # With no declared charset requests falls back to ISO-8859-1 for
        # text/* responses, which would garble Chinese text; sniff instead.
        response.encoding = response.apparent_encoding
    # ``.text`` rather than ``.content``: the patterns above are str
    # patterns, which Python 3 refuses to apply to bytes.
    return response.text


def requestsUrl(url):
    """Fetch one app detail page and extract its fields.

    Args:
        url: Absolute URL of the app detail page.

    Returns:
        A tuple ``(title, category, strdescribe, downloadUrl)``. ``title``,
        ``category`` and ``downloadUrl`` are the (possibly empty) lists
        produced by ``findall``; ``strdescribe`` is the description passed
        through ``srtReplace``, or ``''`` when the page has no description
        block (previously this raised ``IndexError``).
    """
    html = _fetchText(url)
    title = _TITLE_RE.findall(html)
    category = _CATEGORY_RE.findall(html)
    describe = _DESCRIBE_RE.findall(html)
    strdescribe = srtReplace(describe[0]) if describe else ''
    downloadUrl = _DOWNLOAD_RE.findall(html)
    return title, category, strdescribe, downloadUrl


def srtReplace(string):
    """Replace the known formatting tags in *string* with line breaks.

    Every tag in ``_BREAKING_TAGS`` becomes ``'\\n'``; afterwards each pair
    of consecutive newlines is removed.

    NOTE(review): removing ``'\\n\\n'`` outright (rather than collapsing it
    to a single newline) glues adjacent paragraphs together, e.g.
    ``'<p>a</p><p>b</p>'`` -> ``'\\nab\\n'``. Behavior kept as-is; confirm
    whether that is intended.
    """
    for tag in _BREAKING_TAGS:
        string = string.replace(tag, '\n')
    return string.replace('\n\n', '')


def categornFinal(category):
    """Join the category names, each followed by ``'-->'``.

    Args:
        category: Iterable of category names.

    Returns:
        The concatenation, e.g. ``['a', 'b']`` -> ``'a-->b-->'`` (the
        trailing arrow is part of the existing output format).
    """
    return ''.join(str(eachCategory) + '-->' for eachCategory in category)


def urlReplace(url):
    """Decode HTML-escaped ampersands in a URL taken from an ``href``.

    The scraped attribute value contains ``&amp;`` between query parameters;
    it must be turned back into ``&`` for the link to be usable.
    """
    return url.replace('&amp;', '&')


def _formatRecord(title, category, strdescribe, downloadUrl):
    """Build the text block written to the output file for one app."""
    return (
        'title:' + title + '\n'
        + 'category:' + category + '\n'
        + 'strdescribe:' + strdescribe + '\n'
        + 'downloadUrl:' + downloadUrl
        + RECORD_SEPARATOR
    )


def main():
    """Scrape every app on the hot list and append it to OUTPUT_PATH."""
    urlList = pattern.findall(_fetchText(HOT_LIST_URL))
    # Explicit UTF-8 so Chinese text does not depend on the platform's
    # default encoding.
    with open(OUTPUT_PATH, 'a+', encoding='utf-8') as fd:
        for eachUrl in urlList:
            titles, categories, strdescribe, downloadUrls = requestsUrl(url + eachUrl)
            # A page may lack a title or a direct download button; write an
            # empty field instead of aborting the whole run.
            title = titles[0] if titles else ''
            downloadUrl = urlReplace(downloadUrls[0]) if downloadUrls else ''
            fd.write(_formatRecord(title, categornFinal(categories), strdescribe, downloadUrl))


if __name__ == "__main__":
    main()