In a previous post we shared a PHP implementation for batch-fetching remote web images and saving them to the local disk; readers who are interested can refer to that article for the details. Below is the Python version of the same idea.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the directory if needed and return it'''
def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string of fixed length 36'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the contents of a remote file into memory
@url  the file to fetch, path + filename
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save a file to the local disk
@path      local directory
@file_name file name
@data      file contents
'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create the directory and return it
#print mkdir("d:/ljq")
# generate a unique string of fixed length 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
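The snippet above is written for Python 2 (urllib2, cookielib, print statements), so it will not run unchanged on Python 3. As a rough sketch only, not part of the original article, the same fetch-and-save flow on Python 3 could look like the following; the function names mirror the ones above and only the standard urllib.request module is assumed:

# Hypothetical Python 3 sketch of the same "fetch a URL, save the bytes" flow.
import os
import urllib.request

def get_file(url):
    """Fetch the raw bytes at url, or return None on failure."""
    try:
        with urllib.request.urlopen(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Write data to path/file_name, creating the directory if needed."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

# example call, mirroring the demo above
save_file("d:/ljq/", "123.jpg",
          get_file("http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"))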
Fetching the images in a specified URL with Python and saving them locally
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html src, return lines[]"""
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>>", e
        return

"""get html src, return string"""
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>>", e
        return

"""get the file name from a url"""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""resolve a link found on a page into an absolute address"""
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match a regular expression against the input lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download a file from a url; the file name is passed in"""
def gDownloadWithFilename(url, savePath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>>", e

"""download a file from a url; the file name is taken from the url"""
def gDownload(url, savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg images referenced by the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)    # get the links that match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the site root from a url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory of a url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""resolve a '..'-style link against a url into an absolute address"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of levels to go up
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""collect the htm/html links on a page, return a list"""
def gGetHtmlLink(url):
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""grab the jpgs on a page and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # grab the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'  # page to grab images from
    save = '/root/python/tmp/'                         # directory to save the images into
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
The code above is everything involved in grabbing the images in a web page with Python and saving them to the local disk; we hope you find it useful.