Grabbing Images from a Web Page with Python and Saving Them Locally

2020-08-04 10:53 · Ruthless · Python

This article shows how to grab the images on a web page with Python and save them to the local disk. If you are interested in scraping web images with Python, read on.

A previous article shared a PHP implementation that batch-grabs remote images and saves them locally; readers interested in the PHP approach can refer to that post for details.

#-*-coding:utf-8-*-
# Note: this script targets Python 2 (urllib2 and cookielib were removed in Python 3).
import os
import uuid
import urllib2
import cookielib

'''Return the extension of a file name, including the dot.'''
def get_file_extension(file):
  return os.path.splitext(file)[1]

'''Create a directory (and any missing parents), then return its path.'''
def mkdir(path):
  # strip surrounding whitespace
  path=path.strip()
  # strip a trailing \ character
  path=path.rstrip("\\")
  if not os.path.exists(path):
    os.makedirs(path)
  return path

'''Generate a unique string with a fixed length of 36 characters (a UUID).'''
def unique_str():
  return str(uuid.uuid1())

'''
Fetch the contents of a URL into memory.
@url the file to fetch, path+filename
'''
def get_file(url):
  try:
    cj=cookielib.LWPCookieJar()
    opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    req=urllib2.Request(url)
    operate=opener.open(req)
    data=operate.read()
    return data
  except BaseException, e:
    print e
    return None

'''
Save a file to the local disk.
@path local directory
@file_name file name
@data file contents
'''
def save_file(path, file_name, data):
  if data == None:
    return
  mkdir(path)
  if(not path.endswith("/")):
    path=path+"/"
  file=open(path+file_name, "wb")
  file.write(data)
  file.flush()
  file.close()

# get the file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url="http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

Grabbing the images at a specified URL with Python and saving them locally

# *** encoding: utf-8 ***
__author__='jiangyt'
"""
fetch images from specific url
v1.0 (Python 2)
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
  host, path = urlparse.urlsplit(url)[1:3]
  if ':' in host:
    # port specified, try to use it
    host, port = host.split(':', 1)
    try:
      port = int(port)
    except ValueError:
      print 'invalid port number %r' % (port,)
      return False
  else:
    # no port specified, use the default port
    port = None
  try:
    connection = httplib.HTTPConnection(host, port=port)
    connection.request("HEAD", path)
    resp = connection.getresponse()
    if resp.status == 200: # normal 'found' status
      found = True
    elif resp.status == 302: # recurse on temporary redirect
      found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
    else: # everything else -> not found
      print "Status %d %s : %s" % (resp.status, resp.reason, url)
      found = False
  except Exception, e:
    print e.__class__, e, url
    found = False
  return found

"""get the html source, return a list of lines"""
def gGetHtmlLines(url):
  if url==None : return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.readlines()
    page.close()
    return html
  except Exception, e:
    print "gGetHtmlLines() error! Exception ==>> %s" % e
    return

"""get the html source, return a string"""
def gGetHtml(url):
  if url==None : return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.read()
    page.close()
    return html
  except Exception, e:
    print "gGetHtml() error! Exception ==>> %s" % e
    return

"""extract the file name from a url"""
def gGetFileName(url):
  if url==None: return None
  if url=="" : return ""
  arr=url.split("/")
  return arr[len(arr)-1]

"""generate a random file name"""
def gRandFilename(type):
  fname = ''
  for i in range(16):
    fname = fname + chr(random.randint(65,90))
    fname = fname + chr(random.randint(48,57))
  return fname + '.' + type

"""resolve a link found on the page at url to an absolute address"""
def gGetAbslLink(url,link):
  if url==None or link == None : return
  if url=='' or link=='' : return url
  addr = ''
  if link[0] == '/' :
    addr = gGetHttpAddr(url) + link
  elif len(link)>3 and link[0:4] == 'http':
    addr = link
  elif len(link)>2 and link[0:2] == '..':
    addr = gGetHttpAddrFatherAssign(url,link)
  else:
    addr = gGetHttpAddrFather(url) + link
  return addr

"""match a regular expression against each input line, return the unique captures as a list"""
def gGetRegList(linesList,regx):
  if linesList==None : return
  rtnList=[]
  for line in linesList:
    matchs = re.search(regx, line, re.IGNORECASE)
    if matchs!=None:
      allGroups = matchs.groups()
      for foundStr in allGroups:
        if foundStr not in rtnList:
          rtnList.append(foundStr)
  return rtnList

"""download a file from url with an explicit file name"""
def gDownloadWithFilename(url,savePath,file):
  # argument checks omitted for now
  try:
    urlopen=urllib.URLopener()
    fp = urlopen.open(url)
    data = fp.read()
    fp.close()
    file=open(savePath + file,'w+b')
    file.write(data)
    file.close()
  except IOError, error:
    print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
  except Exception, e:
    print "Exception==>> %s" % e

"""download a file from url, taking the file name from the url"""
def gDownload(url,savePath):
  # argument checks omitted for now
  fileName = gGetFileName(url)
  #fileName =gRandFilename('jpg')
  gDownloadWithFilename(url,savePath,fileName)

"""download every jpg on the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl,savePath):
  lines= gGetHtmlLines(downloadUrl) # get the page source
  regx = r"""src\s*="?(\S+)\.jpg"""
  lists =gGetRegList(lines,regx) # get the links matching the regular expression
  if lists==None: return
  for jpg in lists:
    jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
    gDownload(jpg,savePath)
    print gGetFileName(jpg)

"""extract the site root address (scheme + host) from a url"""
def gGetHttpAddr(url):
  if url== '' : return ''
  arr=url.split("/")
  return arr[0]+"//"+arr[2]

"""extract the parent directory of a url"""
def gGetHttpAddrFather(url):
  if url=='' : return ''
  arr=url.split("/")
  addr = arr[0]+'//'+arr[2]+ '/'
  if len(arr)-1>3 :
    for i in range(3,len(arr)-1):
      addr = addr + arr[i] + '/'
  return addr

"""resolve a '..'-style relative link against url to an absolute address"""
def gGetHttpAddrFatherAssign(url,link):
  if url=='' : return ''
  if link=='': return ''
  linkArray=link.split("/")
  urlArray = url.split("/")
  partLink =''
  partUrl = ''
  for i in range(len(linkArray)):
    if linkArray[i]=='..':
      numOfFather = i + 1 # number of directory levels to go up
    else:
      partLink = partLink + '/' + linkArray[i]
  for i in range(len(urlArray)-1-numOfFather):
    partUrl = partUrl + urlArray[i]
    if i < len(urlArray)-1-numOfFather -1 :
      partUrl = partUrl + '/'
  return partUrl + partLink

"""collect the htm/html links on the page at url, return a list"""
def gGetHtmlLink(url):
  # argument checks omitted for now
  rtnList=[]
  lines=gGetHtmlLines(url)
  regx = r"""href="?(\S+)\.htm"""
  for link in gGetRegList(lines,regx):
    link = gGetAbslLink(url,link) + '.htm'
    if link not in rtnList:
      rtnList.append(link)
      print link
  return rtnList

"""download the jpgs on the page at url, and on the pages it links to"""
def gDownloadAllJpg(url,savePath):
  # argument checks omitted for now
  gDownloadHtmlJpg(url,savePath)
  # grab the jpgs on the linked pages
  links=gGetHtmlLink(url)
  for link in links:
    gDownloadHtmlJpg(link,savePath)

"""test"""
def main():
  u='http://site.douban.com/196738/room/2462453/' # page to grab images from
  save='/root/python/tmp/' # directory to save the images into
  print 'download pic from [' + u +']'
  print 'save to [' +save+'] ...'
  gDownloadHtmlJpg(u,save)
  print "download finished"

if __name__ == "__main__":
  main()
else:
  print "imported as a module."

That is all of the code for grabbing images from a web page with Python and saving them locally. We hope you find it useful.
