python爬虫系列Selenium定向爬取虎扑篮球图片详解_Python

前言：

作为一名从小就看篮球的球迷，会经常逛虎扑篮球及湿乎乎等论坛，在论坛里面会存在很多精美图片，包括NBA球队、CBA明星、花边新闻、球鞋美女等等，如果一张张右键另存为的话真是手都点疼了。作为程序员还是写个程序来进行吧！

所以我通过Python+Selenium+正则表达式+urllib2进行海量图片爬取。

运行效果：

http://photo.hupu.com/nba/tag/马刺

http://photo.hupu.com/nba/tag/陈露

python爬虫系列Selenium定向爬取虎扑篮球图片详解

源代码：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

									# -*- coding: utf-8 -*- 

									""" 

									Crawling pictures by selenium and urllib

									url: 虎扑 马刺 http://photo.hupu.com/nba/tag/%E9%A9%AC%E5%88%BA

									url: 虎扑 陈露 http://photo.hupu.com/nba/tag/%E9%99%88%E9%9C%B2

									Created on 2015-10-24

									@author: Eastmount CSDN 

									"""

									import time   

									import re   

									import os 

									import sys 

									import urllib 

									import shutil 

									import datetime 

									from selenium import webdriver  

									from selenium.webdriver.common.keys import Keys  

									import selenium.webdriver.support.ui as ui  

									from selenium.webdriver.common.action_chains import ActionChains 

									#Open PhantomJS 

									driver = webdriver.PhantomJS(executable_path="G:\phantomjs-1.9.1-windows\phantomjs.exe")

									#driver = webdriver.Firefox() 

									wait = ui.WebDriverWait(driver,10) 

									#Download one Picture By urllib 

									def loadPicture(pic_url, pic_path): 

									 pic_name = os.path.basename(pic_url) #删除路径获取图片名字

									 pic_name = pic_name.replace('*','') #去除'*' 防止错误 invalid mode ('wb') or filename

									 urllib.urlretrieve(pic_url, pic_path + pic_name)

									#爬取具体的图片及下一张

									def getScript(elem_url, path, nums):

									 try:

									  #由于链接 http://photo.hupu.com/nba/p29556-1.html

									  #只需拼接 http://..../p29556-数字.html 省略了自动点击"下一张"操作

									  count = 1

									  t = elem_url.find(r'.html')

									  while (count <= nums):

									   html_url = elem_url[:t] + '-' + str(count) + '.html'

									   #print html_url

									   '''

									   driver_pic.get(html_url)

									   elem = driver_pic.find_element_by_xpath("//div[@class='pic_bg']/div/img")

									   url = elem.get_attribute("src")

									   '''

									   #采用正则表达式获取第3个<div></div> 再获取图片URL进行下载

									   content = urllib.urlopen(html_url).read()

									   start = content.find(r'<div class="flTab">')

									   end = content.find(r'<div class="comMark" style>')

									   content = content[start:end]

									   div_pat = r'<div.*?>(.*?)<\/div>'

									   div_m = re.findall(div_pat, content, re.S|re.M)

									   #print div_m[2]

									   link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", div_m[2])

									   #print link_list

									   url = link_list[0] #仅仅一条url链接

									   loadPicture(url, path)

									   count = count + 1

									 except Exception,e: 

									  print 'Error:',e 

									 finally: 

									  print 'Download ' + str(count) + ' pictures\n'

									#爬取主页图片集的URL和主题 

									def getTitle(url): 

									 try: 

									  #爬取URL和标题 

									  count = 0

									  print 'Function getTitle(key,url)'

									  driver.get(url) 

									  wait.until(lambda driver: driver.find_element_by_xpath("//div[@class='piclist3']"))

									  print 'Title: ' + driver.title + '\n'

									  #缩略图片url(此处无用) 图片数量 标题(文件名) 注意顺序

									  elem_url = driver.find_elements_by_xpath("//a[@class='ku']/img")

									  elem_num = driver.find_elements_by_xpath("//div[@class='piclist3']/table/tbody/tr/td/dl/dd[1]")

									  elem_title = driver.find_elements_by_xpath("//div[@class='piclist3']/table/tbody/tr/td/dl/dt/a")

									  for url in elem_url: 

									   pic_url = url.get_attribute("src")

									   html_url = elem_title[count].get_attribute("href")

									   print elem_title[count].text

									   print html_url 

									   print pic_url

									   print elem_num[count].text

									   #创建图片文件夹

									   path = "E:\\Picture_HP\\" + elem_title[count].text + "\\"

									   m = re.findall(r'(\w*[0-9]+)\w*', elem_num[count].text) #爬虫图片张数

									   nums = int(m[0])

									   count = count + 1

									   if os.path.isfile(path):   #Delete file 

									    os.remove(path) 

									   elif os.path.isdir(path):  #Delete dir 

									    shutil.rmtree(path, True) 

									   os.makedirs(path)    #create the file directory 

									   getScript(html_url, path, nums) #visit pages

									 except Exception,e: 

									  print 'Error:',e 

									 finally: 

									  print 'Find ' + str(count) + ' pages with key\n'

									#Enter Function 

									def main(): 

									 #Create Folder 

									 basePathDirectory = "E:\\Picture_HP"

									 if not os.path.exists(basePathDirectory): 

									  os.makedirs(basePathDirectory) 

									 #Input the Key for search str=>unicode=>utf-8 

									 key = raw_input("Please input a key: ").decode(sys.stdin.encoding) 

									 print 'The key is : ' + key 

									 #Set URL List Sum:1-2 Pages 

									 print 'Ready to start the Download!!!\n\n'

									 starttime = datetime.datetime.now() 

									 num=1

									 while num<=1:

									  #url = 'http://photo.hupu.com/nba/tag/%E9%99%88%E9%9C%B2?p=2&o=1'

									  url = 'http://photo.hupu.com/nba/tag/%E9%A9%AC%E5%88%BA'

									  print '第'+str(num)+'页','url:'+url 

									  #Determine whether the title contains key 

									  getTitle(url) 

									  time.sleep(2) 

									  num = num + 1

									 else: 

									  print 'Download Over!!!'

									 #get the runtime 

									 endtime = datetime.datetime.now() 

									 print 'The Running time : ',(endtime - starttime).seconds 

									main()