本文介绍了 Selenium 设置 proxy、headers 的方法,把 PhantomJS、Chrome、Firefox 几个浏览器的设置方法都总结了一下,分享给大家,也给自己留个笔记。
phantomjs
设置代理 IP
方法1:
1
2
3
4
5
6
7
8
|
from selenium import webdriver

# Command-line switches handed to the PhantomJS executable.
# `ip_html` must already hold the proxy address as "IP:port",
# e.g. "192.168.0.28:808".
service_args = [
    '--proxy=%s' % ip_html,       # proxy address, IP:port
    '--proxy-type=http',          # proxy type: http / https
    '--load-images=no',           # do not load images (optional)
    '--disk-cache=yes',           # enable the disk cache (optional)
    '--ignore-ssl-errors=true',   # ignore HTTPS errors (optional)
]
driver = webdriver.PhantomJS(service_args=service_args)
方法2:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

# PATH_PHANTOMJS is the path to the phantomjs executable; it has to be
# defined before this snippet runs.
browser = webdriver.PhantomJS(PATH_PHANTOMJS)

# Describe the proxy, then open a new session that carries it. The effect is
# roughly that of a browser clearing its state and revisiting the URL through
# the proxy.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = '1.9.171.51:800'

# Add the proxy to a *copy* of the PhantomJS capabilities so the shared
# DesiredCapabilities.PHANTOMJS dict is not modified for later drivers.
capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(capabilities)
browser.start_session(capabilities)

browser.get('http://1212.ip138.com/ic.asp')
print('1: ', browser.session_id)
print('2: ', browser.page_source)
print('3: ', browser.get_cookies())
取消代理,还原为直连(如需使用系统代理设置,请改用 ProxyType.SYSTEM)
1
2
3
4
5
6
|
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

# Drop the manual proxy again. `browser` is the PhantomJS driver that is
# already running.
# NOTE(review): ProxyType.DIRECT should mean "connect directly, no proxy";
# if the OS-level proxy settings are wanted, ProxyType.SYSTEM is presumably
# the right value -- confirm against the Selenium docs.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.DIRECT

# Work on a copy so the shared DesiredCapabilities.PHANTOMJS dict stays clean.
capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(capabilities)
browser.start_session(capabilities)
browser.get('http://1212.ip138.com/ic.asp')
设置请求头
完整示例(包含方法一和方法二)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
import json
import random

import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import ProxyType


def proxies(api_url="http://120.26.166.214:9840/JProxy/update/proxy/scoreproxy",
            port="8907", timeout=10):
    """Fetch one proxy from the proxy service and return it as "ip:port".

    Args:
        api_url: Endpoint that answers with a JSON object containing an
            ``ip`` field.
        port: Port appended to the returned IP.
        timeout: Seconds to wait for the service before giving up, so a dead
            endpoint cannot hang the script.

    Returns:
        The proxy address as a string of the form "ip:port".

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status.
    """
    r = requests.get(api_url, timeout=timeout)
    r.raise_for_status()
    rr = json.loads(r.text)
    hh = rr['ip'] + ":" + port
    print(hh)
    return hh


ips = proxies()

# ---------------------------------------------------------------------------
# PhantomJS request header + proxy, method 1:
# proxy via command-line switches, User-Agent via capabilities.
# ---------------------------------------------------------------------------
service_args = [
    '--proxy=%s' % ips,           # proxy address, IP:port (e.g. 192.168.0.28:808)
    '--ssl-protocol=any',         # accept any SSL protocol
    '--load-images=no',           # do not load images (optional)
    '--disk-cache=yes',           # enable the disk cache (optional)
    '--ignore-ssl-errors=true',   # ignore HTTPS errors (optional)
]

# Request header (User-Agent).
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent

driver = webdriver.PhantomJS(
    executable_path=r"C:\soft\phantomjs-2.1.1-windows\bin\phantomjs.exe",
    desired_capabilities=dcap,
    service_args=service_args,
)
try:
    driver.get(url='http://www.baidu.com')
    page = driver.page_source
    print(page)
finally:
    # Always shut the PhantomJS process down before starting the next driver.
    driver.quit()

# ---------------------------------------------------------------------------
# PhantomJS request header + proxy, method 2:
# everything via DesiredCapabilities.
# ---------------------------------------------------------------------------
# Pools to draw from; replace with your own lists. random.choice() needs a
# list here -- given a plain string it would return a single character.
USER_AGENT_POOL = [user_agent]
PROXY_POOL = [ips]

desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
# Pick a random User-Agent from the pool to disguise the browser.
desired_capabilities["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENT_POOL)
# Skipping images makes page crawling much faster.
desired_capabilities["phantomjs.page.settings.loadImages"] = False

# Add the proxy settings to the capabilities.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = random.choice(PROXY_POOL)
proxy.add_to_capabilities(desired_capabilities)

phantomjs_driver = r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe'
# Start PhantomJS with the configured capabilities. The constructor opens the
# session itself, so no extra start_session() call is needed (a second call
# would open another session and leave the first one behind).
driver = webdriver.PhantomJS(
    executable_path=phantomjs_driver,
    desired_capabilities=desired_capabilities,
)
try:
    # Timeouts are configured BEFORE the first page load so that they apply
    # to it.
    # Implicit wait of 5 seconds; adjust as needed.
    driver.implicitly_wait(5)
    # Page-load timeout of 20 seconds, similar to the timeout option of
    # requests.get(); driver.get() has no timeout argument of its own.
    # Without it driver.get(url) can block forever without raising.
    driver.set_page_load_timeout(20)
    # Script timeout of 20 seconds.
    driver.set_script_timeout(20)

    driver.get(url='http://www.baidu.com')
    page = driver.page_source
    print(page)

    # Scroll to the bottom of the page.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
finally:
    driver.quit()
firefox
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
import time

from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType

myProxy = '202.202.90.20:8080'
# Optional User-Agent override; leave empty to keep Firefox's default.
user_agent = ''

proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': myProxy,
    'ftpProxy': myProxy,
    'sslProxy': myProxy,
    'noProxy': '',
})

profile = webdriver.FirefoxProfile()
if proxy:
    # Write the proxy settings into the profile preferences.
    profile.set_proxy(proxy)
if user_agent:
    profile.set_preference("general.useragent.override", user_agent)

# The keyword for the profile is `firefox_profile`.
driver = webdriver.Firefox(firefox_profile=profile, proxy=proxy)
try:
    driver.get('https://www.baidu.com')
    time.sleep(3)
finally:
    driver.quit()

# ---------------------------------------------------------------------------
# Firefox headless mode
# ---------------------------------------------------------------------------
# Create the options for a new driver instance.
options = webdriver.FirefoxOptions()
# Run Firefox without a visible window.
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.add_argument('window-size=1200x600')
executable_path = './source/geckodriver/geckodriver.exe'
# NOTE: despite its name, `driver_path` is the driver object itself; call
# driver_path.quit() when finished with it.
driver_path = webdriver.Firefox(firefox_options=options, executable_path=executable_path)
chrome
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver

# Optional overrides. `proxy` is a plain "host:port" string (not a Proxy
# object); leave either value empty to skip it.
proxy = ''
user_agent = ''

# Browser settings.
options = webdriver.ChromeOptions()
# Chrome headless mode.
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.add_argument('window-size=1200x600')
# Use Chinese as the browser language.
options.add_argument('lang=zh_CN.UTF-8')
# Replace the User-Agent header.
options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
# Proxy and custom User-Agent, when given.
if proxy:
    options.add_argument('proxy-server=' + proxy)
if user_agent:
    options.add_argument('user-agent=' + user_agent)

browser = webdriver.Chrome(chrome_options=options)
try:
    url = "https://httpbin.org/get?show_env=1"
    browser.get(url)
finally:
    # Close Chrome even when the page load fails.
    browser.quit()
selenium设置chrome-cookie
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver

browser = webdriver.Chrome()
try:
    url = "https://www.baidu.com/"
    browser.get(url)

    # JavaScript that opens a new window.
    newwindow = 'window.open("https://www.baidu.com");'

    # Remove the cookies set by the first visit ...
    browser.delete_all_cookies()
    # ... and install our own cookie instead.
    browser.add_cookie({'name': 'ABC', 'value': 'DEF'})

    # Open the new window via JavaScript, now with the custom cookie in place.
    browser.execute_script(newwindow)
    input("查看效果")
finally:
    # Close Chrome even when one of the steps above fails.
    browser.quit()
selenium设置chrome-图片不加载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
from selenium import webdriver

options = webdriver.ChromeOptions()
# Content setting 2 for images, so that Chrome does not load them.
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
    },
}
options.add_experimental_option('prefs', prefs)

browser = webdriver.Chrome(chrome_options=options)
# browser = webdriver.Chrome()
try:
    url = "http://image.baidu.com/"
    browser.get(url)
    input("是否有图")
finally:
    # Close Chrome even when the page load fails.
    browser.quit()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/xc_zhou/article/details/80823855