最近要爬取一个网站,遇到了极验(Geetest)的滑动验证码,这一周都在想怎么破解它。在网上搜了很多资料,在知乎上看到有人问过这个问题,于是我按照其中的思路大致实现了一下。
1.使用htmlunit(这种方式我没成功,模拟鼠标拖拽后轨迹没生成,可以跳过)
我用的是Java,首先想到的是直接用htmlunit,于是先做了一些初始化:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
private void initWebClient() { if (webClient != null ) { return ; } webClient = new WebClient(BrowserVersion.FIREFOX_24); webClient.getOptions().setProxyConfig( new ProxyConfig( "127.0.0.1" , 8888 )); webClient.getOptions().setActiveXNative( true ); webClient.getOptions().setUseInsecureSSL( true ); // 配置证书 webClient.getOptions().setJavaScriptEnabled( true ); webClient.getOptions().setCssEnabled( true ); webClient.setCssErrorHandler( new SilentCssErrorHandler()); webClient.getOptions().setThrowExceptionOnScriptError( false ); webClient.getOptions().setThrowExceptionOnFailingStatusCode( false ); CookieManager cookieManager = new CookieManager(); List<org.apache.http.cookie.Cookie> httpCookies = client.getCookies(); //其方式获取的cookie for (org.apache.http.cookie.Cookie cookie : httpCookies) { cookieManager.addCookie( new com.gargoylesoftware.htmlunit.util.Cookie(cookie)); } webClient.setCookieManager(cookieManager); } |
初始化好代理、cookie等之后,就能正常调用了:
1
2
|
HtmlPage page = webClient.getPage( "http://www.qixin.com/login" );//企信宝 gePageInfor(page); |
下面是我获取图片、还原图片并模拟拖拽的代码(这里我觉得有些问题,可能是我模拟的拖拽不对,导致触发的js没有生成正确的轨迹,还请大家帮忙看看哪里错了):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
private void gePageInfor(HtmlPage page) { String[] img_slice={ "div" , "class" , "gt_cut_fullbg_slice" }; String[] img_bg_slice={ "div" , "class" , "gt_cut_bg_slice" }; HtmlDivision div = (HtmlDivision) page.getElementById( "captcha" ); int deCAPTCHA = 0 ; try { byte [] img_slice_binary = client.get(getImgUrl(img_slice, div, true )).getBinary(); //获取图片byte byte [] img_bg_slice_binary = client.get(getImgUrl(img_bg_slice, div, false )).getBinary(); //获取还原后的图片 BufferedImage geetestImg = ImgTest.getGeetestImg(img_slice_binary, ImgTest.imgArray); BufferedImage geetestImg2 = ImgTest.getGeetestImg(img_bg_slice_binary, ImgTest.imgArray); //获得图片移动位置(目前还有问题,需改用第三方图片识别) deCAPTCHA =ImgTest.deCAPTCHA(geetestImg,geetestImg2); System.out.println(deCAPTCHA); } catch (IOException | FetchException e) { e.printStackTrace(); } HtmlDivision div_slider_knob = get_div_slider_knob(page, "gt_slider_knob gt_show" ); //获取要移动div HtmlPage mouseOver = (HtmlPage) div_slider_knob.mouseOver(); HtmlPage mouseDownPage = (HtmlPage)div_slider_knob.mouseDown(); div_slider_knob = get_div_slider_knob(mouseDownPage, "gt_slider_knob gt_show moving" ); mouseMoveX(deCAPTCHA, div_slider_knob, mouseDownPage); HtmlPage newPage =(HtmlPage)div_slider_knob.mouseOver(); // newPage =(HtmlPage)div_slider_knob.mouseDown(); System.out.println(newPage.asXml()); div = (HtmlDivision)newPage.getElementById( "captcha" ); HtmlElement htmlElement = div.getElementsByAttribute( "div" , "class" , "gt_slice gt_show moving" ).get( 0 ); System.out.println(htmlElement); newPage =(HtmlPage)div_slider_knob.mouseUp(); //触发js,轨迹没有生成 System.out.println( "---------------" ); System.out.println(newPage.asXml()); if (newPage.getElementById( "captcha" )!= null ) { //错误重试 //gePageInfor(newPage); } } private void mouseMoveX( int deCAPTCHA, HtmlDivision div_slider_knob, HtmlPage mouseDown) { MouseEvent mouseEvent = new MouseEvent(div_slider_knob, MouseEvent.TYPE_MOUSE_MOVE, false , false , false , MouseEvent.BUTTON_LEFT); mouseEvent.setClientX( 
mouseEvent.getClientX()+((deCAPTCHA!= 0 )?deCAPTCHA: 99 )); //移动x坐标 ScriptResult scriptResult = mouseDown.getDocumentElement().fireEvent(mouseEvent); } private HtmlDivision get_div_slider_knob(HtmlPage page,String classString) { return (HtmlDivision)(((HtmlDivision) page.getElementById( "captcha" )).getElementsByAttribute( "div" , "class" , classString).get( 0 )); } private String getImgUrl(String[] img_slice, HtmlDivision div, boolean isNeedCheckPostion) { String url = "" ; int [] postion = new int [ 2 ]; boolean empty = div.getElementsByAttribute(img_slice[ 0 ],img_slice[ 1 ],img_slice[ 2 ]).isEmpty(); if (div.hasChildNodes() && !empty) { List<HtmlElement> elementsByAttribute = div.getElementsByAttribute(img_slice[ 0 ],img_slice[ 1 ],img_slice[ 2 ]); for ( int i = 0 ;i<elementsByAttribute.size();i++){ HtmlDivision div_img = (HtmlDivision)elementsByAttribute.get(i); String style = div_img.getAttribute( "style" ); String[] imge_url_position = style.split( ";" ); if (StringUtils.isBlank(url)){ //确认url url = StringUtils.replacePattern(imge_url_position[ 0 ], ".*\\(" , "" ).replace( ")" , "" ); } if (isNeedCheckPostion) { //确认图片切割postion,两张图切割方式一样 background-position: -157px -58px // String[] positionS = StringUtils.split(StringUtils.remove(imge_url_position[1], "px").replace("-", "").replaceAll(".*:", ""), null); String[] positionS = StringUtils.split(StringUtils.removePattern(imge_url_position[ 1 ], "[^\\d+ \\s]" ), null ); postion[ 0 ] = Integer.parseInt(positionS[ 0 ]); postion[ 1 ] = Integer.parseInt(positionS[ 1 ]); int [] is = ImgTest.imgArray[i]; if (is[ 0 ]!=postion[ 0 ]||is[ 1 ]!=postion[ 1 ]) { logger.debug( "更新分割postion" ); ImgTest.imgArray[i] = postion; } System.out.println(ImgTest.imgArray); isNeedCheckPostion= false ; } } } return url; } |
对比图片获取位移的方法(deCAPTCHA)是错的,我就不放代码了。下面是其中还原图片用的方法,其实审查元素后你就明白怎么还原这个图片了,这里每次截取的是宽10px、高58px的小块:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
public static BufferedImage getGeetestImg( byte [] binary, int [][] imgArray) throws IOException { BufferedImage img = ImageIO.read( new ByteArrayInputStream(binary)); List<BufferedImage> list = new ArrayList<>(); for ( int i= 0 ;i< imgArray.length;i++) { BufferedImage subimage = img.getSubimage(imgArray[i][ 0 ], imgArray[i][ 1 ], 10 , 58 ); list.add(subimage); // ImageIO.write(subimage, "jpg", new File("d:\\image\\imgs"+i+".jpg")); } BufferedImage mergeImageUp = null ; BufferedImage mergeImageDown = null ; int mid = list.size()>>> 1 ; for ( int i = 0 ; i <mid- 1 ; i++) { mergeImageUp = mergeImage(mergeImageUp== null ?list.get(i):mergeImageUp, list.get(i+ 1 ), true ); } for ( int i = mid;i<list.size()- 1 ;i++){ mergeImageDown = mergeImage(mergeImageDown== null ?list.get(i):mergeImageDown,list.get(i+ 1 ), true ); } img = mergeImage(mergeImageUp, mergeImageDown, false ); return img; } public static BufferedImage mergeImage(BufferedImage img1, BufferedImage img2, boolean isHorizontal) throws IOException { int w1 = img1.getWidth(); int h1 = img1.getHeight(); int w2 = img2.getWidth(); int h2 = img2.getHeight(); // 从图片中读取RGB int [] ImageArrayOne = new int [w1 * h1]; ImageArrayOne = img1.getRGB( 0 , 0 , w1, h1, ImageArrayOne, 0 , w1); // 逐行扫描图像中各个像素的RGB到数组中 int [] ImageArrayTwo = new int [w2 * h2]; ImageArrayTwo = img2.getRGB( 0 , 0 , w2, h2, ImageArrayTwo, 0 , w2); // 生成新图片 BufferedImage DestImage = null ; if (isHorizontal) { // 水平方向合并 DestImage = new BufferedImage(w1+w2, h1, BufferedImage.TYPE_INT_RGB); DestImage.setRGB( 0 , 0 , w1, h1, ImageArrayOne, 0 , w1); // 设置上半部分或左半部分的RGB DestImage.setRGB(w1, 0 , w2, h2, ImageArrayTwo, 0 , w2); } else { // 垂直方向合并 DestImage = new BufferedImage(w1, h1 + h2, BufferedImage.TYPE_INT_RGB); DestImage.setRGB( 0 , 0 , w1, h1, ImageArrayOne, 0 , w1); // 设置上半部分或左半部分的RGB DestImage.setRGB( 0 , h1, w2, h2, ImageArrayTwo, 0 , w2); // 设置下半部分的RGB } return DestImage; } |
2.使用selenium
后来我想可能是我模拟鼠标动作的地方有问题,于是又找到了selenium(2.42.2),它也能操作htmlunit,关键是它的鼠标动作好像封装得比较完整。
但是尝试以后发现,HtmlUnitMouse并没有实现这个动作:
1
2
3
|
// Always throws UnsupportedOperationException: moving to arbitrary X,Y coordinates is not
// supported. (The surrounding text attributes this method to Selenium's HtmlUnitMouse.)
public void mouseMove(Coordinates where, long xOffset, long yOffset) { throw new UnsupportedOperationException( "Moving to arbitrary X,Y coordinates not supported." ); } |
好吧,于是调用chrome吧
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
// Drive a real Chrome instance through chromedriver, routed via the local proxy.
System.setProperty("webdriver.chrome.driver", "C:\\chromedriver.exe");
Proxy proxy = new Proxy();
// Proxy server address.
proxy.setHttpProxy("127.0.0.1:8888");
DesiredCapabilities capabilities = DesiredCapabilities.chrome();
capabilities.setCapability(CapabilityType.PROXY, proxy);
WebDriver driver = new ChromeDriver(capabilities);

driver.get("http://www.qixin.com/login");
driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
checkPage(driver, "return $('.gt_cut_fullbg_slice');");
System.out.println("1 Page title is: " + driver.getTitle());
String pageSource = driver.getPageSource();
System.out.println(pageSource);

org.openqa.selenium.JavascriptExecutor executor = (org.openqa.selenium.JavascriptExecutor) driver;
boolean equals = "complete".equals(executor.executeScript("return document.readyState"));
int moveX = 99; // horizontal drag distance in pixels (fixed value for now)
if (equals) {
    WebElement element = driver.findElement(By.className("gt_slider_knob"));
    Actions action = new Actions(driver);
    // dragAndDropBy takes offsets relative to the element, not absolute page coordinates:
    // press on the knob, move moveX pixels to the right on the same row, release.
    action.dragAndDropBy(element, moveX, 0).perform();
    pageSource = driver.getPageSource();
}

// Copy the browser cookies over for the other HTTP clients.
Set<org.openqa.selenium.Cookie> cookies = driver.manage().getCookies();
Set<Cookie> cookies2 = new HashSet<>();
for (org.openqa.selenium.Cookie cookie : cookies) {
    // Preserve each cookie's own secure flag instead of forcing it to true.
    cookies2.add(new Cookie(cookie.getDomain(), cookie.getName(), cookie.getValue(),
            cookie.getPath(), cookie.getExpiry(), cookie.isSecure()));
}
for (Cookie cookie : cookies2) {
    // NOTE(review): the converted cookie is not stored anywhere yet — hand it to the
    // HttpClient cookie store that should reuse this session.
    org.apache.http.cookie.Cookie httpClient = cookie.toHttpClient();
}
System.out.println(pageSource);
这样提交的表单确实是有轨迹的。这里的移动距离我先写了个固定值,实际的位置可以通过上面的图片还原,再配合一些开源的图片识别工具识别出来。以上应该就能解决这个滑动验证码了。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:http://www.cnblogs.com/wangly/p/5630069.html