这里用到一个HTML解析辅助类:HtmlAgilityPack。如果没有,可以在网上下载一个并添加到项目引用中。这个类库有很多版本:如果你的开发环境是VS2005,就使用2.0的类库;VS2010就使用4.0,以此类推。然后直接创建一个控制台应用,用下面的代码替换默认生成的代码即可运行。下面讲讲我两年前做爬虫的经历:当时是给一家公司做,用的也是C#,不过遇到一个头痛的问题——抓取到的图片带有病毒,导致系统挂了几次。所以抓取网站图片时要注意安全。虽然本文没有涉及图片,但还是提醒一下看文章的朋友。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
/// <summary>
/// Console crawler that downloads proxy listing pages from xicidaili.com on
/// thread-pool threads and collects every entry whose speed bar value is above
/// a threshold into <see cref="masterPorxyList"/>.
/// </summary>
class Program
{
    // Base address of the listing; the page number is appended to it.
    private const string ListUrlPrefix = "http://www.xicidaili.com/wt/";

    // Number of listing pages queued by Main (pages 1..PageCount).
    private const int PageCount = 15;

    // A proxy is kept only when its speed bar value is strictly greater than this.
    private const int SpeedThresholdPercent = 90;

    // Positions inside a row's ChildNodes collection. ChildNodes counts every
    // child node of the <tr>, not only <td> elements, so these are not plain
    // column numbers.
    private const int IpCellIndex = 3;
    private const int PortCellIndex = 5;
    private const int SpeedCellIndex = 13;

    // Guards masterPorxyList: List<T> is not safe for concurrent writers and
    // getProxyList runs on several thread-pool threads at once.
    private static readonly object masterProxyListLock = new object();

    /// <summary>
    /// All proxies collected so far. Writers inside this class synchronize on a
    /// private lock; external readers should only read it once crawling is done.
    /// </summary>
    public static List<proxy> masterPorxyList = new List<proxy>();

    /// <summary>One proxy entry scraped from the listing table.</summary>
    public class proxy
    {
        public string ip;
        public string port;
        public int speed;

        /// <summary>Creates an entry from the scraped values.</summary>
        /// <param name="pip">IP address text.</param>
        /// <param name="pport">Port text.</param>
        /// <param name="pspeed">Speed bar value parsed from the page.</param>
        public proxy(string pip, string pport, int pspeed)
        {
            this.ip = pip;
            this.port = pport;
            this.speed = pspeed;
        }
    }

    /// <summary>
    /// Thread-pool work item: downloads one listing page, parses its first table
    /// and stores every sufficiently fast proxy in <see cref="masterPorxyList"/>.
    /// Pages that cannot be downloaded or do not have the expected layout, and
    /// rows that cannot be parsed, are skipped instead of throwing, because an
    /// unhandled exception on a pool thread would terminate the process.
    /// </summary>
    /// <param name="pageIndex">Page number to fetch; appended to the listing URL.</param>
    static void getProxyList(object pageIndex)
    {
        string urlCombin = ListUrlPrefix + pageIndex.ToString();
        string catchHtml = catchProxIpMethord(urlCombin, "UTF8");
        if (String.IsNullOrEmpty(catchHtml))
        {
            // Download failed (already reported by catchProxIpMethord) or the page was empty.
            return;
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(catchHtml);

        HtmlNode table = doc.DocumentNode.SelectSingleNode("//div[@id='wrapper']//div[@id='body']/table[1]");
        if (table == null)
        {
            return;
        }

        HtmlNodeCollection rows = table.SelectNodes("./tr");
        if (rows == null)
        {
            return;
        }

        // The first row of the table is the header, so data starts at the second <tr>.
        for (int i = 1; i < rows.Count; i++)
        {
            HtmlNodeCollection cells = rows[i].ChildNodes;
            if (cells == null || cells.Count <= SpeedCellIndex)
            {
                continue;
            }

            int subSpeed;
            if (!tryParseSpeed(cells[SpeedCellIndex].InnerHtml, out subSpeed))
            {
                continue;
            }

            // A speed bar value above the threshold marks the proxy as fast.
            if (subSpeed > SpeedThresholdPercent)
            {
                string ip = cells[IpCellIndex].InnerText.Trim();
                string port = cells[PortCellIndex].InnerText.Trim();
                proxy temp = new proxy(ip, port, subSpeed);

                int count;
                lock (masterProxyListLock)
                {
                    masterPorxyList.Add(temp);
                    count = masterPorxyList.Count;
                }

                Console.WriteLine("当前是第:" + count.ToString() + "个代理IP");
            }
        }
    }

    /// <summary>
    /// Extracts the integer located between the first ':' and the first '%' of
    /// the speed cell markup.
    /// </summary>
    /// <param name="speedHtml">Inner HTML of the speed cell.</param>
    /// <param name="speed">Parsed value, or 0 when parsing fails.</param>
    /// <returns>true when a number was found and parsed; otherwise false.</returns>
    private static bool tryParseSpeed(string speedHtml, out int speed)
    {
        speed = 0;
        if (String.IsNullOrEmpty(speedHtml))
        {
            return false;
        }

        int beginIndex = speedHtml.IndexOf(':');
        int endIndex = speedHtml.IndexOf('%');
        if (beginIndex < 0 || endIndex <= beginIndex)
        {
            return false;
        }

        string digits = speedHtml.Substring(beginIndex + 1, endIndex - beginIndex - 1);
        return int.TryParse(
            digits,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out speed);
    }

    /// <summary>
    /// Downloads a page and returns its body as text.
    /// </summary>
    /// <param name="url">Address to download; null or empty yields an empty string.</param>
    /// <param name="encoding">"UTF8" decodes as UTF-8; any other value uses the system default encoding.</param>
    /// <returns>The page text, or an empty string when the download fails.</returns>
    static string catchProxIpMethord(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: callers treat "" as "no page", but the failure is reported
            // instead of being silently discarded.
            Console.Error.WriteLine("Failed to download " + url + ": " + ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Queues one work item per listing page and keeps the process alive until
    /// a key is read from the console.
    /// </summary>
    static void Main(string[] args)
    {
        // Fetch the pages concurrently on the thread pool.
        for (int i = 1; i <= PageCount; i++)
        {
            ThreadPool.QueueUserWorkItem(getProxyList, i);
        }

        Console.Read();
    }
}
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:http://www.cnblogs.com/xiaoliao/p/7436711.html?utm_source=tuicool&utm_medium=referral