C#多线程爬虫抓取免费代理IP的示例代码_C#

C#多线程爬虫抓取免费代理IP的示例代码

2022-01-20 14:14L-H C#

本篇文章主要介绍了C#多线程爬虫抓取免费代理IP的示例代码，小编觉得挺不错的，现在分享给大家，也给大家做个参考。一起跟随小编过来看看吧

这里用到一个HTML解析辅助类库：HtmlAgilityPack。如果没有，可以在网上下载一个并添加到项目引用中。这个类库有很多版本：如果你的开发环境是VS2005，就使用2.0的类库；VS2010就使用4.0的类库，以此类推。然后直接创建一个控制台应用，将下面的代码复制进去替换原有代码就可以运行。下面顺便讲讲我两年前做爬虫的经历：当时是给一家公司做，也是用的C#，不过遇到一个头痛的问题，就是抓取的图片带有病毒，导致系统挂了几次。所以抓取网站图片时要注意安全，虽然本文没有涉及图片，但还是提醒一下看文章的朋友。

				?

									/// <summary>
									/// Console program that downloads 15 proxy-list pages in parallel on the thread
									/// pool, parses each HTML table with HtmlAgilityPack and collects the proxies
									/// whose speed bar is above a threshold into <see cref="masterPorxyList"/>.
									/// </summary>
									class Program
									{
									    /// <summary>
									    /// All proxies accepted so far. Worker threads append to this list while
									    /// holding <see cref="masterPorxyListLock"/>.
									    /// </summary>
									    public static List<proxy> masterPorxyList = new List<proxy>();

									    // List<T> does not support concurrent writers, and getProxyList runs on
									    // several thread-pool threads at once, so every access made by this class
									    // goes through this lock.
									    private static readonly object masterPorxyListLock = new object();

									    // Positions inside a row's ChildNodes collection that hold the IP, port and
									    // speed cells. NOTE(review): the gaps between the indexes presumably come
									    // from whitespace text nodes between the <td> elements - confirm against
									    // the live page markup.
									    private const int IpCellIndex = 3;
									    private const int PortCellIndex = 5;
									    private const int SpeedCellIndex = 13;

									    // A proxy is kept only when its speed-bar value is strictly greater than this.
									    private const int MinSpeedPercent = 90;

									    /// <summary>
									    /// One scraped proxy entry: address, port and the speed-bar value.
									    /// </summary>
									    public class proxy
									    {
									        public string ip;
									        public string port;
									        public int speed;

									        /// <summary>Creates a proxy entry from the scraped values.</summary>
									        /// <param name="pip">IP address text taken from the table cell.</param>
									        /// <param name="pport">Port text taken from the table cell.</param>
									        /// <param name="pspeed">Integer parsed from the speed cell.</param>
									        public proxy(string pip, string pport, int pspeed)
									        {
									            this.ip = pip;
									            this.port = pport;
									            this.speed = pspeed;
									        }
									    }

									    /// <summary>
									    /// Thread-pool work item: downloads one listing page, walks the rows of the
									    /// first table and stores every proxy whose speed value exceeds
									    /// <see cref="MinSpeedPercent"/>.
									    /// </summary>
									    /// <param name="pageIndex">Page number; its ToString() is appended to the listing URL.</param>
									    /// <remarks>
									    /// Pages that fail to download or do not have the expected layout are skipped
									    /// instead of throwing: an unhandled exception on a thread-pool thread would
									    /// terminate the whole process.
									    /// </remarks>
									    static void getProxyList(object pageIndex)
									    {
									        string urlCombin = "http://www.xicidaili.com/wt/" + pageIndex.ToString();
									        string catchHtml = catchProxIpMethord(urlCombin, "UTF8");
									        if (String.IsNullOrEmpty(catchHtml))
									        {
									            // The download failed (already reported by catchProxIpMethord) or the
									            // page was empty; there is nothing to parse.
									            return;
									        }

									        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
									        doc.LoadHtml(catchHtml);

									        HtmlNode table = doc.DocumentNode.SelectSingleNode("//div[@id='wrapper']//div[@id='body']/table[1]");
									        if (table == null)
									        {
									            // Layout changed, or an error/anti-bot page was served instead.
									            return;
									        }

									        // HtmlAgilityPack returns null rather than an empty collection when the
									        // XPath expression matches nothing.
									        HtmlNodeCollection collectiontrs = table.SelectNodes("./tr");
									        if (collectiontrs == null)
									        {
									            return;
									        }

									        // Row 0 is the header row, so data starts at row 1.
									        for (int i = 1; i < collectiontrs.Count; i++)
									        {
									            HtmlNodeCollection collectiontds = collectiontrs[i].ChildNodes;
									            if (collectiontds.Count <= SpeedCellIndex)
									            {
									                // Row is shorter than expected; skip it instead of indexing out of range.
									                continue;
									            }

									            string ip = collectiontds[IpCellIndex].InnerText.Trim();
									            string port = collectiontds[PortCellIndex].InnerText.Trim();

									            int subSpeed;
									            if (!tryParseSpeedPercent(collectiontds[SpeedCellIndex].InnerHtml, out subSpeed))
									            {
									                continue;
									            }

									            // A speed-bar value above the threshold marks the proxy as fast.
									            if (subSpeed > MinSpeedPercent)
									            {
									                int total;
									                lock (masterPorxyListLock)
									                {
									                    masterPorxyList.Add(new proxy(ip, port, subSpeed));
									                    total = masterPorxyList.Count;
									                }

									                // Written outside the lock so console output does not serialize the workers.
									                Console.WriteLine("当前是第:" + total.ToString() + "个代理IP");
									            }
									        }
									    }

									    /// <summary>
									    /// Extracts the integer located between the first ':' and the next '%' of the
									    /// speed cell's inner HTML.
									    /// </summary>
									    /// <param name="speedHtml">Inner HTML of the speed cell.</param>
									    /// <param name="percent">Receives the parsed value; 0 when parsing fails.</param>
									    /// <returns>true when both delimiters were found and the text between them is an integer.</returns>
									    private static bool tryParseSpeedPercent(string speedHtml, out int percent)
									    {
									        percent = 0;
									        if (String.IsNullOrEmpty(speedHtml))
									        {
									            return false;
									        }

									        int beginIndex = speedHtml.IndexOf(':');
									        if (beginIndex < 0)
									        {
									            return false;
									        }

									        int endIndex = speedHtml.IndexOf('%', beginIndex + 1);
									        if (endIndex < 0)
									        {
									            return false;
									        }

									        string digits = speedHtml.Substring(beginIndex + 1, endIndex - beginIndex - 1).Trim();
									        return int.TryParse(digits, out percent);
									    }

									    /// <summary>
									    /// Downloads <paramref name="url"/> and returns the response body as text.
									    /// </summary>
									    /// <param name="url">Address to fetch; null or empty yields an empty string.</param>
									    /// <param name="encoding">"UTF8" decodes the body as UTF-8; any other value uses Encoding.Default.</param>
									    /// <returns>The page text, or an empty string when the request fails.</returns>
									    /// <remarks>
									    /// Best-effort by design: failures are written to standard error and reported
									    /// to the caller as an empty string instead of an exception.
									    /// </remarks>
									    static string catchProxIpMethord(string url, string encoding)
									    {
									        string htmlStr = "";
									        if (String.IsNullOrEmpty(url))
									        {
									            return htmlStr;
									        }

									        try
									        {
									            Encoding ec = (encoding == "UTF8") ? Encoding.UTF8 : Encoding.Default;

									            WebRequest request = WebRequest.Create(url);
									            // using blocks release the connection even when reading throws.
									            using (WebResponse response = request.GetResponse())
									            using (Stream datastream = response.GetResponseStream())
									            using (StreamReader reader = new StreamReader(datastream, ec))
									            {
									                htmlStr = reader.ReadToEnd();
									            }
									        }
									        catch (Exception ex)
									        {
									            // Keep the "empty string on failure" contract, but leave a trace so a
									            // failed page is not silently indistinguishable from an empty one.
									            Console.Error.WriteLine("Failed to fetch " + url + ": " + ex.Message);
									        }

									        return htmlStr;
									    }

									    /// <summary>
									    /// Queues pages 1 through 15 on the thread pool, then blocks on console input
									    /// so the process stays alive while the background workers run.
									    /// </summary>
									    static void Main(string[] args)
									    {
									        for (int i = 1; i <= 15; i++)
									        {
									            ThreadPool.QueueUserWorkItem(getProxyList, i);
									        }

									        Console.Read();
									    }
									}