上次帮公司妹子做了一个爬虫,不是很精致。这次公司项目里要用到,于是又做了一番修改,新增了网址图片采集、下载,以及线程处理界面网址图片下载等功能。
说说思路:首先获取初始网址的所有内容,在初始网址采集图片,再到初始网址采集链接,把采集到的链接放入队列,然后继续采集图片、继续采集链接,如此无限循环。
还是上图片大家看一下,
网页内容抓取和网页网址爬取的处理都做了改进。下面还是请大家来看看代码,有不足之处,还请指出!
网页内容抓取HtmlCodeRequest,
网页网址爬取GetHttpLinks,用正则去筛选html中的Links
图片抓取GetHtmlImageUrlList,用正则去筛选html中的Img
都写进了一个封装类里面 HttpHelper
// Matches <img ...> tags and captures the src attribute value in the "imgUrl" group.
private static readonly Regex ImgTagRegex = new Regex(
    @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
    RegexOptions.IgnoreCase);

// Matches absolute http/https URLs appearing anywhere in the page text.
private static readonly Regex AbsoluteLinkRegex = new Regex(
    @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
    RegexOptions.IgnoreCase);

// Matches <a ... href=...> tags (skipping javascript:/__doPostBack targets) and
// captures the href value in the "url" group.
private static readonly Regex AnchorHrefRegex = new Regex(
    @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__doPostBack)(?<url>[^'""\s*#<>]+)[^>]*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Downloads a web page with an HTTP GET request and returns its body as text.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when
/// <paramref name="Url"/> is null/empty or the request fails for any reason.
/// </returns>
public static string HtmlCodeRequest(string Url)
{
    if (string.IsNullOrEmpty(Url))
    {
        return "";
    }

    try
    {
        HttpWebRequest httprequst = (HttpWebRequest)WebRequest.Create(Url);
        // Keep-alive is enabled, so the underlying connection may be reused.
        httprequst.KeepAlive = true;
        httprequst.Method = "GET";
        // The UserAgent property takes only the header VALUE; it must not
        // repeat the "User-Agent:" header name.
        httprequst.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httprequst.Accept = "*/*";
        httprequst.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        httprequst.ServicePoint.Expect100Continue = false;
        httprequst.Timeout = 5000;
        // Follow HTTP redirects (e.g. 302) automatically.
        httprequst.AllowAutoRedirect = true;
        ServicePointManager.DefaultConnectionLimit = 30;

        // Dispose the response as well as the stream/reader so the connection
        // is released even when reading fails part-way through.
        using (HttpWebResponse webRes = (HttpWebResponse)httprequst.GetResponse())
        using (System.IO.Stream stream = webRes.GetResponseStream())
        using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure (bad URL, timeout, HTTP error)
        // is reported to the caller as "no content".
        return "";
    }
}

/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the src value of
/// every img tag found in its HTML.
/// </summary>
/// <param name="url">Address of the page to scan.</param>
/// <returns>
/// The image URLs in document order (duplicates are kept); an empty list
/// when the page could not be fetched or is empty.
/// </returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    List<string> sUrlList = new List<string>();

    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return sUrlList;
    }

    foreach (Match match in ImgTagRegex.Matches(html))
    {
        sUrlList.Add(match.Groups["imgUrl"].Value);
    }

    return sUrlList;
}

/// <summary>
/// Fetches the page at <paramref name="url"/> and collects the distinct links
/// it contains: first absolute http/https URLs found anywhere in the text,
/// then the href targets of anchor tags.
/// </summary>
/// <param name="url">Address of the page to scan.</param>
/// <returns>
/// The distinct links in order of first appearance; an empty list when the
/// page could not be fetched or is empty.
/// </returns>
public static List<string> GetHttpLinks(string url)
{
    List<string> links = new List<string>();

    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return links;
    }

    // Tracks what is already in 'links' so duplicate checks stay O(1);
    // the default comparer is ordinal, matching List<string>.Contains.
    HashSet<string> seen = new HashSet<string>();

    foreach (Match absoluteMatch in AbsoluteLinkRegex.Matches(html))
    {
        string candidate = absoluteMatch.Value;

        // NOTE(review): a URL is skipped when CheckUrlIsLegal returns TRUE.
        // The helper is defined elsewhere; confirm this polarity is intended.
        if (StringHelper.CheckUrlIsLegal(candidate) || !StringHelper.IsPureUrl(candidate))
        {
            continue;
        }

        if (seen.Add(candidate))
        {
            links.Add(candidate);
        }
    }

    foreach (Match anchorMatch in AnchorHrefRegex.Matches(html))
    {
        string href = anchorMatch.Groups["url"].Value;

        // Only hrefs that do not START with "http" are relative; a plain
        // Contains check would misclassify paths such as "/docs/http-guide".
        if (!href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            href = Global.WebUrl + href;
        }

        if (!StringHelper.IsPureUrl(href))
        {
            continue;
        }

        if (seen.Add(href))
        {
            links.Add(href);
        }
    }

    return links;
}
这边下载图片有个任务条数限制,限制是200条。如果超过的话,线程等待5秒。这里下载图片是通过委托异步调用的。
/// <summary>
/// Downloads the image at <paramref name="url"/> and saves it under
/// Global.FloderUrl using a freshly generated GUID as the file name.
/// </summary>
/// <param name="url">
/// Image address. When it does not start with "http" it is treated as
/// relative and Global.WebUrl is prepended.
/// </param>
/// <returns>
/// The saved file name (GUID plus the extension taken from the URL path, if
/// any) on success; otherwise an error message beginning with "错误:".
/// </returns>
public string DownLoadimg(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "错误:地址为空";
    }

    try
    {
        // Only URLs that do not START with "http" are relative; a Contains
        // check would misclassify paths such as "/img/http-logo.png".
        if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            url = Global.WebUrl + url;
        }

        // Take the extension from the URI path only, so query strings
        // ("a.jpg?w=100") and extension-less paths ("example.com/image")
        // do not leak invalid characters into the file name.
        string extension = Path.GetExtension(new Uri(url).AbsolutePath);
        string fileName = Guid.NewGuid().ToString() + extension;

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 2000;
        // The UserAgent property takes only the header VALUE; it must not
        // repeat the "User-Agent:" header name.
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        // Follow HTTP redirects (e.g. 302) automatically.
        request.AllowAutoRedirect = true;

        // using-blocks guarantee the response, network stream and file are
        // closed even when the transfer fails part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream reader = response.GetResponseStream())
        using (FileStream writer = new FileStream(Global.FloderUrl + fileName, FileMode.Create, FileAccess.Write))
        {
            byte[] buff = new byte[4096];
            int bytesRead;
            while ((bytesRead = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, bytesRead);
            }
        }

        return fileName;
    }
    catch (Exception)
    {
        // Deliberate best-effort: report the failing address instead of
        // letting one bad image abort the caller.
        return "错误:地址" + url;
    }
}
话不多说,更多的需要大家自己去改进咯!欢迎读者来与楼主进行交流。