频道栏目
首页 > 资讯 > ASP.Net > 正文

用DOM实现文章采集--采集到网页源码

12-04-27        来源:winner2050的专栏  
收藏   我要投稿
先来个采集网页的代码。
[csharp]
using System; 
using System.Collections.Generic; 
using System.IO; 
using System.IO.Compression; 
using System.Net; 
using System.Text; 
namespace TopWinCMS.Common 

    public class NetHelper 
    { 
 
        //private string _HTTP_USER_AGENT = "Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.2;+SV1;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727)"; 
        private string _UserAgent = "Googlebot/2.1 (+http://www.google.com/bot.html)"; 
        private Encoding _HttpEncoding = null; 
        private string _ProxyHost = string.Empty; 
        private int _ProxyInt = 8080; 
        private int _TimeOut = 200000; 
 
        #region 属性 
        /// <summary> 
        /// 设置UserAgent 
        /// </summary> 
        public string UserAgent 
        { 
            get 
            { 
                return this._UserAgent; 
            } 
            set 
            { 
                this._UserAgent = value; 
            } 
        } 
        /// <summary> 
        /// 设置编码 
        /// </summary> 
        public Encoding HttpEncoding 
        { 
            get 
            { 
                return this._HttpEncoding; 
            } 
            set 
            { 
                this._HttpEncoding = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器 
        /// </summary> 
        public string ProxyHost 
        { 
            get 
            { 
                return this._ProxyHost; 
            } 
            set 
            { 
                this._ProxyHost = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器端口 
        /// </summary> 
        public int ProxyInt 
        { 
            get 
            { 
                return this._ProxyInt; 
            } 
            set 
            { 
                this._ProxyInt = value; 
            } 
        } 
        /// <summary> 
        /// 设置默认超时时间 
        /// </summary> 
        public int TimeOut 
        { 
            get 
            { 
                return this._TimeOut; 
            } 
            set 
            { 
                this._TimeOut = value; 
            } 
        } 
        #endregion 
 
        public RemoteRes Get(string uri) 
        { 
            return Get(new Uri(uri)); 
        } 
        public RemoteRes Get(Uri uri) 
        { 
            RemoteRes info = new RemoteRes(); 
 
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri); 
            request.Timeout = this._TimeOut; 
            request.UserAgent = this._UserAgent; 
            request.Method = "GET";  
            request.Referer = string.Concat("http://", uri.Host); 
 
            if (this._ProxyHost.Length > 0) 
            { 
                request.Proxy = new WebProxy(this._ProxyHost, this._ProxyInt); 
            } 
            HttpWebResponse response = null; 
            Stream responseStream = null; 
            try 
            { 
                Encoding encoding; 
                response = (HttpWebResponse)request.GetResponse(); 
                responseStream = response.GetResponseStream(); 
               
                if (response.Headers["Accept-Encoding"] != null) 
                { 
                    if (MyCollections.Contain(response.Headers["Accept-Encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responseStream = new GZipStream(responseStream, CompressionMode.Decompress); 
                    } 
                } 
                else if (response.Headers["Content-Encoding"] != null) 
                { 
                    if (MyCollections.Contain(response.Headers["Content-Encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responseStream = new GZipStream(responseStream, CompressionMode.Decompress); 
                    } 
                } 
                
                if (this._HttpEncoding == null) 
                { 
                    string str = response.CharacterSet.ToLower(); 
                    if (str.Length > 3) 
                    { 
                        if (str.Substring(0, 3) == "iso") 
                        { 
                            encoding = Encoding.Default; 
                        } 
                        else 
                        { 
                            encoding = Encoding.GetEncoding(response.CharacterSet); 
                        } 
                    } 
                    else 
                    { 
                        encoding = Encoding.GetEncoding(response.CharacterSet); 
                    } 
                    if (str.Length == 0) 
                    { 
                        encoding = Encoding.UTF8; 
                    } 
                } 
                else 
                { 
                    encoding = this._HttpEncoding; 
                } 
                info.HTML = new StreamReader(responseStream, encoding).ReadToEnd(); 
                info.ContentType = response.ContentType; 
                info.StatusCode = response.StatusCode; 
 
            } 
            catch (WebException WE) 
            { 
                if (WE.Response != null) 
                { 
                    info.StatusCode = (WE.Response as HttpWebResponse).StatusCode; 
                } 
                else 
                { 
                    info.StatusCode = HttpStatusCode.ServiceUnavailable; 
                } 
                info.Code = "错误:" + WE.Message; 
 
            } 
            catch (Exception ex) 
            { 
                info.Code = "错误:" + ex.Message; 
                info.StatusCode = HttpStatusCode.InternalServerError; 
            } 
            finally 
            { 
                if (responseStream != null) 
                    responseStream.Close(); 
                if (response != null) 
                    response.Close(); 
            } 
 
            return info; 
        } 
 
        #region 取得远程资源 
        /// <summary> 
        /// 取得远程资源 www.2cto.com  
        /// </summary> 
        /// <param name="strUrl">要取的URL</param> 
        /// <returns>网页源代码</returns> 
        public RemoteRes GetRemoteResource(string strUrl) 
        { 
            HttpWebResponse response = null; 
            Stream stream = null; 
            RemoteRes info = new RemoteRes(); 
            try 
            { 
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl); 
                request.AllowAutoRedirect = true; 
                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506)"; 
                request.Referer = "http://" + new Uri(strUrl).Host; 
                response = request.GetResponse() as HttpWebResponse; 
                stream = response.GetResponseStream(); 
                info.ContentType = response.ContentType; 
                MemoryStream ms = new MemoryStream(); 
 
                byte[] buffer = new byte[256]; 
 
                int c = stream.Read(buffer, 0, buffer.Length); 
 
                while (c > 0) 
                { 
                    ms.Write(buffer, 0, c); 
                    c = stream.Read(buffer, 0, buffer.Length); 
                } 
                stream.Close(); 
 
                info.StatusCode = response.StatusCode; 
 
                info.Bytes = ms.ToArray(); 
 
            } 
            catch (WebException WE) 
            { 
                if (WE.Response != null) 
                { 
                    info.StatusCode = (WE.Response as HttpWebResponse).StatusCode; 
                } 
                else 
                { 
                    info.StatusCode = HttpStatusCode.ServiceUnavailable; 
                } 
 
                return null; 
            } 
            catch 
            { 
                info.StatusCode = HttpStatusCode.InternalServerError; 
 
                return null; 
            } 
            finally 
            { 
                if (stream != null) 
                    stream.Close(); 
 
                if (response != null) 
                    response.Close(); 
            } 
            return info; 
        } 
        #endregion 
 
 
        public RemoteRes Post(string strUrl, string postData) 
        { 
            RemoteRes info = new RemoteRes(); 
            Stream responseStream = null; 
            HttpWebResponse response = null; 
            try 
            { 
                byte[] bytes = this._HttpEncoding.GetBytes(postData); 
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl); 
                request.Method = "POST"; 
                request.ContentType = "application/x-www-form-urlencoded"; 
                request.ContentLength = bytes.Length; 
                request.Timeout = this._TimeOut; 
                request.UserAgent = this._UserAgent;  
                //request.Referer = string.Concat("http://", uri.Host); 
                if (this._ProxyHost.Length > 0) 
                { 
                    request.Proxy = new WebProxy(this._ProxyHost, this._ProxyInt); 
                } 
                using (Stream requestStream = request.GetRequestStream()) 
                { 
                    requestStream.Write(bytes, 0, bytes.Length); 
                    requestStream.Close(); 
                } 
                try 
                { 
                    Encoding encoding; 
                    response = (HttpWebResponse)request.GetResponse(); 
                    responseStream = response.GetResponseStream(); 
                    if (this._HttpEncoding == null) 
                    { 
                        string str = response.CharacterSet.ToLower(); 
                        if (str.Length > 3) 
                        { 
                            if (str.Substring(0, 3) == "iso") 
                            { 
                                encoding = Encoding.Default; 
                            } 
                            else 
                            { 
                                encoding = Encoding.GetEncoding(response.CharacterSet); 
                            } 
                        } 
                        else 
                        { 
                            encoding = Encoding.GetEncoding(response.CharacterSet); 
                        } 
                        if (str.Length == 0) 
                        { 
                            encoding = Encoding.Default; 
                        } 
                    } 
                    else 
                    { 
                        encoding = this._HttpEncoding; 
                    } 
                    info.HTML = new StreamReader(responseStream, encoding).ReadToEnd(); 
                    info.StatusCode = HttpStatusCode.OK; 
 
                    responseStream.Close(); 
                    response.Close(); 
                    return info; 
                } 
                catch (Exception ex) 
                { 
                    info.HTML = "错误:" + ex.Message; 
                } 
 
            } 
            catch (Exception ex) 
            { 
                info.HTML = "错误:" + ex.Message; 
            } 
            finally 
            { 
                if (responseStream != null) 
                    responseStream.Close(); 
                if (response != null) 
                    response.Close(); 
            } 
            return info; 
        } 
 
        #region 检查链接 
        /// <summary> 
        /// 检查链接是否存在 
        /// </summary> 
        /// <param name="sURL"></param> 
        /// <param name="AllowBadNum"></param> 
        public bool UrlExist(string strURL) 
        { 
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL); 
            request.Method = "HEAD"; 
            request.AllowAutoRedirect = false; 
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)"; 
            HttpWebResponse response = (HttpWebResponse)request.GetResponse(); 
            if (response.StatusCode != HttpStatusCode.OK) 
            { 
                response.Close(); 
                return false; 
            } 
            else 
            { 
                return true; 
            } 
 
 
        } 
        /// <summary> 
        /// 检查死链接是否在能容忍的数量内 
        /// </summary> 
        /// <param name="URLs"></param> 
        /// <param name="AllowBadNum"></param> 
        /// <returns></returns> 
        public bool UrlExist(List<string> URLs, int AllowBadNum) 
        { 
            //如果图片的数量小于能容忍的数量就不用检查了。 
            if (URLs.Count <= AllowBadNum) 
            { 
                return true; 
            } 
            int intTemp = 0; 
            foreach (string strUrl in URLs) 
            { 
                if (UrlExist(strUrl) == false) 
                { 
                    intTemp++; 
                    if (intTemp > AllowBadNum) 
                    { 
                        return false; 
                    } 
                } 
            } 
            return true; 
        } 
        #endregion 
    } 
 
    public class RemoteRes 
    { 
        private string _code; 
        private string _html; 
        private byte[] _bytes; 
        private string _ContentType; 
        private HttpStatusCode _StatusCode; 
        /// <summary> 
        /// 返回信息的代码 
        /// </summary> 
        public string Code 
        { 
            get 
            { 
                return this._code; 
            } 
            set 
            { 
                this._code = value; 
            } 
        } 
        /// <summary> 
        /// 信息 
        /// </summary> 
        public string HTML 
        { 
            get 
            { 
                return this._html; 
            } 
            set 
            { 
                this._html = value; 
            } 
        } 
        /// <summary> 
        /// 远程资源 
        /// </summary> 
        public byte[] Bytes 
        { 
            get 
            { 
                return this._bytes; 
            } 
            set 
            { 
                this._bytes = value; 
            } 
        } 
        /// <summary> 
        /// 内容类型 
        /// </summary> 
        public string ContentType 
        { 
            get 
            { 
                return this._ContentType; 
            } 
            set 
            { 
                this._ContentType = value; 
            } 
        } 
        /// <summary> 
        /// 状态代码 
        /// </summary> 
        public HttpStatusCode StatusCode 
        { 
            get 
            { 
                return this._StatusCode; 
            } 
            set 
            { 
                this._StatusCode = value; 
            } 
        } 
    } 




摘自 winner2050的专栏
相关TAG标签
上一篇:Windows下硬链接、软链接和快捷方式的区别
下一篇:修复IIS ISAPI Extension Enumerate Root Web Server Directory漏洞
相关文章
图文推荐

关于我们 | 联系我们 | 广告服务 | 投资合作 | 版权申明 | 在线帮助 | 网站地图 | 作品发布 | Vip技术培训 | 举报中心

版权所有: 红黑联盟--致力于做实用的IT技术学习网站