先来个采集网页的代码。
[csharp]
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
namespace TopWinCMS.Common
{
/// <summary>
/// Small synchronous HTTP helper built on <see cref="HttpWebRequest"/>:
/// fetches pages as text (<see cref="Get(string)"/>, <see cref="Post"/>),
/// downloads raw bytes (<see cref="GetRemoteResource"/>) and probes links
/// (<see cref="UrlExist(string)"/>).
/// </summary>
/// <remarks>
/// <see cref="UserAgent"/>, <see cref="ProxyHost"/>, <see cref="ProxyInt"/> and
/// <see cref="TimeOut"/> are applied by <c>Get</c> and <c>Post</c> only;
/// <c>GetRemoteResource</c> and <c>UrlExist</c> use their own hard-coded
/// User-Agent and the framework's default proxy/timeout settings.
/// </remarks>
public class NetHelper
{
    private string _UserAgent = "Googlebot/2.1 (+http://www.google.com/bot.html)";
    private Encoding _HttpEncoding = null;
    private string _ProxyHost = string.Empty;
    private int _ProxyInt = 8080;
    private int _TimeOut = 200000;

    #region Properties

    /// <summary>
    /// Gets or sets the User-Agent header sent by <c>Get</c> and <c>Post</c>.
    /// </summary>
    public string UserAgent
    {
        get { return this._UserAgent; }
        set { this._UserAgent = value; }
    }

    /// <summary>
    /// Gets or sets the encoding used to decode response bodies and to encode
    /// the <c>Post</c> request body. When null (the default), the response
    /// charset is used for decoding and UTF-8 for the <c>Post</c> body.
    /// </summary>
    public Encoding HttpEncoding
    {
        get { return this._HttpEncoding; }
        set { this._HttpEncoding = value; }
    }

    /// <summary>
    /// Gets or sets the proxy host. Null or empty means no explicit proxy is set
    /// on the request.
    /// </summary>
    public string ProxyHost
    {
        get { return this._ProxyHost; }
        set { this._ProxyHost = value; }
    }

    /// <summary>
    /// Gets or sets the proxy port used together with <see cref="ProxyHost"/>.
    /// </summary>
    public int ProxyInt
    {
        get { return this._ProxyInt; }
        set { this._ProxyInt = value; }
    }

    /// <summary>
    /// Gets or sets the value assigned to <see cref="HttpWebRequest.Timeout"/>
    /// (milliseconds) by <c>Get</c> and <c>Post</c>.
    /// </summary>
    public int TimeOut
    {
        get { return this._TimeOut; }
        set { this._TimeOut = value; }
    }

    #endregion

    #region Private helpers

    /// <summary>
    /// Maps a response charset name to an <see cref="Encoding"/>, returning
    /// <paramref name="fallback"/> when the name is missing, empty or unknown.
    /// </summary>
    private static Encoding ResolveEncoding(string characterSet, Encoding fallback)
    {
        if (characterSet == null)
        {
            return fallback;
        }

        string name = characterSet.Trim();
        if (name.Length == 0)
        {
            // Must be tested before GetEncoding: an empty name makes it throw.
            return fallback;
        }

        // Kept from the original logic: any "iso..." charset is decoded with
        // Encoding.Default instead of being looked up.
        // NOTE(review): presumably because ISO-8859-1 is what gets reported when
        // the server declared no real charset - confirm.
        // Ordinal comparison avoids culture-specific lower-casing of 'I'.
        if (name.Length > 3 && name.StartsWith("iso", StringComparison.OrdinalIgnoreCase))
        {
            return Encoding.Default;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset name sent by the server.
            return fallback;
        }
        catch (NotSupportedException)
        {
            return fallback;
        }
    }

    /// <summary>
    /// Returns true when a Content-Encoding header value names gzip
    /// ("gzip" or "x-gzip", case-insensitive).
    /// </summary>
    private static bool IsGzipEncoded(string contentEncoding)
    {
        return !string.IsNullOrEmpty(contentEncoding)
            && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;
    }

    /// <summary>
    /// Extracts the HTTP status from a failed request and closes the error
    /// response. Yields ServiceUnavailable when no HTTP response is attached.
    /// </summary>
    private static HttpStatusCode GetStatusCode(WebException error)
    {
        WebResponse failed = error.Response;
        if (failed == null)
        {
            return HttpStatusCode.ServiceUnavailable;
        }

        try
        {
            HttpWebResponse http = failed as HttpWebResponse;
            return http != null ? http.StatusCode : HttpStatusCode.ServiceUnavailable;
        }
        finally
        {
            // The error response holds the connection until it is closed.
            failed.Close();
        }
    }

    #endregion

    /// <summary>
    /// Fetches <paramref name="uri"/> with GET and returns the body as text.
    /// </summary>
    /// <param name="uri">Absolute URL to fetch.</param>
    /// <returns>See <see cref="Get(Uri)"/>.</returns>
    public RemoteRes Get(string uri)
    {
        return Get(new Uri(uri));
    }

    /// <summary>
    /// Fetches <paramref name="uri"/> with GET and returns the body as text.
    /// </summary>
    /// <param name="uri">Absolute HTTP(S) URL to fetch.</param>
    /// <returns>
    /// A <see cref="RemoteRes"/> with <c>HTML</c>, <c>ContentType</c> and
    /// <c>StatusCode</c> filled on success. On failure <c>Code</c> holds the error
    /// message and <c>StatusCode</c> is the server's status, ServiceUnavailable
    /// when no response was received, or InternalServerError for other errors.
    /// </returns>
    public RemoteRes Get(Uri uri)
    {
        RemoteRes info = new RemoteRes();
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
        request.Timeout = this._TimeOut;
        request.UserAgent = this._UserAgent;
        request.Method = "GET";
        request.Referer = string.Concat("http://", uri.Host);
        if (!string.IsNullOrEmpty(this._ProxyHost))
        {
            request.Proxy = new WebProxy(this._ProxyHost, this._ProxyInt);
        }

        HttpWebResponse response = null;
        Stream responseStream = null;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
            responseStream = response.GetResponseStream();

            // Only Content-Encoding describes how the response body is encoded.
            if (IsGzipEncoded(response.Headers["Content-Encoding"]))
            {
                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
            }

            Encoding encoding = this._HttpEncoding != null
                ? this._HttpEncoding
                : ResolveEncoding(response.CharacterSet, Encoding.UTF8);

            using (StreamReader reader = new StreamReader(responseStream, encoding))
            {
                info.HTML = reader.ReadToEnd();
                info.ContentType = response.ContentType;
                info.StatusCode = response.StatusCode;
            }
        }
        catch (WebException WE)
        {
            info.StatusCode = GetStatusCode(WE);
            info.Code = "错误:" + WE.Message;
        }
        catch (Exception ex)
        {
            info.Code = "错误:" + ex.Message;
            info.StatusCode = HttpStatusCode.InternalServerError;
        }
        finally
        {
            if (responseStream != null)
            {
                responseStream.Close();
            }
            if (response != null)
            {
                response.Close();
            }
        }
        return info;
    }

    #region Remote resource download

    /// <summary>
    /// Downloads the resource at <paramref name="strUrl"/> as raw bytes.
    /// </summary>
    /// <param name="strUrl">URL of the resource to download.</param>
    /// <returns>
    /// A <see cref="RemoteRes"/> with <c>Bytes</c>, <c>ContentType</c> and
    /// <c>StatusCode</c> filled, or <c>null</c> when the request fails for any
    /// reason (callers must check for null).
    /// </returns>
    public RemoteRes GetRemoteResource(string strUrl)
    {
        HttpWebResponse response = null;
        Stream stream = null;
        RemoteRes info = new RemoteRes();
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
            request.AllowAutoRedirect = true;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506)";
            request.Referer = "http://" + new Uri(strUrl).Host;

            response = (HttpWebResponse)request.GetResponse();
            stream = response.GetResponseStream();
            info.ContentType = response.ContentType;

            using (MemoryStream ms = new MemoryStream())
            {
                byte[] buffer = new byte[8192];
                int read;
                while ((read = stream.Read(buffer, 0, buffer.Length)) > 0)
                {
                    ms.Write(buffer, 0, read);
                }
                info.Bytes = ms.ToArray();
            }
            info.StatusCode = response.StatusCode;
        }
        catch (WebException WE)
        {
            // Failure is signalled by a null result; just release the error response.
            if (WE.Response != null)
            {
                WE.Response.Close();
            }
            return null;
        }
        catch
        {
            // Any other failure (bad URL, non-HTTP scheme, I/O error) also yields null.
            return null;
        }
        finally
        {
            if (stream != null)
            {
                stream.Close();
            }
            if (response != null)
            {
                response.Close();
            }
        }
        return info;
    }

    #endregion

    /// <summary>
    /// Sends <paramref name="postData"/> as an
    /// <c>application/x-www-form-urlencoded</c> POST body and returns the
    /// response body as text.
    /// </summary>
    /// <param name="strUrl">URL to post to.</param>
    /// <param name="postData">Already URL-encoded form data.</param>
    /// <returns>
    /// A <see cref="RemoteRes"/> with <c>HTML</c>, <c>ContentType</c> and the
    /// response's <c>StatusCode</c> on success. On failure <c>HTML</c> and
    /// <c>Code</c> both hold the error message and <c>StatusCode</c> reflects
    /// the failure.
    /// </returns>
    public RemoteRes Post(string strUrl, string postData)
    {
        RemoteRes info = new RemoteRes();
        Stream responseStream = null;
        HttpWebResponse response = null;
        try
        {
            // HttpEncoding is null by default; fall back to UTF-8 for the body
            // instead of failing with a NullReferenceException.
            Encoding requestEncoding = this._HttpEncoding != null ? this._HttpEncoding : Encoding.UTF8;
            byte[] bytes = requestEncoding.GetBytes(postData);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = bytes.Length;
            request.Timeout = this._TimeOut;
            request.UserAgent = this._UserAgent;
            if (!string.IsNullOrEmpty(this._ProxyHost))
            {
                request.Proxy = new WebProxy(this._ProxyHost, this._ProxyInt);
            }

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(bytes, 0, bytes.Length);
            }

            response = (HttpWebResponse)request.GetResponse();
            responseStream = response.GetResponseStream();

            Encoding encoding = this._HttpEncoding != null
                ? this._HttpEncoding
                : ResolveEncoding(response.CharacterSet, Encoding.Default);

            using (StreamReader reader = new StreamReader(responseStream, encoding))
            {
                info.HTML = reader.ReadToEnd();
                info.ContentType = response.ContentType;
                info.StatusCode = response.StatusCode;
            }
        }
        catch (WebException WE)
        {
            info.StatusCode = GetStatusCode(WE);
            info.Code = "错误:" + WE.Message;
            // The message is also kept in HTML, where this method has always put it.
            info.HTML = info.Code;
        }
        catch (Exception ex)
        {
            info.StatusCode = HttpStatusCode.InternalServerError;
            info.Code = "错误:" + ex.Message;
            info.HTML = info.Code;
        }
        finally
        {
            if (responseStream != null)
            {
                responseStream.Close();
            }
            if (response != null)
            {
                response.Close();
            }
        }
        return info;
    }

    #region Link checking

    /// <summary>
    /// Checks whether a URL answers a HEAD request with 200 OK.
    /// </summary>
    /// <param name="strURL">URL to probe.</param>
    /// <returns>
    /// True only for a 200 OK response. Redirects are not followed, so a 3xx
    /// answer yields false, as do HTTP error statuses, timeouts and connection
    /// failures.
    /// </returns>
    public bool UrlExist(string strURL)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
        request.Method = "HEAD";
        request.AllowAutoRedirect = false;
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

        HttpWebResponse response = null;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
            return response.StatusCode == HttpStatusCode.OK;
        }
        catch (WebException WE)
        {
            // GetResponse throws for 4xx/5xx and network failures: the link is dead.
            if (WE.Response != null)
            {
                WE.Response.Close();
            }
            return false;
        }
        finally
        {
            // Always release the connection, on the success path too.
            if (response != null)
            {
                response.Close();
            }
        }
    }

    /// <summary>
    /// Checks whether the number of dead links in <paramref name="URLs"/> stays
    /// within <paramref name="AllowBadNum"/>.
    /// </summary>
    /// <param name="URLs">URLs to probe with <see cref="UrlExist(string)"/>.</param>
    /// <param name="AllowBadNum">Maximum number of dead links tolerated.</param>
    /// <returns>
    /// True when at most <paramref name="AllowBadNum"/> links are dead; false as
    /// soon as one more than that has been found.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="URLs"/> is null.</exception>
    public bool UrlExist(List<string> URLs, int AllowBadNum)
    {
        if (URLs == null)
        {
            throw new ArgumentNullException("URLs");
        }

        // With no more URLs than tolerated failures the limit cannot be exceeded,
        // so skip the network probes entirely.
        if (URLs.Count <= AllowBadNum)
        {
            return true;
        }

        int deadCount = 0;
        foreach (string strUrl in URLs)
        {
            if (!UrlExist(strUrl))
            {
                deadCount++;
                if (deadCount > AllowBadNum)
                {
                    return false;
                }
            }
        }
        return true;
    }

    #endregion
}
/// <summary>
/// Result of a remote request: the text or binary payload plus response
/// metadata and an optional error message.
/// </summary>
public class RemoteRes
{
    private HttpStatusCode _statusCode;
    private string _contentType;
    private byte[] _payload;
    private string _markup;
    private string _message;

    /// <summary>
    /// Message describing the outcome of the request (e.g. an error message).
    /// </summary>
    public string Code
    {
        get { return _message; }
        set { _message = value; }
    }

    /// <summary>
    /// Response body as text.
    /// </summary>
    public string HTML
    {
        get { return _markup; }
        set { _markup = value; }
    }

    /// <summary>
    /// Response body as raw bytes.
    /// </summary>
    public byte[] Bytes
    {
        get { return _payload; }
        set { _payload = value; }
    }

    /// <summary>
    /// Content type reported for the response.
    /// </summary>
    public string ContentType
    {
        get { return _contentType; }
        set { _contentType = value; }
    }

    /// <summary>
    /// HTTP status code associated with the result.
    /// </summary>
    public HttpStatusCode StatusCode
    {
        get { return _statusCode; }
        set { _statusCode = value; }
    }
}
}
摘自 winner2050的专栏