ASP.NET 抓取网页源码的三种实现方法

  方法1:使用 HttpWebRequest(比较推荐)

  

代码如下:

  /// <summary>
  /// Downloads the page at <paramref name="url"/> with <see cref="HttpWebRequest"/>
  /// and returns its source as text.
  /// The body is decoded with the system default encoding unless it starts with a
  /// byte order mark, in which case the encoding indicated by the BOM is used.
  /// </summary>
  /// <param name="url">Address of the page to download.</param>
  /// <returns>The source text of the page.</returns>
  /// <exception cref="WebException">The request failed or timed out.</exception>
  public static string GetHtmlSource2(string url)
  {
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.Accept = "*/*"; // accept any content type
      // Identify as Internet Explorer.
      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)";
      request.AllowAutoRedirect = true; // follow HTTP redirects (e.g. 302)
      // Assign request.CookieContainer here if the target site requires cookies.
      request.Referer = url; // report the page itself as the referrer

      // Dispose response, stream and reader on every path so the underlying
      // connection is released even when reading throws.
      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      using (Stream stream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(stream, Encoding.Default, true))
      {
          // The 'true' argument makes BOM detection explicit: a BOM overrides Encoding.Default.
          return reader.ReadToEnd();
      }
  }

  方法2:使用 WebRequest,并按 UTF-8 解码

  

代码如下:

  using System;

  using System.Collections.Generic;

  using System.Linq;

  using System.Web;

  using System.IO;

  using System.Text;

  using System.Net;

  namespace MySql
  {
      /// <summary>
      /// Helper for downloading the text of a web resource.
      /// </summary>
      public class GetHttpData
      {
          /// <summary>
          /// Requests <paramref name="Url"/> and returns the response body decoded as UTF-8.
          /// </summary>
          /// <param name="Url">Address of the resource to download.</param>
          /// <returns>
          /// The response body, or <c>null</c> when the request itself fails
          /// (for example on a timeout or an HTTP error status).
          /// </returns>
          public static string GetHttpData2(string Url)
          {
              WebRequest request = WebRequest.Create(Url);
              request.Timeout = 50000; // milliseconds

              WebResponse response;
              try
              {
                  response = request.GetResponse();
              }
              catch (Exception e)
              {
                  // Callers rely on null to signal a failed request; record the reason
                  // instead of discarding it so failures can be diagnosed.
                  System.Diagnostics.Trace.TraceWarning(
                      "GetHttpData2: request to {0} failed: {1}", Url, e.Message);
                  return null;
              }

              // Read outside the try block so that only request failures map to null,
              // and dispose the response and reader even if reading throws.
              using (response)
              using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
              {
                  return reader.ReadToEnd();
              }
          }
      }
  }

  方法3:使用 WebClient,并自动识别网页编码

  

代码如下:

  /// <summary>
  /// Downloads <paramref name="url"/> with <see cref="WebClient"/> and decodes it to text.
  /// </summary>
  /// <param name="url">Address of the page to download.</param>
  /// <param name="charSets">
  /// Optional: exactly one charset name (e.g. "gb2312") to decode the page with.
  /// When omitted, null, empty, or when more than one value is passed, the charset is
  /// taken from the page's <c>&lt;meta ... charset=...&gt;</c> declaration instead.
  /// </param>
  /// <returns>
  /// The decoded page text. UTF-8 is used when no charset is given or declared, or when
  /// the charset name is not recognised. Returns an empty string when the download fails.
  /// </returns>
  public static string getHtml(string url, params string[] charSets)
  {
      try
      {
          // 'params' arrays can legitimately be null (getHtml(url, null)), so guard before reading Length.
          string charSet = null;
          if (charSets != null && charSets.Length == 1)
          {
              charSet = charSets[0];
          }

          // Some pages need extra request headers (e.g. client.Headers.Add("Cookie", cookie))
          // or explicit credentials (client.Credentials = new NetworkCredential(user, password)).
          byte[] pageBytes;
          using (WebClient client = new WebClient())
          {
              // Authenticate with the credentials of the current security context.
              client.Credentials = CredentialCache.DefaultCredentials;
              pageBytes = client.DownloadData(url);
          }

          if (string.IsNullOrEmpty(charSet))
          {
              // The meta declaration is plain ASCII, so an ASCII decode is enough to find it.
              string sniffText = Encoding.ASCII.GetString(pageBytes);

              // Handles both <meta charset="utf-8"> and
              // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">;
              // the optional quote is consumed so it never ends up in the captured name.
              Match charSetMatch = Regex.Match(
                  sniffText,
                  "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:-]+)",
                  RegexOptions.IgnoreCase);
              if (charSetMatch.Success)
              {
                  charSet = charSetMatch.Groups[1].Value;
              }
          }

          Encoding encoding = Encoding.UTF8;
          if (!string.IsNullOrEmpty(charSet))
          {
              try
              {
                  encoding = Encoding.GetEncoding(charSet);
              }
              catch (ArgumentException)
              {
                  // Unknown or unsupported charset name: keep the UTF-8 fallback
                  // rather than discarding a page that was downloaded successfully.
              }
          }

          return encoding.GetString(pageBytes);
      }
      catch (Exception e)
      {
          // Callers rely on "" to signal failure; record the reason for diagnosis.
          System.Diagnostics.Trace.TraceWarning("getHtml: failed to fetch {0}: {1}", url, e.Message);
          return "";
      }
  }