管理资源吧首页>>>教程>>>编程>>>ASP.NET教程>>>

asp.net 抓取网页源码三种实现方法

　　方法1（比较推荐）

代码如下：

　　/// <summary>
　　/// Downloads the source of a web page with <see cref="HttpWebRequest"/>.
　　/// The body is read through a <see cref="StreamReader"/> that honours a
　　/// byte-order mark when the response has one; without a BOM the text is
　　/// decoded with <see cref="Encoding.Default"/>.
　　/// </summary>
　　/// <param name="url">Address of the page to download.</param>
　　/// <returns>The response body decoded to a string.</returns>
　　public static string GetHtmlSource2(string url)
　　{
　　    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
　　    request.Accept = "*/*"; // accept any content type
　　    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)"; // present ourselves as Internet Explorer
　　    request.AllowAutoRedirect = true; // follow HTTP redirects
　　    // To send cookies, assign: request.CookieContainer = new CookieContainer();
　　    request.Referer = url; // use the page itself as the referrer

　　    // Dispose response, stream and reader even when reading throws, so the
　　    // underlying connection is always released.
　　    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
　　    using (Stream stream = response.GetResponseStream())
　　    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
　　    {
　　        return reader.ReadToEnd();
　　    }
　　}

　　方法2

代码如下：

　　using System;

　　using System.Collections.Generic;

　　using System.Linq;

　　using System.Web;

　　using System.IO;

　　using System.Text;

　　using System.Net;

　　namespace MySql
　　{
　　    /// <summary>
　　    /// Helper for downloading the text content of a URL.
　　    /// </summary>
　　    public class GetHttpData
　　    {
　　        /// <summary>
　　        /// Requests <paramref name="Url"/> and returns the response body
　　        /// decoded as UTF-8.
　　        /// </summary>
　　        /// <param name="Url">Address of the resource to download.</param>
　　        /// <returns>
　　        /// The response body, or <c>null</c> when the request itself fails.
　　        /// </returns>
　　        public static string GetHttpData2(string Url)
　　        {
　　            WebRequest oWebRqst = WebRequest.Create(Url);
　　            oWebRqst.Timeout = 50000; // milliseconds

　　            WebResponse oWebRps;
　　            try
　　            {
　　                oWebRps = oWebRqst.GetResponse();
　　            }
　　            catch (Exception)
　　            {
　　                // Best effort, as in the original: a failed request is
　　                // reported to the caller as a null result.
　　                return null;
　　            }

　　            // The original returned from inside a finally block, which C#
　　            // rejects (CS0157); read and return here instead, disposing the
　　            // response and reader even if reading throws.
　　            using (oWebRps)
　　            using (StreamReader oStreamRd = new StreamReader(oWebRps.GetResponseStream(), Encoding.UTF8))
　　            {
　　                return oStreamRd.ReadToEnd();
　　            }
　　        }
　　    }
　　}

　　方法3

代码如下：

　　/// <summary>
　　/// Downloads a page with <see cref="WebClient"/> and decodes it to text.
　　/// </summary>
　　/// <param name="url">Address of the page to download.</param>
　　/// <param name="charSets">
　　/// Optional: exactly one encoding name (for example "gb2312") to decode the
　　/// page with. When omitted, null, or an empty string, the encoding is taken
　　/// from the page's <c>&lt;meta ... charset=...&gt;</c> declaration.
　　/// Any other number of arguments is ignored.
　　/// </param>
　　/// <returns>
　　/// The decoded page. UTF-8 is used when no charset is given or detected, or
　　/// when the charset name is not a known encoding. An empty string is
　　/// returned when the download fails.
　　/// </returns>
　　public static string getHtml(string url, params string[] charSets)
　　{
　　    try
　　    {
　　        // "params" still lets a caller pass a null array; treat it as "no charset".
　　        string charSet = null;
　　        if (charSets != null && charSets.Length == 1)
　　        {
　　            charSet = charSets[0];
　　        }

　　        // Some pages need extra headers (for example cookies) before they can
　　        // be downloaded: client.Headers.Add("Cookie", cookie);
　　        // For servers that need a user name and password, assign a
　　        // NetworkCredential to client.Credentials instead of the defaults.
　　        byte[] myDataBuffer;
　　        using (WebClient myWebClient = new WebClient())
　　        {
　　            myWebClient.Credentials = CredentialCache.DefaultCredentials;
　　            myDataBuffer = myWebClient.DownloadData(url);
　　        }

　　        if (string.IsNullOrEmpty(charSet))
　　        {
　　            // Decode once with the default encoding only to read the charset
　　            // name (ASCII) from the <meta> tag. The optional quote is consumed
　　            // outside the capture so both content="...; charset=utf-8" and
　　            // charset="utf-8" yield a clean name.
　　            string probe = Encoding.Default.GetString(myDataBuffer);
　　            Match charSetMatch = Regex.Match(
　　                probe,
　　                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
　　                RegexOptions.IgnoreCase);
　　            if (charSetMatch.Success)
　　            {
　　                charSet = charSetMatch.Groups[1].Value;
　　            }
　　        }

　　        Encoding encoding = Encoding.UTF8;
　　        if (!string.IsNullOrEmpty(charSet))
　　        {
　　            try
　　            {
　　                encoding = Encoding.GetEncoding(charSet);
　　            }
　　            catch (ArgumentException)
　　            {
　　                // Unknown or unsupported charset name: keep the UTF-8 fallback.
　　            }
　　        }

　　        return encoding.GetString(myDataBuffer);
　　    }
　　    catch (Exception)
　　    {
　　        // Deliberate best effort: any download failure yields an empty page.
　　        return "";
　　    }
　　}

教程首页更多教程