C#编程之采集网页类
摘要:本文将带你了解C#编程之采集网页类,希望本文对大家学C#/.Net有所帮助。
下面是一个采集网页内容的C#类:可以像火车头采集器一样,指定目标字符串前面的字符串和后面的字符串,自动截取出位于两者之间的目标字符串,实测效果不错:
using System.IO;using System.Net;using System.Text;using System;using System.Text.RegularExpressions;using System.Collections.Generic;
namespace testtaobao {
/// <summary>
/// Small web-scraping helper: downloads a page, normalizes its source and extracts
/// the text found between a leading and a trailing delimiter string.
/// </summary>
public class caiji
{
    #region Fetch page content
    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and decodes the response body
    /// with the named text encoding.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="code">Name of the text encoding of the page, for example "GB2312".</param>
    /// <returns>The decoded response body.</returns>
    /// <remarks>
    /// Any exception raised while creating the request, resolving the encoding or reading
    /// the response propagates to the caller with its original stack trace.
    /// </remarks>
    public string gethtml(string url, string code)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // 30 second timeout (the property is expressed in milliseconds).
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Resolve the encoding before sending anything so that an invalid name
        // fails without opening a connection.
        Encoding encoding = Encoding.GetEncoding(code);

        // Dispose the response, its stream and the reader even when reading fails,
        // otherwise the underlying connection is not released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    #endregion

    #region Strip line breaks
    /// <summary>
    /// Removes double-quote characters and all line breaks from page source so that
    /// a later pattern match is not interrupted by them.
    /// </summary>
    /// <param name="HtmlCode">HTML source; may be null or empty.</param>
    /// <returns>
    /// The input without '"', '\r' and '\n' characters, or an empty string when the
    /// input is null or empty.
    /// </returns>
    public string ReplaceEnter(string HtmlCode)
    {
        if (string.IsNullOrEmpty(HtmlCode))
        {
            return "";
        }

        // Removing '\r' and '\n' individually covers "\r\n" as well as bare
        // "\n" or "\r" line endings.
        return HtmlCode.Replace("\"", "").Replace("\r", "").Replace("\n", "");
    }
    #endregion

    #region Run a regular expression
    /// <summary>
    /// Runs a regular expression (with <see cref="RegexOptions.Multiline"/>) over the
    /// given text and returns every match.
    /// </summary>
    /// <param name="RegexString">Regular expression pattern.</param>
    /// <param name="RemoteStr">Text to search, typically HTML source.</param>
    /// <returns>All matches of the pattern in <paramref name="RemoteStr"/>.</returns>
    public MatchCollection GetRegValue(string RegexString, string RemoteStr)
    {
        Regex regex = new Regex(RegexString, RegexOptions.Multiline);
        return regex.Matches(RemoteStr);
    }
    #endregion

    #region Extract target strings
    /// <summary>
    /// Returns every piece of text in <paramref name="scstr"/> that sits between
    /// <paramref name="fstr"/> and the nearest following <paramref name="estr"/>.
    /// </summary>
    /// <param name="fstr">Literal text that precedes the target string.</param>
    /// <param name="estr">Literal text that follows the target string.</param>
    /// <param name="scstr">Source text to search.</param>
    /// <returns>The extracted strings in order of appearance; empty when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public List<string> getstr(string fstr, string estr, string scstr)
    {
        // The delimiters are literal text, so escape them: characters such as
        // '(' or '?' must not be interpreted as regex syntax.
        string pattern = Regex.Escape(fstr) + "(.*?)" + Regex.Escape(estr);

        List<string> results = new List<string>();
        foreach (Match match in GetRegValue(pattern, scstr))
        {
            // Take the captured middle part directly instead of deleting the
            // delimiters from the match, which would also remove any copies of
            // them that occur inside the target text.
            results.Add(match.Groups[1].Value);
        }
        return results;
    }
    #endregion
}
}
以上就介绍了C#.NET的相关知识,希望对C#.NET有兴趣的朋友有所帮助。了解更多内容,请关注职坐标编程语言C#.NET频道!
看完这篇文章有何感觉?
已经有0人表态,0%的人喜欢
您输入的评论内容中包含违禁敏感词
我知道了