This article presents a C# web-crawler class that uses regular expressions to extract the information in a web page (its title, links and plain text). It is shared here for your reference:
Class code:
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its title, hyperlinks and plain
/// text, all extracted from the HTML with regular expressions.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing had lost most member-access targets
/// (for example the request property names, the <c>RegexOptions</c> flags and
/// the names of the <c>Link</c> properties). They have been restored from the
/// surrounding code; <c>Link</c> is assumed to be
/// <c>System.Web.UI.MobileControls.Link</c> - confirm against the project.
/// Only the namespaces the code actually needs are imported.
/// </remarks>
public class WebPage
{
    #region Constants and shared state

    /// <summary>Options used by every pattern that the listing combined with two flags.</summary>
    private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    /// <summary>Responses that announce more than this many bytes (4 MiB) are skipped.</summary>
    private const int MaxPageBytes = 1 << 22;

    private static readonly Regex TitleRegex =
        new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", DefaultOptions);

    // NOTE(review): the two flags of the anchor pattern were lost in the listing;
    // IgnoreCase | Singleline is assumed so that attributes may span lines - confirm.
    private static readonly Regex AnchorRegex =
        new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex FrameRegex =
        new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);

    // NOTE(review): the listing shows a bare space where "&nbsp;" is assumed here
    // (the entity was most likely decoded when the article was rendered) - confirm.
    private static readonly Regex LinkTextNoiseRegex =
        new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);

    private static readonly Regex ScriptRegex =
        new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", DefaultOptions);

    private static readonly Regex StyleRegex =
        new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", DefaultOptions);

    private static readonly Regex SelectRegex =
        new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", DefaultOptions);

    private static readonly Regex AnchorBlockRegex =
        new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", DefaultOptions);

    // NOTE(review): "&nbsp;" assumed for the same reason as in LinkTextNoiseRegex.
    private static readonly Regex TagOrNbspRegex =
        new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);

    private static readonly Regex WhitespaceRegex =
        new Regex("(\\s)+", DefaultOptions);

    private static readonly Regex MetaCharsetRegex =
        new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);

    /// <summary>One cookie container per host, shared by all pages; guarded by a lock on itself.</summary>
    private static readonly Dictionary<string, CookieContainer> webcookies =
        new Dictionary<string, CookieContainer>();

    #endregion

    #region Private members

    private Uri m_uri;              // address of the page (the final address after redirects)
    private List<Link> m_links;     // links found on this page, filled lazily
    private string m_title;         // page title, filled lazily
    private string m_html;          // raw HTML
    private string m_outstr;        // plain text of the page, filled lazily
    private bool m_good;            // true only when the page was downloaded successfully
    private int m_pagesize;         // length of the HTML in characters

    #endregion

    #region Properties

    /// <summary>Absolute URL of this page, or an empty string when the URL could not be parsed.</summary>
    public string URL
    {
        get { return m_uri == null ? "" : m_uri.AbsoluteUri; }
    }

    /// <summary>Trimmed content of the page's <c>title</c> element, or an empty string when there is none.</summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>Raw HTML of the page (never null).</summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>All links found on this page.</summary>
    public List<Link> Links
    {
        get { return getLinks(); }
    }

    /// <summary>Plain text of the whole page with the markup removed.</summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(int.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>Length of the page's HTML in characters.</summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>All links on this page that point to the page's own host.</summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so its dots are matched literally, and both
            // http and https links count as in-site.
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), int.MaxValue);
        }
    }

    /// <summary>Whether the page was downloaded successfully.</summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>Host name of the page, or an empty string when the URL could not be parsed.</summary>
    public string Host
    {
        get { return m_uri == null ? "" : m_uri.Host; }
    }

    #endregion

    #region Link and text extraction

    /// <summary>
    /// Extracts the links from the HTML the first time it is called and
    /// returns the cached list afterwards.
    /// </summary>
    /// <returns>The links found in anchor and frame/iframe elements.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            // Group names are case-sensitive: the anchor pattern captures
            // "URL" and "title", the frame pattern captures "url" and no text.
            CollectLinks(AnchorRegex, "URL", "title");
            CollectLinks(FrameRegex, "url", null);
        }
        return m_links;
    }

    /// <summary>Adds one <c>Link</c> to <c>m_links</c> for every match of <paramref name="regex"/> in the HTML.</summary>
    /// <param name="regex">Pattern that locates the elements.</param>
    /// <param name="urlGroup">Name of the capture group holding the target address.</param>
    /// <param name="textGroup">Name of the capture group holding the link text, or null when there is none.</param>
    private void CollectLinks(Regex regex, string urlGroup, string textGroup)
    {
        for (Match match = regex.Match(m_html); match.Success; match = match.NextMatch())
        {
            try
            {
                // Relative addresses are resolved against the page's own URL.
                string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups[urlGroup].Value).AbsoluteUri);
                string text = textGroup == null
                    ? ""
                    : LinkTextNoiseRegex.Replace(match.Groups[textGroup].Value, "");

                Link link = new Link();
                link.Text = text;
                link.NavigateUrl = url;
                m_links.Add(link);
            }
            catch (Exception ex)
            {
                // Best effort: one malformed address must not abort the scan.
                Console.WriteLine(ex.Message);
            }
        }
    }

    /// <summary>
    /// Converts HTML to plain text and returns at most the first
    /// <paramref name="firstN"/> characters of it.
    /// </summary>
    /// <remarks>
    /// The plain text is computed once and cached in <c>m_outstr</c>; while the
    /// cache is non-empty, <paramref name="instr"/> and
    /// <paramref name="withLink"/> are ignored.
    /// </remarks>
    /// <param name="instr">HTML to convert.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">True to keep the text inside anchor elements, false to drop it.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagOrNbspRegex.Replace(text, "");
            text = WhitespaceRegex.Replace(text, " ");
            m_outstr = text;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>Returns the links whose URL or text matches <paramref name="pattern"/>, up to <paramref name="count"/> of them.</summary>
    /// <param name="pattern">Regular expression to test each link against.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchUrl">True to test the link's URL, false to test its text.</param>
    /// <returns>The matching links in page order.</returns>
    private List<Link> FilterLinks(string pattern, int count, bool matchUrl)
    {
        List<Link> matches = new List<Link>();
        Regex regex = new Regex(pattern, DefaultOptions); // built once, not once per link
        foreach (Link link in getLinks())
        {
            if (matches.Count >= count)
            {
                break;
            }
            string candidate = matchUrl ? link.NavigateUrl : link.Text;
            if (regex.IsMatch(candidate))
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    #endregion

    #region Public methods

    /// <summary>Returns the first <paramref name="firstN"/> characters of the page's plain text, link text included.</summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>Returns up to <paramref name="count"/> links whose URL matches <paramref name="pattern"/>.</summary>
    /// <param name="pattern">Regular expression the link URL must match.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>Returns up to <paramref name="count"/> links whose text matches <paramref name="pattern"/>.</summary>
    /// <param name="pattern">Regular expression the link text must match.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>Searches the page's plain text for <paramref name="pattern"/>.</summary>
    /// <param name="pattern">Regular expression with at least one capture group.</param>
    /// <returns>The value of the first capture group of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(int.MaxValue);
        }
        Match mc = new Regex(pattern, DefaultOptions).Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills the fields.
    /// Never throws: on any failure the page is left empty and
    /// <see cref="IsGood"/> is false.
    /// </summary>
    /// <param name="_url">Absolute URL of the page.</param>
    private void Init(string _url)
    {
        // Defaults come first so every property is safe to read even when the
        // URL is malformed or the download fails.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);

            // Known binary downloads are not worth requesting.
            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;
            rqst.CookieContainer = GetCookieContainer(m_uri.Host);

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxPageBytes)
                {
                    return;
                }

                using (Stream sm = rsps.GetResponseStream())
                {
                    // Some sites additionally need HttpUtility.HtmlDecode applied to the result.
                    m_html = ReadHtml(sm, contentType);
                }

                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: a crawler must survive bad pages, but the
            // failure is recorded instead of being silently ignored.
            m_good = false;
        }
    }

    /// <summary>Returns the cookie container shared by all pages of <paramref name="host"/>, creating it on first use.</summary>
    private static CookieContainer GetCookieContainer(string host)
    {
        lock (webcookies)
        {
            CookieContainer cc;
            if (!webcookies.TryGetValue(host, out cc))
            {
                cc = new CookieContainer();
                webcookies[host] = cc;
            }
            return cc;
        }
    }

    /// <summary>
    /// Reads the response body and decodes it with the charset named in the
    /// Content-Type header or, failing that, the one declared inside the HTML.
    /// </summary>
    /// <param name="sm">Response body.</param>
    /// <param name="contentType">Value of the Content-Type header.</param>
    /// <returns>The decoded HTML.</returns>
    private static string ReadHtml(Stream sm, string contentType)
    {
        byte[] raw = ReadAllBytes(sm);

        int ix = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (ix != -1)
        {
            string name = contentType.Substring(ix + "charset=".Length).Trim().Trim('"', '\'');
            return Decode(raw, GetEncodingOrDefault(name));
        }

        // No charset in the header: decode with the system default just to find
        // the charset declared in the markup, then decode the raw bytes again.
        string probe = Decode(raw, Encoding.Default);
        string declared = MetaCharsetRegex.Match(probe).Groups["cding"].Value;
        string html = Decode(raw, GetEncodingOrDefault(declared));

        // Heuristic kept from the original: a text full of '?' means the
        // declared charset was wrong, so fall back to the default decoding.
        return html.Split('?').Length > 100 ? probe : html;
    }

    /// <summary>Looks up an encoding by name, falling back to the system default when the name is empty, unknown or unsupported.</summary>
    private static Encoding GetEncodingOrDefault(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>Reads <paramref name="input"/> to its end and returns the bytes.</summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes <paramref name="raw"/> with <paramref name="encoding"/>; a byte-order mark, if present, takes precedence.</summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately. Check
    /// <see cref="IsGood"/> afterwards; the constructor itself does not throw
    /// on download failures.
    /// </summary>
    /// <param name="_url">Absolute URL of the page; percent-escapes are decoded before use.</param>
    public WebPage(string _url)
    {
        // NOTE(review): the decoding call was lost in the listing;
        // Uri.UnescapeDataString is assumed - confirm.
        if (_url != null)
        {
            _url = Uri.UnescapeDataString(_url);
        }
        Init(_url);
    }

    #endregion
}
Called:
WebPage webInfo = new WebPage("http://www.example.com/"); // replace with the address of the page to crawl
string text = webInfo.Context; // page content with all HTML tags removed
string html = webInfo.M_html;  // page content including the HTML tags
PS: Here are two very convenient regular expression tools for your reference:
JavaScript regular expression online testing tool:
http://tools.jb51.net/regex/javascript
Regular expression online generation tool:
http://tools.jb51.net/regex/create_reg
For more C#-related content, see the following topics on this site: "Summary of the usage of C# regular expressions", "Summary of C# coding operation skills", "Summary of XML file operation skills in C#", "Tutorial on the usage of common C# controls", "Summary of WinForm control usage", "C# data structure and algorithm tutorial", "Introduction to C# object-oriented programming tutorial" and "Summary of thread usage techniques for C# programming".
I hope this article will be helpful to everyone's C# programming.