SoFunction
Updated on 2025-03-06

C# example: a web-scraping class that uses regular expressions to extract all the information in a web page

This article presents a C# web-scraping class that uses regular expressions to extract all the information in a web page (title, links, plain text). It is shared here for your reference:

Class code:

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;
/// <summary>
/// Downloads a single web page and exposes its title, links, and plain text,
/// all extracted from the raw HTML with regular expressions.
/// </summary>
/// <remarks>
/// <c>Link</c> is not declared in this file; it is assumed to be
/// <c>System.Web.UI.MobileControls.Link</c> (properties <c>Text</c> and
/// <c>NavigateUrl</c>) - confirm against the project's references.
/// </remarks>
public class WebPage
{
    #region Private members
    // Responses larger than this (4 MiB) are rejected.
    private const int MaxPageBytes = 1 << 22;

    // Options applied to every pattern that is not given more specific ones.
    private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // All patterns are built once instead of on every call / loop iteration.
    private static readonly Regex TitleRegex =
        new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", DefaultOptions);

    // Singleline lets ".*?" run across line breaks inside an <a ...> tag.
    private static readonly Regex AnchorRegex =
        new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // The URL group is named "URL" (not "url") so that both link patterns can be
    // read with the same, case-sensitive, group name.
    private static readonly Regex FrameRegex =
        new Regex("<[i]*frame[^><]+src=(\"|')?(?<URL>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex LinkTextNoiseRegex =
        new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);
    private static readonly Regex ScriptRegex =
        new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", DefaultOptions);
    private static readonly Regex StyleRegex =
        new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", DefaultOptions);
    private static readonly Regex SelectRegex =
        new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", DefaultOptions);
    private static readonly Regex AnchorBlockRegex =
        new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", DefaultOptions);
    private static readonly Regex TagOrNbspRegex =
        new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);
    private static readonly Regex WhitespaceRunRegex =
        new Regex("(\\s)+", DefaultOptions);
    private static readonly Regex MetaCharsetRegex =
        new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);

    private Uri m_uri;           // URL of the page (replaced by the final URL after redirects)
    private List<Link> m_links;  // Links found on this page (filled lazily)
    private string m_title;      // Page title (filled lazily)
    private string m_html;       // Raw HTML
    private string m_outstr;     // Plain text of the page (filled lazily)
    private bool m_good;         // Whether the page could be downloaded as text
    private int m_pagesize;      // Length of the HTML, in characters

    // One cookie container per host, shared by every WebPage instance.
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
    #endregion

    #region Properties
    /// <summary>
    /// Absolute URL of this page. Read-only.
    /// </summary>
    public string URL
    {
        get
        {
            return m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Title of this page (content of the first &lt;title&gt; element, trimmed). Read-only.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of this page; never null. Read-only.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// All links (anchors and frames) found on this page. Read-only.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getLinks();
            }
            return m_links;
        }
    }

    /// <summary>
    /// Complete plain text of this page, with markup removed. Read-only.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                // Called only to fill the m_outstr cache; the truncated return value is ignored.
                getContext(int.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of this page: the length of its HTML in characters.
    /// </summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// All links on this page that point to the page's own host (http or https).
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            // The host is escaped so that its dots are matched literally.
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), int.MaxValue);
        }
    }

    /// <summary>
    /// Whether this page was downloaded successfully and is usable.
    /// </summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host name of the site this page belongs to.
    /// </summary>
    public string Host
    {
        get
        {
            return m_uri.Host;
        }
    }
    #endregion

    /// <summary>
    /// Extracts link information from the HTML: first &lt;a href="..."&gt; anchors,
    /// then frame/iframe sources. The result is cached in <c>m_links</c>.
    /// </summary>
    /// <returns>The list of links found on this page.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] patterns = new Regex[] { AnchorRegex, FrameRegex };
            foreach (Regex pattern in patterns)
            {
                for (Match match = pattern.Match(m_html); match.Success; match = match.NextMatch())
                {
                    // Resolve the (possibly relative) target against the page URL;
                    // targets that do not form a valid URI are skipped.
                    Uri resolved;
                    if (!Uri.TryCreate(m_uri, match.Groups["URL"].Value, out resolved))
                    {
                        continue;
                    }

                    // Only anchors carry a caption; frames get an empty text.
                    string text = "";
                    if (pattern == AnchorRegex)
                    {
                        text = LinkTextNoiseRegex.Replace(match.Groups["title"].Value, "");
                    }

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = HttpUtility.UrlDecode(resolved.AbsoluteUri);
                    m_links.Add(link);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts plain text from a piece of HTML and returns its first characters.
    /// The full plain text is cached in <c>m_outstr</c>; once the cache is non-empty,
    /// <paramref name="instr"/> and <paramref name="withLink"/> are no longer consulted.
    /// </summary>
    /// <param name="instr">HTML code.</param>
    /// <param name="firstN">Maximum number of characters to return, counted from the start.</param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr;
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagOrNbspRegex.Replace(text, "");
            // Collapse every run of whitespace into a single space.
            m_outstr = WhitespaceRunRegex.Replace(text, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches a pattern.
    /// Shared implementation of the two public getSpecialLinksBy* methods.
    /// </summary>
    /// <param name="pattern">Regular expression to match.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchUrl">True to test the link URL, false to test the link text.</param>
    /// <returns>The matching links, in page order.</returns>
    private List<Link> filterLinks(string pattern, int count, bool matchUrl)
    {
        if (m_links.Count == 0)
        {
            getLinks();
        }

        List<Link> matches = new List<Link>();
        if (count <= 0 || m_links.Count == 0)
        {
            return matches;
        }

        // Compile the caller's pattern once rather than once per link.
        Regex filter = new Regex(pattern, DefaultOptions);
        foreach (Link link in m_links)
        {
            if (matches.Count >= count)
            {
                break;
            }
            string candidate = matchUrl ? link.NavigateUrl : link.Text;
            if (filter.IsMatch(candidate))
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    #region Public methods
    /// <summary>
    /// Extracts the first characters of this page's plain text, link text included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns links from this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression the link URL must match.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links from this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression the link text must match.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the plain text of this page for a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the text returned.</param>
    /// <returns>The value of capture group 1 of the first match, or an empty string if there is no match.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(int.MaxValue);
        }
        Match mc = new Regex(pattern, DefaultOptions).Match(m_outstr);
        if (mc.Success)
        {
            return mc.Groups[1].Value;
        }
        return string.Empty;
    }
    #endregion

    #region Construction
    /// <summary>
    /// Looks up an encoding by name, falling back to <see cref="Encoding.Default"/>
    /// when the name is unknown or unsupported.
    /// </summary>
    /// <param name="name">Charset name; surrounding whitespace and quotes are ignored.</param>
    /// <returns>The named encoding, or the default encoding.</returns>
    private static Encoding getEncodingOrDefault(string name)
    {
        try
        {
            return Encoding.GetEncoding(name.Trim().Trim('"', '\''));
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills the instance state.
    /// Never throws: on any failure the page is simply marked as not good.
    /// </summary>
    /// <param name="_url">Absolute URL of the page.</param>
    private void Init(string _url)
    {
        // Set up the non-failing state first so the lazy properties work even
        // when the URL turns out to be malformed.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_good = true;
        try
        {
            m_uri = new Uri(_url);
            // Known binary downloads are not worth requesting.
            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            // Reuse one cookie container per host across all pages.
            lock (webcookies)
            {
                CookieContainer cc;
                if (!webcookies.TryGetValue(m_uri.Host, out cc))
                {
                    cc = new CookieContainer();
                    webcookies[m_uri.Host] = cc;
                }
                rqst.CookieContainer = cc;
            }

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contenttype = rsps.ContentType ?? "";
                // Only text responses of at most MaxPageBytes are accepted.
                if (!contenttype.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > MaxPageBytes)
                {
                    m_good = false;
                    return;
                }

                Encoding cding = Encoding.Default;
                using (Stream sm = rsps.GetResponseStream())
                {
                    int ix = contenttype.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
                    if (ix != -1)
                    {
                        // The charset is announced in the Content-Type header.
                        string charset = contenttype.Substring(ix + "charset=".Length);
                        int end = charset.IndexOf(';');
                        if (end >= 0)
                        {
                            charset = charset.Substring(0, end);
                        }
                        cding = getEncodingOrDefault(charset);
                        // Depending on the site, the HTML may additionally need HttpUtility.HtmlDecode.
                        using (StreamReader reader = new StreamReader(sm, cding))
                        {
                            m_html = reader.ReadToEnd();
                        }
                    }
                    else
                    {
                        // No charset in the header: read with the default encoding, look for a
                        // charset declaration inside the HTML, then re-decode the same bytes with it.
                        using (StreamReader reader = new StreamReader(sm, cding))
                        {
                            m_html = reader.ReadToEnd();
                        }
                        string strcding = MetaCharsetRegex.Match(m_html).Groups["cding"].Value;
                        cding = getEncodingOrDefault(strcding);
                        byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
                        m_html = cding.GetString(bytes);
                        // Presumably many '?' mean the re-decoding produced garbage;
                        // in that case fall back to the default encoding.
                        if (m_html.Split('?').Length > 100)
                        {
                            m_html = Encoding.Default.GetString(bytes);
                        }
                    }
                }

                m_pagesize = m_html.Length;
                // Remember the final URL in case the request was redirected.
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: the constructor must not throw. A failed
            // download is reported through IsGood instead of being hidden.
            m_good = false;
        }
    }

    /// <summary>
    /// Creates a page object and immediately downloads the page.
    /// Check <see cref="IsGood"/> afterwards to see whether the download succeeded.
    /// </summary>
    /// <param name="_url">Absolute URL of the page; percent-escapes are decoded first.</param>
    public WebPage(string _url)
    {
        if (_url != null)
        {
            try
            {
                _url = Uri.UnescapeDataString(_url);
            }
            catch (UriFormatException)
            {
                // Keep the URL as given; Init decides whether it is usable.
            }
        }
        Init(_url);
    }
    #endregion
}

Usage:

// Download the page, then read its content through the properties.
WebPage webInfo = new WebPage("http://www.example.com/");
string text = webInfo.Context;  // All content, with the HTML tags removed
string html = webInfo.M_html;   // Content including the HTML tags

PS: Here are two very convenient regular expression tools for your reference:

JavaScript regular expression online testing tool:
http://tools.jb51.net/regex/javascript

Regular expression online generation tool:
http://tools.jb51.net/regex/create_reg

For more C#-related content, see these topics on this site: "Summary of C# Regular Expression Usage", "Summary of C# Coding Operation Skills", "Summary of XML File Operation Skills in C#", "Tutorial on the Usage of Common C# Controls", "Summary of WinForm Control Usage", "C# Data Structures and Algorithms Tutorial", "Introduction to C# Object-Oriented Programming", and "Summary of Thread Usage Techniques in C# Programming".

I hope this article will be helpful to everyone's C# programming.