SoFunction
Updated on 2025-03-01

C# Example of crawling website information using regular expressions

This article describes how to use regular expressions in C# to crawl website information. It is shared here for your reference, as follows:

Here we use crawling JD Mall product details as the example.

1. Create a program class

/// <summary>
/// Crawls the title, main picture and price of a JD.com product page
/// and prints them to the console.
/// </summary>
public class JdRobber
{
  // NOTE(review): as published, this pattern accepts only a path-like value such as
  // "/123456.html"; a host prefix appears to have been lost from the listing —
  // confirm the intended URL form before relying on it.
  private static readonly Regex ProductUrlRegex = new Regex(@"^/\d+\.html$");

  /// <summary>
  /// Determines whether the given URL has the form of a JD.com product link.
  /// </summary>
  /// <param name="url">The link to check; may be null or empty.</param>
  /// <returns>
  /// <c>true</c> if <paramref name="url"/> is non-empty and matches the product-link
  /// pattern; otherwise <c>false</c>.
  /// </returns>
  public bool ValidationUrl(string url)
  {
    return !string.IsNullOrEmpty(url) && ProductUrlRegex.IsMatch(url);
  }

  /// <summary>
  /// Crawls JD.com product information for the given link and writes the product
  /// name, picture and price to the console.
  /// </summary>
  /// <param name="url">The product link; nothing is done if it fails <see cref="ValidationUrl"/>.</param>
  public void GetInfo(string url)
  {
    if (!ValidationUrl(url))
    {
      return;
    }

    string htmlStr = WebHandler.GetHtmlStr(url, "Default");
    if (string.IsNullOrEmpty(htmlStr))
    {
      // Download failed or returned nothing; there is nothing to extract.
      return;
    }

    string pattern;          // Regular expression currently in use
    string picName = "";     // Picture
    decimal price = 0;       // Price

    // Extract the product key ID (the numeric part of the link).
    pattern = @"/(?<Object>\d+)\.html";
    string sourceWebID = WebHandler.GetRegexText(url, pattern);

    // Extract the title: the first <h1> following the element with id="name".
    pattern = @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>";
    string title = WebHandler.GetRegexText(htmlStr, pattern);

    // Extract the picture: cut out the "spec-n1" block first, then take the <img> src inside it.
    // IndexOf returns -1 when nothing is found and 0 is a valid position, hence ">= 0".
    int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
    int end = begin >= 0 ? htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal) : -1;
    if (begin >= 0 && end > begin)
    {
      string subPicHtml = htmlStr.Substring(begin, end - begin);
      pattern = @"<img.*src=\""(?<Object>.*?)\"".*/>";
      picName = WebHandler.GetRegexText(subPicHtml, pattern);
    }

    // Extract the price from the separate price endpoint, keyed by the product ID.
    if (!string.IsNullOrEmpty(sourceWebID))
    {
      // NOTE(review): the host in this URL looks truncated ("http://p./...") in the
      // published listing; it is kept verbatim here — confirm the real endpoint.
      string priceUrl = @"http://p./prices/get?skuid=J_" + sourceWebID + "&type=1";
      string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
      pattern = @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""";
      price = WebHandler.GetValidPrice(WebHandler.GetRegexText(priceJson, pattern));
    }

    Console.WriteLine("Product Name:{0}", title);
    Console.WriteLine("picture:{0}", picName);
    Console.WriteLine("price:{0}", price);
  }
}

2. Create a public method class

/// <summary>
/// Public helper methods for downloading pages and extracting values with regular expressions.
/// </summary>
public class WebHandler
{
  /// <summary>
  /// Gets the HTML code of a web page.
  /// </summary>
  /// <param name="url">Link address; null or empty yields an empty result.</param>
  /// <param name="encoding">Encoding type: "UTF8" selects UTF-8; any other value uses <c>Encoding.Default</c>.</param>
  /// <returns>The response body as text, or an empty string if the URL is empty or the request fails.</returns>
  public static string GetHtmlStr(string url, string encoding)
  {
    if (string.IsNullOrEmpty(url))
    {
      return "";
    }

    // NOTE(review): the fallback encoding was lost from the published listing;
    // Encoding.Default is assumed from the "Default" branch name — confirm.
    Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

    try
    {
      WebRequest request = WebRequest.Create(url);
      // "using" guarantees the response, stream and reader are released even if reading throws.
      using (WebResponse response = request.GetResponse())
      using (Stream datastream = response.GetResponseStream())
      using (StreamReader reader = new StreamReader(datastream, ec))
      {
        return reader.ReadToEnd();
      }
    }
    catch (Exception)
    {
      // Deliberately best-effort: callers treat an empty string as "page unavailable",
      // so malformed URLs and network failures are reported as no content.
      return "";
    }
  }

  /// <summary>
  /// Gets the text captured by the named group "Object" of a regular expression.
  /// </summary>
  /// <param name="input">Text to search.</param>
  /// <param name="pattern">Expression containing a named group <c>Object</c>.</param>
  /// <returns>
  /// The value of the "Object" group from the first match, or an empty string if
  /// either argument is empty or nothing matches.
  /// </returns>
  public static string GetRegexText(string input, string pattern)
  {
    if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(pattern))
    {
      return "";
    }

    // NOTE(review): the RegexOptions argument was lost from the published listing;
    // IgnoreCase is assumed here — confirm.
    Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups["Object"].Value : "";
  }

  /// <summary>
  /// Returns a valid price parsed from text.
  /// </summary>
  /// <param name="strPrice">Price text: digits with an optional fraction of one or two digits.</param>
  /// <returns>The parsed price, or 0 if the text is empty or not in the expected format.</returns>
  public static decimal GetValidPrice(string strPrice)
  {
    if (string.IsNullOrEmpty(strPrice) || !Regex.IsMatch(strPrice, @"^\d+(\.\d{1,2})?$"))
    {
      return 0;
    }

    // Parse with the invariant culture: the accepted format always uses '.' as the
    // decimal separator, which a current-culture parse could read as a group separator.
    decimal price;
    bool parsed = decimal.TryParse(
      strPrice,
      System.Globalization.NumberStyles.Number,
      System.Globalization.CultureInfo.InvariantCulture,
      out price);
    return parsed ? price : 0;
  }
}

PS: Here are two very convenient regular expression tools for your reference:

JavaScript regular expression online testing tool:
http://tools./regex/javascript

Regular expression online generation tool:
http://tools./regex/create_reg

For more information about C#, please see the following special topics on this site: "Summary of the usage of C# regular expressions", "Summary of C# coding operation skills", "Summary of XML file operation skills in C#", "Tutorial on the usage of common C# controls", "Summary of WinForm control usage", "C# data structure and algorithm tutorial", "Introduction to C# object-oriented programming tutorial", and "Summary of thread usage techniques for C# programming".

I hope this article will be helpful to everyone's C# programming.