SoFunction
Updated on 2025-03-06

An HTML parsing class written in C# (with a syntax similar to XElement)

Function:

1. Easily obtain the HTML elements related to a given element (same level, children, parent).
2. Results can be filtered by tag name and by attributes.
3. Everything is returned as a strongly typed List<HtmlInfo>, so no casting or conversion is needed.

 
Anyone who has used XElement knows how convenient it is for parsing XML, but it cannot cope with the many loosely formatted variants of real-world HTML.

So I wrote this XHtmlElement class, which works much like XElement.

usage:

// Usage sample for XHtmlElement. Intended to run inside a method of an ASP.NET page.
// NOTE(review): the call that wrapped "~/file/" and the file name after the folder were
// lost in this copy of the article; Server.MapPath is assumed because of the "~/"
// app-relative path — confirm, and append the real file name.
string filePath = Server.MapPath("~/file/");
// Get HTML code
// NOTE(review): File.ReadAllText is assumed; only the argument survived in this copy.
string mailBody = System.IO.File.ReadAllText(filePath);

XHtmlElement xh = new XHtmlElement(mailBody);

// Get the <a> elements directly under <body> that carry class="icon"
var link = xh.Descendants("body")
             .ChildDescendants("a")
             .Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon"))
             .ToList();

// Get the <a> elements that have an href attribute
var links = xh.Descendants("a")
              .Where(c => c.Attributes.Any(a => a.Key == "href"))
              .ToList();
foreach (var r in links)
{
  // Output href
  // NOTE(review): the output call was lost in this copy; Response.Write is assumed.
  Response.Write(r.Attributes.Single(c => c.Key == "href").Value);
}

// Get the first img
var img = xh.Descendants("img");

// Get the nearest p element together with the other p elements at the same level
var ps = xh.Descendants("p");

Code:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
// NOTE(review): this listing originally had six using directives, but five of the
// namespaces were lost in extraction. The directives above are the ones the code
// below demonstrably needs (List/Dictionary, LINQ, StringBuilder, Regex); the sixth
// could not be recovered.

namespace SyntacticSugar
{
  /// <summary>
  /// Regex-based HTML parsing class with an XElement-like query style.
  /// Founding date: 2015-4-23. Author: sunkaixuan (qq: 610262374).
  /// Suggestions on naming, grammar and anything else are welcome.
  /// </summary>
  /// <remarks>
  /// NOTE(review): in the reviewed copy of this file the RegexOptions arguments and the
  /// receivers of most member accesses were not legible. The options used below are the
  /// ones each pattern needs in order to work on multi-line, mixed-case markup
  /// (Singleline wherever "." has to cross line breaks, IgnoreCase for tag names).
  /// Confirm them against the original listing.
  /// </remarks>
  public class XHtmlElement
  {
    // Message raised when no element can be cut out of the remaining markup.
    private const string UnparsableMessage = "SORRY, your HTML format cannot be parsed!!!";

    private readonly string _html;

    /// <summary>
    /// Wraps the HTML text that the query methods will parse.
    /// </summary>
    /// <param name="html">HTML source; a null value is only rejected when a query runs.</param>
    public XHtmlElement(string html)
    {
      _html = html;
    }

    /// <summary>
    /// Gets the nearest group of same-level elements that match the name: the top-level
    /// elements first and, if none of them match, the first matching sibling group found
    /// by searching the children depth-first.
    /// </summary>
    /// <param name="elementName">Tag name to match (case-insensitive); null matches every element.</param>
    /// <returns>The matching elements; an empty list (never null) when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The instance was created with null html.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
      if (_html == null)
      {
        // Two-argument overload: the one-argument overload treats its string as the
        // parameter name, not as the message.
        throw new ArgumentNullException("html", "html can't be empty!");
      }

      var allList = RootDescendants(_html);
      var reval = allList.Where(c => IsNameMatch(c, elementName)).ToList();
      if (reval.Count == 0)
      {
        reval = GetDescendantsSource(allList, elementName);
      }

      // Callers chain LINQ operators on the result, so never hand back null.
      return reval ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Gets the first-level elements of the given markup.
    /// </summary>
    /// <param name="html">Markup to split; null means the html passed to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per top-level element, in the order they were extracted.</returns>
    /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor html are null.</exception>
    /// <exception cref="FormatException">A tag was detected but no element could be extracted for it.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
      /*
        * Business logic:
        * 1. Take the first html tag and look for its closing tag; every nested tag of the
        *    same name met on the way requires one more closing tag.
        * 2. Once the first element is cut out, repeat step 1 on the rest for element 2..N.
        */
      if (html == null)
      {
        html = _html;
      }
      if (html == null)
      {
        throw new ArgumentNullException("html", "html can't be empty!");
      }

      var eleList = new List<string>();
      GetElementsStringList(html, eleList);

      var reval = new List<HtmlInfo>(eleList.Count);
      foreach (var r in eleList)
      {
        var data = new HtmlInfo();
        // NOTE(review): the two property names assigned from r and html were lost in the
        // reviewed copy; OldFullHtml/SameLeveHtml are the only HtmlInfo members that fit.
        data.OldFullHtml = r;
        data.SameLeveHtml = html;
        data.TagName = Regex.Match(r, @"(?<=\s{1}|\<)[a-z,A-Z]+(?=\>|\s)", RegexOptions.IgnoreCase).Value;
        // Greedy match from the first '>' to the last '<': everything between the
        // opening and the closing tag.
        data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
        data.Attributes = ParseAttributes(Regex.Match(r, "<.+?>").Value);
        reval.Add(data);
      }
      return reval;
    }

    #region private
    /// <summary>Case-insensitive tag-name test; a null name matches everything.</summary>
    private static bool IsNameMatch(HtmlInfo info, string elementName)
    {
      return elementName == null
        || string.Equals(info.TagName, elementName, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Reads the name="value" pairs out of an opening tag.</summary>
    private static Dictionary<string, string> ParseAttributes(string eleBegin)
    {
      var attributes = new Dictionary<string, string>();
      foreach (Match m in Regex.Matches(eleBegin, @"[a-z,A-Z]+\="".+?"""))
      {
        // Split on the first '=' only, so values that themselves contain '='
        // (for example query strings in href) are kept whole.
        string[] pair = m.Value.Split(new[] { '=' }, 2);
        string key = pair[0];
        // A repeated attribute name keeps its first value instead of throwing.
        if (!attributes.ContainsKey(key))
        {
          attributes.Add(key, pair[1].TrimEnd('"').TrimStart('"'));
        }
      }
      return attributes;
    }

    /// <summary>
    /// Depth-first search for the first sibling group below <paramref name="allList"/>
    /// that contains an element matching <paramref name="elementName"/>.
    /// </summary>
    /// <returns>The matching elements of that group, or null when none is found.</returns>
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
      foreach (var r in allList)
      {
        // Elements without nested markup cannot contain further elements.
        if (r.InnerHtml == null || !r.InnerHtml.Contains("<"))
        {
          continue;
        }

        // Parse the children once and reuse them for both the filter and the recursion.
        var children = RootDescendants(r.InnerHtml);
        var childList = children.Where(c => IsNameMatch(c, elementName)).ToList();
        if (childList.Count > 0)
        {
          return childList;
        }

        childList = GetDescendantsSource(children, elementName);
        if (childList != null && childList.Count > 0)
        {
          return childList;
        }
      }
      return null;
    }

    /// <summary>
    /// Cuts the top-level elements out of <paramref name="html"/> one by one and appends
    /// their markup to <paramref name="eleList"/>. Stops when no tag name is found or
    /// only whitespace is left.
    /// </summary>
    /// <exception cref="FormatException">A tag was detected but nothing could be extracted for it.</exception>
    private void GetElementsStringList(string html, List<string> eleList)
    {
      // Iterative on purpose: one loop pass per sibling keeps the stack flat no matter
      // how many elements the markup holds.
      while (true)
      {
        string tagName = Regex.Match(html, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;
        if (string.IsNullOrEmpty(tagName))
        {
          return;
        }

        string currentTagBeginReg = @"<\s{0,10}" + tagName + @".*?>"; // opening tag of the current element
        string currentTagEndReg = @"\<\/" + tagName + @"\>";          // closing tag of the current element
        string singleTagReg = @"<\s{0,10}" + tagName + "[^<].*?/>";   // self-closing form

        // Case 1 <a/>   Case 2 <a></a>   Case 3 <a> (unclosed)   Case 4 conditional comment ... endif
        string eleHtml;
        Match singleTag = Regex.Match(html, singleTagReg);
        if (singleTag.Success) // single label
        {
          eleHtml = singleTag.Value;
        }
        else if (!Regex.IsMatch(html, currentTagEndReg)) // no closing tag
        {
          if (Regex.IsMatch(html, @"\s{0,10}\<\!\-\-\[if"))
          {
            eleHtml = GetElementString(html, @"\s{0,10}\<\!\-\-\[if", @"\[endif\]\-\-\>", 1);
          }
          else
          {
            eleHtml = Regex.Match(html, currentTagBeginReg, RegexOptions.Singleline).Value;
          }
        }
        else
        {
          eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
        }

        // An empty extraction would make no progress (and string.Replace rejects an
        // empty search value), so report the markup as unparsable instead.
        if (string.IsNullOrEmpty(eleHtml))
        {
          throw new FormatException(UnparsableMessage);
        }

        eleList.Add(eleHtml);
        html = html.Replace(eleHtml, "");
        html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");
        if (Regex.IsMatch(html, @"^\s*$"))
        {
          return;
        }
      }
    }

    /// <summary>
    /// Returns the shortest span that starts at an opening tag and contains as many
    /// closing tags as opening tags, trying i, i+1, ... closing tags in turn.
    /// </summary>
    /// <returns>The balanced span, or an empty string when no candidate matches.</returns>
    private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
      while (true)
      {
        string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
        int beginCount = Regex.Matches(newHtml, currentTagBeginReg, RegexOptions.Singleline).Count;
        int endCount = Regex.Matches(newHtml, currentTagEndReg).Count;
        if (beginCount == endCount)
        {
          // Opening and closing tags are balanced (0 == 0 once the candidate no longer matches).
          return newHtml;
        }
        i++;
      }
    }

    /// <summary>Matches an opening tag followed by exactly <paramref name="i"/> closing tags.</summary>
    private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
      return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
    }
    #endregion
  }
  /// <summary>
  /// Query extensions for results of <see cref="XHtmlElement"/>. Every method works on
  /// the FIRST element of the sequence and returns an empty list when the sequence is
  /// null or empty.
  /// </summary>
  public static class XHtmlElementExtendsion
  {
    /// <summary>
    /// Gets the nearest same-level elements inside the first element of the sequence.
    /// </summary>
    /// <param name="htmlInfoList">Elements returned by an earlier query.</param>
    /// <param name="elementName">Tag name to match; null matches every element.</param>
    /// <returns>The matching elements; never null.</returns>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
      HtmlInfo first = FirstOrNull(htmlInfoList);
      if (first == null || first.InnerHtml == null)
      {
        return new List<HtmlInfo>();
      }

      XHtmlElement xhe = new XHtmlElement(first.InnerHtml);
      return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Gets the direct children of the first element of the sequence.
    /// </summary>
    /// <param name="htmlInfoList">Elements returned by an earlier query.</param>
    /// <param name="elementName">Tag name to match (case-insensitive, like XHtmlElement.Descendants); null matches every child.</param>
    /// <returns>The matching children; never null.</returns>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
      HtmlInfo first = FirstOrNull(htmlInfoList);
      if (first == null || first.InnerHtml == null)
      {
        return new List<HtmlInfo>();
      }

      string html = first.InnerHtml;
      XHtmlElement xhe = new XHtmlElement(html);
      return xhe.RootDescendants(html)
                .Where(c => elementName == null
                         || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
                .ToList();
    }

    /// <summary>
    /// Gets the parent level of the first element of the sequence.
    /// </summary>
    /// <param name="htmlInfoList">Elements returned by an earlier query.</param>
    /// <param name="fullHtml">The complete document the elements were parsed from.</param>
    /// <returns>The elements parsed from the enclosing markup; never null.</returns>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
      HtmlInfo first = FirstOrNull(htmlInfoList);
      // string.Replace rejects an empty search value, and there is nothing to locate anyway.
      if (first == null || string.IsNullOrEmpty(first.SameLeveHtml) || string.IsNullOrEmpty(fullHtml))
      {
        return new List<HtmlInfo>();
      }

      string saveLeveHtml = first.SameLeveHtml;
      // Swap the sibling-level markup for a unique token so the enclosing element can be
      // matched without the regex having to walk through the nested tags.
      string replaceGuid = Guid.NewGuid().ToString();
      fullHtml = fullHtml.Replace(saveLeveHtml, replaceGuid);
      string parentHtml = Regex.Match(fullHtml, @"<[^<]+?>[^<]*?" + replaceGuid + @".*?<\/.+?>").Value;
      parentHtml = parentHtml.Replace(replaceGuid, saveLeveHtml);

      XHtmlElement xhe = new XHtmlElement(parentHtml);
      // NOTE(review): the method name of this final call was not legible in the reviewed
      // copy; the parameterless Descendants() is assumed — confirm.
      return xhe.Descendants() ?? new List<HtmlInfo>();
    }

    /// <summary>First element of the sequence, or null for a null or empty sequence.</summary>
    private static HtmlInfo FirstOrNull(IEnumerable<HtmlInfo> htmlInfoList)
    {
      return htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
    }
  }
  /// <summary>
  /// html information class: one parsed element and the markup it came from.
  /// </summary>
  public class HtmlInfo
  {
    /// <summary>
    /// Element name
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Element properties (attribute name to attribute value)
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// The internal html of the element
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// NOTE(review): presumably the element's markup exactly as it appeared in the
    /// source document — confirm against the code that fills it.
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// NOTE(review): presumably the markup of the whole sibling level this element was
    /// parsed from — confirm against the code that fills it.
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Get the html of the element, rebuilt from <see cref="TagName"/>,
    /// <see cref="Attributes"/> and <see cref="InnerHtml"/>.
    /// </summary>
    /// <returns>
    /// The opening tag, attributes, inner html and closing tag. The separator space after
    /// the tag name is always written, even when there are no attributes.
    /// </returns>
    public string FullHtml
    {
      get
      {
        string attributesString = string.Empty;
        if (Attributes != null && Attributes.Count > 0)
        {
          attributesString = string.Join(" ", Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
        }
        return string.Format("<{0} {2}>{1}</{0}>", TagName, InnerHtml, attributesString);
      }
    }
  }
}

Front-end HTML (the sample page parsed in the usage example):

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title></title>
</head>
<body>
  <a >I am1</a> 
  <a  class="icon">icon</a>
  <img />
</body>
</html>