This article describes how to use regular expressions in C# to scrape information from a website. It is shared here for your reference, as follows:
Here we use scraping product details from JD Mall (JD.com) as the example.
1. Create a program class
/// <summary>
/// Scrapes the title, main picture and price of a product from a JD.com
/// product-detail page and writes them to the console.
/// </summary>
public class JdRobber
{
    // NOTE(review): the host part of this pattern was lost in the published
    // listing (it read "^/\d+.html$"); "item.jd.com" is assumed - confirm.
    private static readonly Regex ProductUrlRegex =
        new Regex(@"^https?://item\.jd\.com/\d+\.html$");

    // NOTE(review): the host of the price endpoint was lost in the published
    // listing (it read "http://p./prices/..."); "p.3.cn" is assumed - confirm.
    private const string PriceUrlPrefix = @"http://p.3.cn/prices/get?skuid=J_";

    /// <summary>
    /// Determines whether the given address looks like a JD.com product page.
    /// </summary>
    /// <param name="url">The address to check; may be null or empty.</param>
    /// <returns>
    /// true when <paramref name="url"/> is non-empty and matches the
    /// product-page pattern; otherwise false.
    /// </returns>
    public bool ValidationUrl(string url)
    {
        return !string.IsNullOrEmpty(url) && ProductUrlRegex.IsMatch(url);
    }

    /// <summary>
    /// Downloads the product page, extracts the product name, picture and
    /// price, and prints them with <c>Console.WriteLine</c>.
    /// Nothing is printed when the URL is rejected by
    /// <see cref="ValidationUrl"/> or the page could not be downloaded.
    /// </summary>
    /// <param name="url">Address of the product-detail page.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (string.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product key ID: the digits between the last path slash and ".html".
        string sourceWebID = WebHandler.GetRegexText(url, @"/(?<Object>\d+).html");

        // Title: first <h1> following the element whose id is "name".
        string title = WebHandler.GetRegexText(
            htmlStr, @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>");

        string picName = ExtractPicture(htmlStr);

        decimal price = 0;
        if (sourceWebID != "")
        {
            // The price is not part of the page HTML; it is fetched from a
            // separate endpoint keyed by the product ID.
            string priceUrl = PriceUrlPrefix + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string pricePattern = @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""";
            price = WebHandler.GetValidPrice(WebHandler.GetRegexText(priceJson, pricePattern));
        }

        Console.WriteLine("Product Name:{0}", title);
        Console.WriteLine("picture:{0}", picName);
        Console.WriteLine("price:{0}", price);
    }

    /// <summary>
    /// Returns the <c>src</c> of the first image inside the
    /// <c>&lt;div id="spec-n1"&gt;</c> element, or an empty string when that
    /// element (or its closing tag) is not found.
    /// </summary>
    private static string ExtractPicture(string htmlStr)
    {
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin < 0)
        {
            // Marker absent. Index 0 is a valid hit, so only -1 is rejected.
            return "";
        }

        int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
        if (end <= begin)
        {
            return "";
        }

        string subPicHtml = htmlStr.Substring(begin, end - begin);
        return WebHandler.GetRegexText(subPicHtml, @"<img.*src=\""(?<Object>.*?)\"".*/>");
    }
}
2. Create a shared utility class
/// <summary>
/// Shared helper methods: page download, regex extraction and price parsing.
/// </summary>
public class WebHandler
{
    // Accepts a non-negative number with at most two decimal places.
    private static readonly Regex PriceRegex = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="encoding">
    /// "UTF8" decodes the body as UTF-8; any other value (including "Default")
    /// uses <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>
    /// The response body, or an empty string when <paramref name="url"/> is
    /// null/empty or the request fails. This method does not throw.
    /// </returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (string.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using-blocks release the connection even when reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort by design: callers treat "" as "page unavailable".
            // The failure is traced instead of being swallowed silently.
            System.Diagnostics.Trace.TraceWarning(
                "GetHtmlStr failed for '{0}': {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> in the
    /// first match of <paramref name="pattern"/> within <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">
    /// Regular expression containing a <c>(?&lt;Object&gt;...)</c> group.
    /// </param>
    /// <returns>
    /// The captured text, or an empty string when either argument is
    /// null/empty or nothing matches.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(pattern))
        {
            return "";
        }

        // NOTE(review): the options argument was lost in the published
        // listing; IgnoreCase is assumed - confirm.
        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string to a decimal.
    /// </summary>
    /// <param name="strPrice">
    /// Text such as "12" or "12.50" (digits with at most two decimals).
    /// </param>
    /// <returns>
    /// The parsed price, or 0 when <paramref name="strPrice"/> is null/empty,
    /// is not in the accepted format, or cannot be represented as a decimal.
    /// </returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (string.IsNullOrEmpty(strPrice) || !PriceRegex.IsMatch(strPrice))
        {
            return 0;
        }

        // Invariant culture: the input always uses '.' as the decimal
        // separator, regardless of the machine's regional settings.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);

        return parsed ? price : 0;
    }
}
PS: Here are two very convenient regular expression tools for your reference:
JavaScript regular expression online testing tool:
http://tools./regex/javascript
Regular expression online generation tool:
http://tools./regex/create_reg
For more information about C#, see the related topics on this site: "Summary of C# regular expression usage", "Summary of C# coding operation skills", "Summary of XML file operation skills in C#", "Tutorial on the usage of common C# controls", "Summary of WinForm control usage", "C# data structure and algorithm tutorial", "Introduction to C# object-oriented programming tutorial", and "Summary of thread usage techniques for C# programming".
I hope this article helps you with your C# programming.