The program automatically reads the information displayed on another website's pages, much like a crawler. For example, we have a system that extracts the song search rankings from the Baidu website; an analysis system then analyzes the collected data to provide reference data for the business.
To meet the above requirements, we need to simulate a browser visiting the web page, fetch the page data, parse it, and finally write the parsed result, that is, the organized data, into the database. Our approach is:
1. Send an HttpRequest request.
2. Receive the result returned by HttpResponse. Get the html source file for a specific page.
3. Take out the source code containing the data.
4. Generate HtmlDocument based on the html source code and loop out the data.
5. Write to the database.
The procedure is as follows:
/// <summary>
/// Gets the HTML source code of the web page at the given URL address.
/// The response body is decoded as GB2312.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The page source, or an empty string when the request or the read fails
/// (an "Error" message box is shown in that case).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        //Declare an HttpWebRequest request
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        //Set the connection timeout time (milliseconds)
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader so the connection
        // is released even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        {
            Encoding encoding = Encoding.GetEncoding("GB2312");
            using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
            {
                strResult = streamReader.ReadToEnd();
            }
        }
    }
    catch
    {
        // Deliberate best-effort: any failure is reported to the user and the
        // caller receives an empty string instead of an exception.
        MessageBox.Show("Error");
    }
    return strResult;
}
To use HttpWebRequest and HttpWebResponse, you need to add the following namespace reference:
using System.Net;
The following is the specific implementation process of the program:
/// <summary>
/// Click handler: downloads the ranking page, cuts out the first table that
/// follows the "TOP500 Songs" marker, parses its rows through an in-memory
/// HtmlDocument, adds one line per song via AddLine, and finally passes the
/// collected table to InsertData.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    //The URL address to be crawled
    // NOTE(review): the host part of this URL looks truncated ("list.") - confirm the real address.
    string Url = "http://list./topso/?id=1#top2";
    //Get the source code of the specified Url
    string strWebContent = GetWebContent(Url);
    // NOTE(review): the target control was lost in the original listing; richTextBox1 is
    // a reconstruction - confirm against the form designer.
    richTextBox1.Text = strWebContent;

    // GetWebContent returns "" after reporting a failure; nothing to parse then.
    if (string.IsNullOrEmpty(strWebContent))
    {
        return;
    }

    //Fetch out the source code related to the data
    // Each search starts where the previous marker was found; a miss (-1) is
    // propagated instead of being used as a start index, which would throw.
    const string tableEndTag = "</table>";
    int iBodyStart = strWebContent.IndexOf("<body", 0, StringComparison.Ordinal);
    int iStart = iBodyStart < 0 ? -1 : strWebContent.IndexOf("TOP500 Songs", iBodyStart, StringComparison.Ordinal);
    int iTableStart = iStart < 0 ? -1 : strWebContent.IndexOf("<table", iStart, StringComparison.Ordinal);
    int iTableEnd = iTableStart < 0 ? -1 : strWebContent.IndexOf(tableEndTag, iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        // The page layout is not what this parser expects.
        MessageBox.Show("Error");
        return;
    }
    // Include the closing tag itself in the extracted fragment.
    string strWeb = strWebContent.Substring(iTableStart, iTableEnd - iTableStart + tableEndTag.Length);

    //Generate HtmlDocument
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);
        HtmlElementCollection htmlTR = htmldoc.GetElementsByTagName("TR");
        foreach (HtmlElement tr in htmlTR)
        {
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");
            // A row carries up to three (rank, song) cell pairs side by side:
            // cells 0/1, 2/3 and 4/5. Rows with fewer cells yield fewer lines
            // instead of failing on a missing index.
            for (int i = 0; i < 6 && i + 1 < cells.Count; i += 2)
            {
                string rankText = cells[i].InnerText ?? "";
                string songText = cells[i + 1].InnerText;
                // The rank is rendered with a dot; strip it.
                string strID = rankText.Replace(".", "");
                string strName = SplitName(songText, "MusicName");
                string strSinger = SplitName(songText, "Singer");
                //Insert DataTable
                AddLine(strID, strName, strSinger, "0");
            }
        }
    }

    //Insert into the database
    InsertData(dt);
    // NOTE(review): both sides of this assignment were lost in the original listing;
    // binding the grid to dt.DefaultView is a reconstruction - confirm against the form.
    dataGridView1.DataSource = dt.DefaultView;
}
To meet the above requirements, we need to simulate a browser visiting the web page, fetch the page data, parse it, and finally write the parsed result, that is, the organized data, into the database. Our approach is:
1. Send an HttpRequest request.
2. Receive the result returned by HttpResponse. Get the html source file for a specific page.
3. Take out the source code containing the data.
4. Generate HtmlDocument based on the html source code and loop out the data.
5. Write to the database.
The procedure is as follows:
/// <summary>
/// Gets the HTML source code of the web page at the given URL address.
/// The response body is decoded as GB2312.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The page source, or an empty string when the request or the read fails
/// (an "Error" message box is shown in that case).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        //Declare an HttpWebRequest request
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        //Set the connection timeout time (milliseconds)
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader so the connection
        // is released even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        {
            Encoding encoding = Encoding.GetEncoding("GB2312");
            using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
            {
                strResult = streamReader.ReadToEnd();
            }
        }
    }
    catch
    {
        // Deliberate best-effort: any failure is reported to the user and the
        // caller receives an empty string instead of an exception.
        MessageBox.Show("Error");
    }
    return strResult;
}
To use HttpWebRequest and HttpWebResponse, you need to add the following namespace reference:
using System.Net;
The following is the specific implementation process of the program:
/// <summary>
/// Click handler: downloads the ranking page, cuts out the first table that
/// follows the "TOP500 Songs" marker, parses its rows through an in-memory
/// HtmlDocument, adds one line per song via AddLine, and finally passes the
/// collected table to InsertData.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    //The URL address to be crawled
    // NOTE(review): the host part of this URL looks truncated ("list.") - confirm the real address.
    string Url = "http://list./topso/?id=1#top2";
    //Get the source code of the specified Url
    string strWebContent = GetWebContent(Url);
    // NOTE(review): the target control was lost in the original listing; richTextBox1 is
    // a reconstruction - confirm against the form designer.
    richTextBox1.Text = strWebContent;

    // GetWebContent returns "" after reporting a failure; nothing to parse then.
    if (string.IsNullOrEmpty(strWebContent))
    {
        return;
    }

    //Fetch out the source code related to the data
    // Each search starts where the previous marker was found; a miss (-1) is
    // propagated instead of being used as a start index, which would throw.
    const string tableEndTag = "</table>";
    int iBodyStart = strWebContent.IndexOf("<body", 0, StringComparison.Ordinal);
    int iStart = iBodyStart < 0 ? -1 : strWebContent.IndexOf("TOP500 Songs", iBodyStart, StringComparison.Ordinal);
    int iTableStart = iStart < 0 ? -1 : strWebContent.IndexOf("<table", iStart, StringComparison.Ordinal);
    int iTableEnd = iTableStart < 0 ? -1 : strWebContent.IndexOf(tableEndTag, iTableStart, StringComparison.Ordinal);
    if (iTableEnd < 0)
    {
        // The page layout is not what this parser expects.
        MessageBox.Show("Error");
        return;
    }
    // Include the closing tag itself in the extracted fragment.
    string strWeb = strWebContent.Substring(iTableStart, iTableEnd - iTableStart + tableEndTag.Length);

    //Generate HtmlDocument
    using (WebBrowser webb = new WebBrowser())
    {
        webb.Navigate("about:blank");
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);
        HtmlElementCollection htmlTR = htmldoc.GetElementsByTagName("TR");
        foreach (HtmlElement tr in htmlTR)
        {
            HtmlElementCollection cells = tr.GetElementsByTagName("TD");
            // A row carries up to three (rank, song) cell pairs side by side:
            // cells 0/1, 2/3 and 4/5. Rows with fewer cells yield fewer lines
            // instead of failing on a missing index.
            for (int i = 0; i < 6 && i + 1 < cells.Count; i += 2)
            {
                string rankText = cells[i].InnerText ?? "";
                string songText = cells[i + 1].InnerText;
                // The rank is rendered with a dot; strip it.
                string strID = rankText.Replace(".", "");
                string strName = SplitName(songText, "MusicName");
                string strSinger = SplitName(songText, "Singer");
                //Insert DataTable
                AddLine(strID, strName, strSinger, "0");
            }
        }
    }

    //Insert into the database
    InsertData(dt);
    // NOTE(review): both sides of this assignment were lost in the original listing;
    // binding the grid to dt.DefaultView is a reconstruction - confirm against the form.
    dataGridView1.DataSource = dt.DefaultView;
}