Friday, May 22, 2009

How to get all URLs of the page

Topic : How to get all URLs of the page
namespace PubishApps
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    /************************************************
     * Topic  : How to fetch all URLs of the site.
     * Author : kalit sikka
     * For    : http://eggheadcafe.com
     ************************************************/

    /// <summary>
    /// Downloads a web page and reports every <c>href</c> target found in its
    /// markup, echoing each one to the console and appending it to a log file.
    /// </summary>
    class FetchURLsFromSite
    {
        /// <summary>Request timeout, in milliseconds, applied to the page download.</summary>
        private const int RequestTimeoutMilliseconds = 10000;

        /// <summary>Log file used when no other path is supplied to the constructor.</summary>
        private const string DefaultLogFile =
            @"C:\Documents and Settings\kalit.20413\My Documents\LogFile.txt";

        /// <summary>
        /// Matches an <c>href</c> attribute and captures its target in the
        /// named group <c>url</c>. The negative look-ahead skips in-page anchors
        /// (<c>#</c>), <c>mailto</c> and <c>javascript</c> links, <c>location</c>
        /// / <c>this.</c> script expressions and stylesheet references.
        /// Built once and shared, because the pattern never changes.
        /// </summary>
        private static readonly Regex HrefPattern = new Regex(
            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])",
            RegexOptions.IgnoreCase);

        /// <summary>Path of the file that matches are appended to.</summary>
        private readonly string logFile;

        /// <summary>
        /// Creates a fetcher that logs to the default log file location.
        /// </summary>
        public FetchURLsFromSite()
            : this(DefaultLogFile)
        {
        }

        /// <summary>
        /// Creates a fetcher that logs to the given file.
        /// </summary>
        /// <param name="logFile">Path of the file that matches are appended to.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="logFile"/> is null or empty.
        /// </exception>
        public FetchURLsFromSite(string logFile)
        {
            if (string.IsNullOrEmpty(logFile))
            {
                throw new ArgumentException("A log file path is required.", "logFile");
            }

            this.logFile = logFile;
        }

        /// <summary>
        /// Fetches the page at <paramref name="webPage"/> and reports every URL
        /// found in its <c>href</c> attributes.
        /// </summary>
        /// <param name="webPage">Absolute address of the page to scan.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="webPage"/> is null.
        /// </exception>
        public void FetchUrls(string webPage)
        {
            if (webPage == null)
            {
                throw new ArgumentNullException("webPage");
            }

            GetAllUrls(GetContent(webPage));
        }

        /// <summary>
        /// Downloads the content of the web page.
        /// </summary>
        /// <param name="webPage">Absolute address of the page to download.</param>
        /// <returns>The full response body as a string.</returns>
        private string GetContent(string webPage)
        {
            // Create a request object using the url passed in.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = RequestTimeoutMilliseconds;

            // The using blocks release the response and its stream on every
            // path, including when the request itself fails, and let the
            // original exception surface with its stack trace intact.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Uses a regular expression to find the URLs in the page content,
        /// writing each match to the console and the whole <c>href</c> match to
        /// the log file.
        /// </summary>
        /// <param name="content">Markup of the page to scan.</param>
        private void GetAllUrls(string content)
        {
            for (Match match = HrefPattern.Match(content); match.Success; match = match.NextMatch())
            {
                // Group 0 is the whole attribute match, e.g. href="...".
                string wholeMatch = match.Groups[0].Value;

                Console.WriteLine("href match: " + wholeMatch);
                WriteToLog(logFile, "href match: " + wholeMatch + "\r\n");

                // The named group holds just the link target.
                Console.WriteLine("Url match: " + match.Groups["url"].Value);
            }
        }

        /// <summary>
        /// Appends a timestamped line to a log file on the local machine.
        /// </summary>
        /// <param name="file">Path of the log file; created if it does not exist.</param>
        /// <param name="message">Text to append after the timestamp.</param>
        private void WriteToLog(string file, string message)
        {
            // Disposing the writer flushes and closes the file.
            using (StreamWriter writer = File.AppendText(file))
            {
                writer.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }

}


Visit: http://www.eggheadcafe.com/tutorials/aspnet/23ce657b-87a9-45b3-856d-1c891803fcbd/how-to-get-all-urls-of-th.aspx

No comments:

Post a Comment

Locations of visitors to this page