namespace PubishApps
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Downloads a web page and reports the link targets found in its
    /// <c>href</c> attributes. Each match is echoed to the console and the
    /// raw <c>href</c> match is appended to a log file.
    /// </summary>
    class FetchURLsFromSite
    {
        /************************************************
         * Topic  : How to fetch all URLs of the site.
         * Author : kalit sikka
         * For    : http://eggheadcafe.com
         ************************************************/

        /// <summary>Log file used when <see cref="LogFilePath"/> is never assigned.</summary>
        private const string DefaultLogFile = @"C:\Documents and Settings\kalit.20413\My Documents\LogFile.txt";

        /// <summary>Request timeout, in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 10000;

        /// <summary>
        /// Matches an <c>href</c> attribute and captures its target in the group
        /// named <c>url</c>. Targets starting with <c>#</c>, <c>mailto</c>,
        /// <c>location.</c> or <c>javascript</c>, and targets followed on the same
        /// line by <c>css</c> or <c>this.</c>, are rejected by the negative lookahead.
        /// Built once because the pattern never changes between calls.
        /// </summary>
        private static readonly Regex HrefPattern = new Regex(
            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])",
            RegexOptions.IgnoreCase);

        private string logFilePath = DefaultLogFile;

        /// <summary>
        /// Gets or sets the file that <c>href</c> matches are appended to.
        /// Defaults to the path that was previously hard-coded.
        /// </summary>
        /// <exception cref="ArgumentException">The assigned value is null or empty.</exception>
        public string LogFilePath
        {
            get { return logFilePath; }
            set
            {
                if (string.IsNullOrEmpty(value))
                {
                    throw new ArgumentException("Log file path must not be null or empty.", "value");
                }

                logFilePath = value;
            }
        }

        /// <summary>
        /// Fetches <paramref name="webPage"/> and reports every URL found in it.
        /// </summary>
        /// <param name="webPage">Address of the page to download.</param>
        /// <remarks>Exceptions raised while downloading or logging are not caught here.</remarks>
        public void FetchUrls(string webPage)
        {
            GetAllUrls(GetContent(webPage));
        }

        /// <summary>
        /// Downloads the page at <paramref name="webPage"/> and returns its body as text.
        /// </summary>
        /// <param name="webPage">Address of the page to download.</param>
        /// <returns>The full response body.</returns>
        private string GetContent(string webPage)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = RequestTimeoutMilliseconds;

            // The using statements release the response and reader on every path,
            // including when the request itself fails, so the original exception
            // reaches the caller with its stack trace intact.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Scans <paramref name="content"/> for <c>href</c> attributes, writes each
        /// match and its captured URL to the console, and appends the match to the log file.
        /// </summary>
        /// <param name="content">Page markup to scan.</param>
        private void GetAllUrls(string content)
        {
            for (Match match = HrefPattern.Match(content); match.Success; match = match.NextMatch())
            {
                string hrefLine = "href match: " + match.Groups[0].Value;

                Console.WriteLine(hrefLine);
                WriteToLog(logFilePath, hrefLine + "\r\n");
                Console.WriteLine("Url match: " + match.Groups["url"].Value);
            }
        }

        /// <summary>
        /// Appends a timestamped line to <paramref name="file"/>.
        /// </summary>
        /// <param name="file">Path of the log file to append to.</param>
        /// <param name="message">Text written after the timestamp.</param>
        private void WriteToLog(string file, string message)
        {
            // Disposing the writer flushes and closes it; no explicit Close is needed.
            using (StreamWriter writer = File.AppendText(file))
            {
                writer.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }
}
// Source article: http://www.eggheadcafe.com/tutorials/aspnet/23ce657b-87a9-45b3-856d-1c891803fcbd/how-to-get-all-urls-of-th.aspx
// (Blog page footer captured with the code: "No comments: / Post a Comment")