private void button2_Click(object sender, EventArgs e)
{
    // Searches Google for the expression typed into textBox2, lists the
    // result links in listBox1, and appends each linked page's <p> text
    // to website.txt.
    listBox1.Items.Clear();

    string searchResults = "http://google.com/search?q=" + textBox2.Text.Trim(); // value entered in the search textbox

    string pageHtml;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(searchResults);
    // `using` guarantees the response and stream are released even on error
    // (the original leaked both). A UTF-8 StreamReader replaces the manual
    // ASCII buffer loop: ASCII corrupted non-ASCII characters, and raw
    // buffer reads can split multi-byte sequences.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
    {
        pageHtml = reader.ReadToEnd();
    }

    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.OptionOutputAsXml = true;
    html.LoadHtml(pageHtml);
    HtmlNode doc = html.DocumentNode;

    // SelectNodes returns null (not an empty collection) when nothing
    // matches, which would make a bare foreach throw.
    HtmlNodeCollection anchors = doc.SelectNodes("//a[@href]");
    if (anchors == null)
        return;

    // Open the output file once for the whole batch and close it reliably.
    using (StreamWriter sw = File.AppendText("website.txt"))
    {
        foreach (HtmlNode link in anchors)
        {
            string hrefValue = link.GetAttributeValue("href", string.Empty);
            // Keep only external result links of the form "/url?q=http://...".
            if (!hrefValue.ToUpper().Contains("GOOGLE") && hrefValue.Contains("/url?q=") && hrefValue.ToUpper().Contains("HTTP://"))
            {
                int index = hrefValue.IndexOf("&");
                if (index > 0)
                {
                    string target = hrefValue.Substring(0, index).Replace("/url?q=", "");
                    listBox1.Items.Add(target);

                    // BUG FIX: the original fetched `SearchResults` (the Google
                    // results page itself) here — and did so once per anchor,
                    // outside this if — so the <p> regex never matched the
                    // intended pages and nothing was written to the file.
                    // Fetch the extracted result link instead.
                    string sourceCode = worker.GetSourceCode(target);
                    MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
                    foreach (Match m in data)
                    {
                        string value = m.Groups[1].Value;
                        // NOTE(review): these look like HTML-entity fixups whose
                        // "&...;" text was mangled when the code was posted —
                        // verify the literals against the original source.
                        value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
                        sw.Write(value);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body
/// decoded as UTF-8 text.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS URL to fetch.</param>
/// <returns>The raw page source as a string.</returns>
public static string GetSourceCode(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    // The original code was truncated (missing ';' and '}') and leaked the
    // response/reader if ReadToEnd threw; `using` disposes both reliably.
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    using (StreamReader sr = new StreamReader(resp.GetResponseStream(), Encoding.UTF8))
    {
        return sr.ReadToEnd();
    }
}
Hi all. I am trying to build a Windows Forms application for web scraping. I enter an expression on my Windows Form, and the program automatically searches Google for that expression. The program shows the links it finds in a list box and is supposed to save the text content of those linked pages to a text file. Showing the links works fine, but the program does not record the link content in the text file.
I tried debug mode. As a result, I found that the program never entered the following code block:
// (Code fragment quoted from the question — not a standalone definition.)
// For each regex match of a <p>...</p> element: take the captured inner
// text, apply a chain of literal replacements (apostrophe fixup, strip
// <strong> tags, and what appear to be mangled HTML-entity conversions for
// Turkish characters — TODO confirm the original "&...;" literals), then
// collect it and append it to the open StreamWriter.
foreach(Match m in data)
{
string value = m.Groups[1].Value;
value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
values.Add(value);
sw.Write(value);
}
I tried the link-listing code block and the content-recording code block separately. Both of them work fine on their own. When I tried to combine them, I could not get working code — no error is raised, but nothing is written. Please help.
private void Clicked(object sender, EventArgs e)
{
    // Fetches the page whose URL is typed into textBox1 and appends the
    // text of every <p> element to website.txt.
    string url = textBox1.Text;
    string sourceCode = worker.GetSourceCode(url);
    MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

    // Open the file once for the whole batch instead of re-opening and
    // re-closing it on every match (the original did that inside the loop);
    // `using` guarantees the writer is closed even if a Write throws.
    using (StreamWriter sw = File.AppendText("website.txt"))
    {
        foreach (Match m in data)
        {
            string value = m.Groups[1].Value;
            // NOTE(review): these look like HTML-entity fixups whose "&...;"
            // text was mangled when the code was posted — verify the literals.
            value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
            sw.Write(value);
        }
    }
}
private void button2_Click(object sender, EventArgs e)
{
    // Searches Google for the text in textBox2, extracts the outbound
    // result links into listBox1, and calls GetData() on each one to
    // append that page's paragraph text to website.txt.
    listBox1.Items.Clear();

    string searchResults = "http://google.com/search?q=" + textBox2.Text.Trim(); // value typed into the search textbox
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(searchResults);

    string pageHtml;
    // Dispose the response and stream deterministically — the original
    // leaked both. A UTF-8 StreamReader replaces the manual ASCII buffer
    // loop: ASCII corrupted non-ASCII characters, and raw buffer reads
    // can split multi-byte sequences across chunk boundaries.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
    {
        pageHtml = reader.ReadToEnd();
    }

    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.OptionOutputAsXml = true;
    html.LoadHtml(pageHtml);
    HtmlNode doc = html.DocumentNode;

    // SelectNodes returns null (not an empty collection) when no node
    // matches, which would make the original foreach throw.
    HtmlNodeCollection anchors = doc.SelectNodes("//a[@href]");
    if (anchors == null)
        return;

    foreach (HtmlNode link in anchors)
    {
        string hrefValue = link.GetAttributeValue("href", string.Empty);
        // Keep only external result links of the form "/url?q=http://...".
        if (!hrefValue.ToUpper().Contains("GOOGLE") && hrefValue.Contains("/url?q=") && hrefValue.ToUpper().Contains("HTTP://"))
        {
            int index = hrefValue.IndexOf("&");
            if (index > 0)
            {
                hrefValue = hrefValue.Substring(0, index);
                hrefValue = hrefValue.Replace("/url?q=", "");
                listBox1.Items.Add(hrefValue);
                GetData(hrefValue);
            }
        }
    }
}
private void GetData(string url)
{
    // Downloads <url> and appends the text of every <p> element on the
    // page to website.txt.
    string sourceCode = worker.GetSourceCode(url);
    MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

    // `using` guarantees the file is flushed and closed even if a Replace
    // or Write throws — the original leaked the writer on error. The
    // unused `values` list was removed.
    using (StreamWriter sw = File.AppendText("website.txt"))
    {
        foreach (Match m in data)
        {
            string value = m.Groups[1].Value;
            // NOTE(review): these look like HTML-entity fixups whose "&...;"
            // text was mangled when the code was posted — verify the literals.
            value = value.Replace("’", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("ö", "ö").Replace("ü", "ü").Replace("ç", "ç");
            sw.Write(value);
        }
    }
}
// Designer-generated event handler; intentionally empty.
private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
{
}
// Designer-generated event handler; intentionally empty.
private void label3_Click(object sender, EventArgs e)
{
}
// Designer-generated event handler; intentionally empty.
private void label2_Click(object sender, EventArgs e)
{
}
}
}
I finally managed to get it running successfully. Here is the answer. A few problems remain in my solution, all related to regular expressions: websites' HTML does not follow a single standard, so the matching needs to be refined with regex. When I complete my project, I will share my full code.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.