简体   繁体   中英

Web Scraping-crawling

 private void button2_Click(object sender, EventArgs e)
    {
        // Search Google for the text typed into textBox2, show every outbound
        // result link in listBox1, and append the <p> text of each linked page
        // to website.txt.
        listBox1.Items.Clear();

        string searchUrl = "http://google.com/search?q=" + textBox2.Text.Trim();

        string pageHtml;
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(searchUrl);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
        {
            // UTF-8, not ASCII: ASCII corrupts non-ASCII characters and any
            // multi-byte sequence split across the original 8 KB buffer reads.
            pageHtml = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.LoadHtml(pageHtml);
        HtmlNode doc = html.DocumentNode;

        var anchors = doc.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return; // SelectNodes returns null (not an empty list) when nothing matches
        }

        // Open the output file once; 'using' closes it even if a fetch throws.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (HtmlNode link in anchors)
            {
                string hrefValue = link.GetAttributeValue("href", string.Empty);

                // Keep only Google redirect links (/url?q=<target>&...) that
                // point outside Google itself.
                if (hrefValue.IndexOf("google", StringComparison.OrdinalIgnoreCase) >= 0
                    || !hrefValue.Contains("/url?q=")
                    || hrefValue.IndexOf("http://", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    continue;
                }

                int index = hrefValue.IndexOf("&");
                if (index <= 0)
                {
                    continue;
                }

                string target = hrefValue.Substring(0, index).Replace("/url?q=", "");
                listBox1.Items.Add(target);

                // BUG FIX: fetch the linked page itself. The original passed
                // SearchResults (the Google results URL) here, so the regex ran
                // against the results page and website.txt never received the
                // content of the extracted links.
                string sourceCode = worker.GetSourceCode(target);

                MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
                foreach (Match m in data)
                {
                    string value = m.Groups[1].Value;
                    // Undo a few common HTML entities/tags (Turkish letters included).
                    value = value.Replace("&rsquo;", "'")
                                 .Replace("<strong>", "")
                                 .Replace("</strong>", "")
                                 .Replace("Ouml;z", "Ö")
                                 .Replace("&ouml;", "ö")
                                 .Replace("&uuml;", "ü")
                                 .Replace("&ccedil;", "ç");
                    sw.Write(value);
                }
            }
        }
    }



       public static string GetSourceCode(string url)
    {
        // Downloads the raw HTML of the given URL, decoded as UTF-8.
        //
        // The original was truncated (missing ';' after 'return SourceCode'
        // and the closing brace) and closed the reader/response manually,
        // leaking both if ReadToEnd threw. 'using' guarantees disposal.
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), Encoding.UTF8))
        {
            return sr.ReadToEnd();
        }
    }

Hi all. I am trying to build a Windows Forms application for web scraping. I enter an expression into my form, and the program automatically searches Google for that expression. The program shows the found links in a list box and should write the content of those links (the text inside the pages) to a text file. Showing the links works fine, but the program does not record the link content in the text file.

I tried debug mode. As a result, I found that the program never entered the following code block.

foreach(Match m in data)
        {

            string value = m.Groups[1].Value;
            value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
            values.Add(value);

            sw.Write(value);
        }

I tried the link-listing code block and the content-recording code block separately; both of them work fine. When I tried to combine them, I couldn't get working code — there was no error, but it didn't work. Please help.

    private void Clicked(object sender, EventArgs e)
    {
        // Fetch the page whose URL is typed into textBox1 and append the text
        // of every <p>...</p> element to website.txt.
        string url = textBox1.Text;
        string sourceCode = worker.GetSourceCode(url);

        MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // Open the output file once instead of re-opening and closing it for
        // every match; 'using' also closes it if anything below throws.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                string value = m.Groups[1].Value;
                // Undo a few common HTML entities/tags (Turkish letters included).
                value = value.Replace("&rsquo;", "'")
                             .Replace("<strong>", "")
                             .Replace("</strong>", "")
                             .Replace("Ouml;z", "Ö")
                             .Replace("&ouml;", "ö")
                             .Replace("&uuml;", "ü")
                             .Replace("&ccedil;", "ç");
                sw.Write(value);
            }
        }
    }

    private void button2_Click(object sender, EventArgs e)
    {
        // Search Google for the text in textBox2, list each result link in
        // listBox1, and hand every link to GetData(), which records its
        // contents in website.txt.
        listBox1.Items.Clear();

        string searchUrl = "http://google.com/search?q=" + textBox2.Text.Trim();

        string pageHtml;
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(searchUrl);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
        {
            // UTF-8 instead of ASCII: ASCII corrupts non-ASCII characters and
            // any multi-byte sequence split across the 8 KB buffer boundary.
            // 'using' disposes the response/stream, which the original leaked.
            pageHtml = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.LoadHtml(pageHtml);
        HtmlNode doc = html.DocumentNode;

        var anchors = doc.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return; // SelectNodes returns null, not an empty list, when no nodes match
        }

        foreach (HtmlNode link in anchors)
        {
            string hrefValue = link.GetAttributeValue("href", string.Empty);

            // Keep only Google redirect links (/url?q=<target>&...) that point
            // outside Google itself. Case-insensitive comparison instead of the
            // original ToUpper() round-trips on an already-string value.
            if (hrefValue.IndexOf("google", StringComparison.OrdinalIgnoreCase) >= 0
                || !hrefValue.Contains("/url?q=")
                || hrefValue.IndexOf("http://", StringComparison.OrdinalIgnoreCase) < 0)
            {
                continue;
            }

            int index = hrefValue.IndexOf("&");
            if (index > 0)
            {
                // Strip the /url?q= prefix and the trailing Google parameters.
                hrefValue = hrefValue.Substring(0, index).Replace("/url?q=", "");
                listBox1.Items.Add(hrefValue);
                GetData(hrefValue);
            }
        }
    }

    private void GetData(string url)
    {
        // Download the given page and append the text of every <p>...</p>
        // element to website.txt.
        string sourceCode = worker.GetSourceCode(url);

        MatchCollection data = Regex.Matches(sourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // 'using' guarantees the writer is closed even if the regex/replace
        // work throws; the original leaked the file handle on any exception.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                string value = m.Groups[1].Value;
                // Undo a few common HTML entities/tags (Turkish letters included).
                value = value.Replace("&rsquo;", "'")
                             .Replace("<strong>", "")
                             .Replace("</strong>", "")
                             .Replace("Ouml;z", "Ö")
                             .Replace("&ouml;", "ö")
                             .Replace("&uuml;", "ü")
                             .Replace("&ccedil;", "ç");
                sw.Write(value);
            }
        }
    }

    // Designer-wired handler; intentionally empty — no action on selection yet.
    private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
    {

    }

    // Designer-wired handler; intentionally empty.
    private void label3_Click(object sender, EventArgs e)
    {

    }

    // Designer-wired handler; intentionally empty.
    private void label2_Click(object sender, EventArgs e)
    {

    }


}

}

I finally managed to get it working. Here is the answer. A few problems remain in my solution; they are all about regular expressions, because websites' HTML does not follow a standard structure, so it has to be handled carefully with regex. When I complete my project, I will share my full code.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM