简体   繁体   中英

A C# console application to download a web page's source (JavaScript may be the cause of the missing content)

Aim: To download a web page's source using a console application. The class used for this is shown in the program below.

Question: I use the code below to download the data (source) of a web page. Imagine you are using Chrome: if you first enter the URL with the query string, the web page redirects you to an HTML view page where you see the data.

  1. When this URL is entered, the page redirects itself to the second page below in order to show the results. The redirect is done with JavaScript.

www.xyz.com/aaa.html?search=aaa&id=1

  2. It redirects here: www.xyz.com/ViewResult.html

In a browser, it works fine. I see 4 HTML tables inside the page when I use Google Chrome's "view source" option. But in my application I see only two of the 4 tables. Two tables inside the web page are missing (the missing ones are the second and the third).

How can I overcome this problem? I want to get the source of the page exactly as I see it in Chrome.

Additional information: There is no iframe.

The particular Code :

  // First request: the search page, which in a real browser redirects via JavaScript.
  string url = "www.xyz.com/aaa.html?search=aaa&id=1";
  WebPage pG = ss.RequestPage(url, "", "GET");

  // Second request: the result view the browser would have been redirected to.
  pG = ss.RequestPage("www.xyz.com/ViewResult.html");

  // Raw HTML of the result view as returned by the server.
  string source = pG.Html;


 /// <summary>
 /// Issues a single HTTP request (with manual cookie and redirect handling) and returns the
 /// downloaded page.
 /// </summary>
 /// <param name="url">Address to request.</param>
 /// <param name="content">Request body; only sent when <paramref name="method"/> is "POST".</param>
 /// <param name="method">HTTP method, e.g. "GET" or "POST".</param>
 /// <param name="contentType">Content-Type header value; only applied for "POST".</param>
 /// <returns>
 /// The page that was read. When the response carries a Location header the target is requested
 /// as well and the returned page wraps both the redirect target and the intermediate page.
 /// </returns>
 /// <exception cref="WebException">
 /// Rethrown once <c>PageReattempts</c> retries have failed.
 /// </exception>
 public WebPage RequestPage(Uri url, string content, string method, string contentType)
        {
            HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

            request.Proxy = Proxy;
            request.Timeout = 60000;
            request.Method = method;
            // Redirects are followed manually further down so cookies set by the
            // intermediate response can be captured.
            request.AllowAutoRedirect = false;
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            request.Referer = LastUrl;
            request.KeepAlive = true;
            request.UserAgent = UserAgent;

            request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
            request.Headers.Add("Cache-Control", "no-cache");
            request.Headers.Add("Accept-Encoding", "gzip,deflate");

            // Serialize the cookie jar as "k1=v1;k2=v2;...".
            StringBuilder cookieBuilder = new StringBuilder();
            foreach (KeyValuePair<String, String> cookiePair in Cookies)
                cookieBuilder.Append(cookiePair.Key).Append('=').Append(cookiePair.Value).Append(';');

            if (cookieBuilder.Length > 2)
            {
                // Drop the trailing ';'.
                request.Headers.Add("Cookie", cookieBuilder.ToString(0, cookieBuilder.Length - 1));
            }

            if (method == "POST")
            {
                // NOTE(review): ASCII encoding replaces non-ASCII characters with '?';
                // kept as-is because the server-side expectation is not visible here.
                byte[] contentData = new ASCIIEncoding().GetBytes(content);

                request.ContentLength = contentData.Length;
                request.ContentType = contentType;

                using (Stream contentWriter = request.GetRequestStream())
                {
                    contentWriter.Write(contentData, 0, contentData.Length);
                }
            }

            HttpWebResponse response = null;
            int attempts = 0;

            // NOTE(review): the same HttpWebRequest instance is reused for every attempt;
            // confirm that a failed request is actually re-issued by GetResponse().
            while (true)
            {
                try
                {
                    response = (HttpWebResponse)request.GetResponse();
                    if (response == null)
                        throw new WebException();

                    break;
                }
                catch (WebException ex)
                {
                    if (attempts == PageReattempts)
                    {
                        // Leave ex.Response open so the caller can still inspect it.
                        throw;
                    }

                    // Release the connection held by the error response before retrying.
                    if (ex.Response != null)
                        ex.Response.Close();

                    // Wait three seconds before trying again
                    Thread.Sleep(3000);
                }

                attempts += 1;
            }

            string htmlResult;
            String newLocation;

            try
            {
                // Tokenize cookies
                String setCookie = response.Headers["Set-Cookie"];
                if (setCookie != null)
                {
                    String headers = setCookie.Replace("path=/,", ";").Replace("HttpOnly,", "");
                    foreach (String cookie in headers.Split(';'))
                    {
                        // Split on the FIRST '=' only: cookie values may themselves contain '='
                        // (e.g. base64 padding) and must not be truncated.
                        int separator = cookie.IndexOf('=');
                        if (separator >= 0)
                        {
                            String cookieKey = cookie.Substring(0, separator).Trim();
                            String cookieValue = cookie.Substring(separator + 1).Trim();
                            Cookies[cookieKey] = cookieValue;
                        }
                        else
                        {
                            Cookies[cookie] = "";
                        }
                    }
                }

                htmlResult = ReadResponseStream(response);
                newLocation = response.Headers["Location"];
            }
            finally
            {
                // Always release the connection, even when parsing or reading fails.
                response.Close();
            }

            if (newLocation != null)
            {
                // A Location header may be relative; resolve it against the requested URL.
                Uri absoluteLocation;
                if (!Uri.TryCreate(newLocation, UriKind.Absolute, out absoluteLocation))
                    newLocation = new Uri(url, newLocation).AbsoluteUri;

                Thread.Sleep(1500);
                WebPage result = RequestPage(newLocation);
                return new WebPage(result.Html, new WebPage(htmlResult));
            }

            LastUrl = url.ToString();

            return new WebPage(htmlResult);
        }

1-WebBrowser :

/// <summary>
/// A <see cref="System.Windows.Forms.WebBrowser"/> that suppresses script error dialogs,
/// neutralizes blocking script calls (alert/print/open/unload handlers) and offers helpers
/// that wait until a navigation has fully completed.
/// </summary>
public class ExtendedWebBrowser : System.Windows.Forms.WebBrowser
{
    // Script that replaces dialog-producing / navigation-blocking window functions with no-ops.
    private const string AlertBlocker = @"window.alert = function () { }; 
                        window.print = function () { }; 
                        window.open = function () { }; 
                        window.onunload = function () { }; 
                        window.onbeforeunload = function () { };";

    public ExtendedWebBrowser()
    {
        // Suppress script error dialogs so unattended navigation is never blocked by one.
        this.ScriptErrorsSuppressed = true;
        this.ProgressChanged += ExtendedWebBrowser_ProgressChanged;
    }

    /// <summary>
    /// Re-injects the alert blocker on every progress notification, because each newly
    /// loaded document gets a fresh <c>window</c> object.
    /// </summary>
    private void ExtendedWebBrowser_ProgressChanged(object sender, WebBrowserProgressChangedEventArgs e)
    {
        // The handler is only attached to this instance, so sender and this are the same
        // browser; injecting once is sufficient. Document is null before the first load.
        this.Document?.InvokeScript("execScript", new Object[] { AlertBlocker, "JavaScript" });
    }

    /// <summary>
    /// Navigates to <paramref name="url"/> and blocks, while pumping the message loop,
    /// until the document has been fully loaded.
    /// </summary>
    public void NavigationWaitToComplete(string url)
    {
        PumpUntilCompleted(NavigationAsync(url));
    }

    /// <summary>
    /// Navigates with the given frame, post data and headers and blocks, while pumping the
    /// message loop, until the document has been fully loaded.
    /// </summary>
    public void NavigationWaitToComplete(string url, string targetFrameName, byte[] postData, string additionalHeaders)
    {
        PumpUntilCompleted(NavigationAsync(url, targetFrameName, postData, additionalHeaders));
    }

    /// <summary>
    /// Starts a navigation with frame, post data and headers; the returned task completes
    /// once the document reports <see cref="WebBrowserReadyState.Complete"/>.
    /// </summary>
    public Task NavigationAsync(string url, string targetFrameName, byte[] postData, string additionalHeaders)
    {
        return NavigateAndWaitAsync(() => Navigate(url, targetFrameName, postData, additionalHeaders));
    }

    /// <summary>
    /// Starts a navigation to <paramref name="url"/>; the returned task completes once the
    /// document reports <see cref="WebBrowserReadyState.Complete"/>.
    /// </summary>
    public Task NavigationAsync(string url)
    {
        return NavigateAndWaitAsync(() => Navigate(url));
    }

    /// <summary>
    /// Shared implementation of both NavigationAsync overloads: subscribes to the browser
    /// events, triggers the navigation and waits for it to finish.
    /// </summary>
    private async Task NavigateAndWaitAsync(Action navigate)
    {
        TaskCompletionSource<bool> tcsNavigation = new TaskCompletionSource<bool>();
        TaskCompletionSource<bool> tcsDocument = new TaskCompletionSource<bool>();

        // TrySetResult tolerates the events firing more than once (frames, redirects).
        WebBrowserNavigatedEventHandler onNavigated = (s, e) => tcsNavigation.TrySetResult(true);

        WebBrowserDocumentCompletedEventHandler onDocumentCompleted = (s, e) =>
        {
            // DocumentCompleted also fires per frame; wait for the whole document.
            if (ReadyState == WebBrowserReadyState.Complete)
                tcsDocument.TrySetResult(true);
        };

        Navigated += onNavigated;
        DocumentCompleted += onDocumentCompleted;
        try
        {
            navigate();

            await tcsNavigation.Task;
            // navigation completed, but the document may still be loading

            await tcsDocument.Task;
            // the document has been fully loaded, you can access DOM here
        }
        finally
        {
            // Detach the handlers so repeated navigations do not accumulate subscriptions.
            Navigated -= onNavigated;
            DocumentCompleted -= onDocumentCompleted;
        }
    }

    /// <summary>
    /// Keeps the UI message loop running until <paramref name="navigation"/> has finished,
    /// then surfaces any exception the navigation produced.
    /// </summary>
    private static void PumpUntilCompleted(Task navigation)
    {
        while (!navigation.IsCompleted)
        {
            System.Windows.Forms.Application.DoEvents();
        }

        // The task is already complete, so this does not block; it only rethrows failures.
        navigation.GetAwaiter().GetResult();
    }
}

Calling:

// Load the page in the embedded browser and wait until scripts have built the final DOM.
var browser = new ExtendedWebBrowser();
browser.NavigationWaitToComplete("www.xyz.com/aaa.html?search=aaa&id=1");
// OuterHtml is a property of HtmlElement, not a method.
var html = browser.Document.Body.OuterHtml;

2-CefSharp.OffScreen

/// <summary>
/// Loads <paramref name="url"/> in an off-screen Chromium browser and returns the HTML source
/// of the main frame once it has finished loading.
/// </summary>
/// <param name="url">Address to load.</param>
/// <param name="cachePath">Cache directory used by the request context.</param>
/// <param name="zoomLevel">Zoom level applied to the main frame when greater than 1.</param>
/// <returns>The source of the first main frame that finishes loading.</returns>
private async Task<string> RequestPageAsync(string url, string cachePath, double zoomLevel)
    {
        // Continuations run asynchronously so the awaiting code (which disposes the browser)
        // never executes inside a browser callback.
        var tcs = new TaskCompletionSource<string>(TaskCreationOptions.RunContinuationsAsynchronously);
        var browserSettings = new BrowserSettings();
        //Reduce rendering speed to one frame per second so it's easier to take screen shots
        browserSettings.WindowlessFrameRate = 1;
        var requestContextSettings = new RequestContextSettings { CachePath = cachePath };
        // RequestContext can be shared between browser instances and allows for custom settings
        // e.g. CachePath
        using (var requestContext = new RequestContext(requestContextSettings))
        using (var browser = new ChromiumWebBrowser(url, browserSettings, requestContext))
        {
            if (zoomLevel > 1)
            {
                browser.FrameLoadStart += (s, argsi) =>
                {
                    var b = (ChromiumWebBrowser)s;
                    if (argsi.Frame.IsMain)
                    {
                        b.SetZoomLevel(zoomLevel);
                    }
                };
            }
            browser.FrameLoadEnd += (s, argsi) =>
            {
                var b = (ChromiumWebBrowser)s;
                if (argsi.Frame.IsMain)
                {
                    b.GetSourceAsync().ContinueWith(taskHtml =>
                    {
                        // Forward failure/cancellation instead of throwing inside the continuation.
                        if (taskHtml.IsFaulted)
                            tcs.TrySetException(taskHtml.Exception.InnerExceptions);
                        else if (taskHtml.IsCanceled)
                            tcs.TrySetCanceled();
                        else
                            tcs.TrySetResult(taskHtml.Result);
                    });
                }
            };

            // Await INSIDE the using blocks: the browser must stay alive until the page has
            // loaded. Waiting after disposal would never complete.
            // NOTE(review): only the first main-frame load is captured; a page that redirects
            // via script afterwards may need to wait for a later FrameLoadEnd - confirm.
            return await tcs.Task;
        }
    }

Calling :

// The returned task carries the page HTML, so it must be observed rather than discarded.
// From a non-async entry point use .GetAwaiter().GetResult() instead of await.
string pageHtml = await RequestPageAsync("www.xyz.com/aaa.html?search=aaa&id=1", "cachePath1", 1.0);

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM