简体   繁体   中英

How to get dynamic HTML and Javascript values from a page using PhantomJS

How can I get the latest page data (HTML and JavaScript variables) from PhantomJS?

For example, is there a page.refresh() or something similar?

I have an interval that checks a variable (on the page) every 200ms. However, neither this variable nor the page content appears to change over time (even though I know both have changed).

So I need an efficient way to check the value of a JS variable every 200ms or so,

then once I've discovered that variable has changed value, I want to request the latest page HTML.

How can I do this?

/**
 * Simple error holder that carries a single description.
 * NOTE: declaring this name shadows the built-in Error constructor in this script.
 *
 * @param {*} description - Explanation of what went wrong.
 * @returns {Error} The instance being constructed.
 */
var Error = function (description) {
    var instance = this;
    instance.description = description;
    return instance;
};

/**
 * Result envelope: a status flag, the content payload, and an error object.
 *
 * @param {*} status - Outcome flag stored as-is.
 * @param {*} content - Payload stored as-is.
 * @param {*} error - Error information stored as-is.
 * @returns {DTO} The instance being constructed.
 */
var DTO = function (status, content, error) {
    var dto = this;
    dto.status = status;
    dto.content = content;
    dto.error = error;
    return dto;
};

/**
 * Writes the given object to the console as JSON, then asks PhantomJS to exit.
 *
 * @param {*} dto - Value to serialize with JSON.stringify.
 */
function outputAndExit(dto) {
    var serialized = JSON.stringify(dto);
    console.log(serialized);
    phantom.exit();
}

// For any uncaught exception, report a failed DTO so the .NET caller can capture it.
window.onerror = function (errorMsg, url, lineNumber) {
    // Separate the fields explicitly; without separators the values run into
    // the next label and the message becomes hard to read.
    var description = 'window.onerror caught an error: ' +
        'errorMsg: ' + errorMsg +
        ', url: ' + url +
        ', lineNumber: ' + lineNumber;
    outputAndExit(new DTO(false, null, new Error(description)));
};

/**
 * Factory: builds a GetDynamicPageResult, runs its initialize() step,
 * and hands back the ready instance.
 *
 * @returns {GetDynamicPageResult} The initialized instance.
 */
var GetDynamicPageResult__ = function () {
    var instance = new GetDynamicPageResult();
    instance.initialize();
    return instance;
};

/**
 * Opens a URL in a PhantomJS page, polls the page-side flag
 * window.isContentReadyForCrawler on an interval, and reports the page HTML
 * through outputAndExit once the flag is set (or a failure DTO on timeout,
 * missing arguments, or a failed page load).
 */
var GetDynamicPageResult = function () {
    var self = this;

    // Give up when the page has not signalled readiness after this many milliseconds.
    var MAX_WAIT_MS = 7000;

    /**
     * Resets state, loads the PhantomJS modules, reads the URL argument and
     * opens the page.
     */
    this.initialize = function () {

        this.error = null;
        this.isContentReadyForCrawler = false;

        this.ticker = null;
        this.tickerInterval = 150;
        this.tickerElapsed = 0;

        this.url = '';

        this.loadDependencies();

        // phantom.exit() may not stop script execution immediately, so do not
        // go on to open a page when the arguments were already rejected.
        if (!this.processArgs()) {
            return;
        }

        this.openPage();

    };

    /** Loads the PhantomJS modules used by this object and creates the page. */
    this.loadDependencies = function () {
        this.system = require('system');
        this.page = require('webpage').create();
        // NOTE(review): this runs before any URL is opened; presumably the
        // injected script does not survive the navigation done by page.open — confirm.
        this.page.injectJs('jquery-1.10.2.min');
        this.fs = require('fs');
    };

    /**
     * Reads the target URL from the command-line arguments.
     *
     * @returns {boolean} true when a URL was supplied, false when a failure
     *     DTO was emitted instead.
     */
    this.processArgs = function () {
        // system.args[0] is the name of this script, so the URL is present
        // only when there are at least two entries.
        if (this.system.args.length < 2) {
            outputAndExit(new DTO(false, null, new Error('No arguments given')));
            return false;
        }
        this.url = this.system.args[1];
        return true;
    };

    /**
     * Refreshes isContentReadyForCrawler from the page's own
     * window.isContentReadyForCrawler.
     */
    this.updateIsContentReadyForCrawler = function () {
        // page.evaluate runs the callback sandboxed inside the web page: it
        // cannot see this script's closures, so the value must come back as
        // the return value rather than being assigned to `self` in the callback.
        var isReady = self.page.evaluate(function () {
            return window.isContentReadyForCrawler;
        });
        self.isContentReadyForCrawler = Boolean(isReady);
    };

    /** Opens this.url and starts polling once the page has loaded. */
    this.openPage = function () {
        self.page.open(this.url, function (status) { //NB: status = 'success' || 'fail'
            if (status !== 'success') {
                outputAndExit(new DTO(false, null, new Error('page.open received a non-success status')));
                // Do not start polling a page that failed to load.
                return;
            }
            self.initTicker();
        });
    };

    /** Starts the polling interval that drives handleTick. */
    this.initTicker = function () {
        this.ticker = setInterval(self.handleTick, self.tickerInterval);
    };

    /**
     * One polling step: re-reads the readiness flag, then either finishes with
     * the current page HTML, finishes with a timeout error, or keeps waiting.
     */
    this.handleTick = function () {
        self.tickerElapsed += self.tickerInterval;
        self.updateIsContentReadyForCrawler();

        if (self.isContentReadyForCrawler) {
            clearInterval(self.ticker);
            self.finish(true, self.page.content, null);
            return;
        }

        if (self.tickerElapsed > MAX_WAIT_MS) {
            clearInterval(self.ticker);
            self.finish(false, null, new Error('Too much time elapsed'));
        }
    };

    /**
     * Emits the final DTO and exits.
     *
     * @param {boolean} status - Whether the content was obtained.
     * @param {?string} content - Page HTML; falsy values become ''.
     * @param {?Object} error - Error details; falsy values become {}.
     */
    this.finish = function (status, content, error) {
        content = content || '';
        error = error || {};
        outputAndExit(new DTO(status, content, error));
    };
};

/**********************************************************************************/
/***************************** Helpers *****************************/
/**********************************************************************************/

/**
 * Factory: builds a Utility, runs its initialize() step,
 * and hands back the ready instance.
 *
 * @returns {Utility} The initialized instance.
 */
var Utility__ = function () {
    var instance = new Utility();
    instance.initialize();
    return instance;
};

/**
 * Small collection of emptiness checks.
 */
var Utility = function () {
    var self = this;

    /** No setup is required; kept so the factory can call it uniformly. */
    this.initialize = function () {
    };

    /**
     * @param {*} obj - Value to test.
     * @returns {boolean} true when obj is null or undefined.
     */
    this.isEmpty = function (obj) {
        // `== null` matches both null and undefined.
        return obj == null;
    };

    /**
     * @param {*} str - Value to test.
     * @returns {boolean} true when str is null, undefined, or contains only
     *     whitespace once converted to a string.
     */
    this.isStringEmpty = function (str) {
        // Call the method through `self`: a local variable named isEmpty would
        // shadow it and is not callable.
        if (self.isEmpty(str)) {
            return true;
        }
        // Native trim avoids depending on a jQuery `$` global in this script.
        return String(str).trim() === '';
    };
};

// Entry point: build and start the crawler. The factory returns its own
// object, so it is invoked as a plain function.
var getDynamicPageResult = GetDynamicPageResult__();

I think you are almost there: you need to be using page.evaluate(), but you currently only use it to get window.isContentReadyForCrawler. You need to use page.evaluate() to grab the latest HTML too.

I'm going to shamelessly paste in code from another answer ( https://stackoverflow.com/a/12044474/841830 ):

// Read the current markup from inside the page: the <html> element's outerHTML
// when that element exists, otherwise the body's innerHTML.
var html = page.evaluate(function () {
    var htmlElements = document.getElementsByTagName("html");
    var rootElement = htmlElements[0];
    if (rootElement) {
        return rootElement.outerHTML;
    }
    return document.body.innerHTML;
});

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM