I am trying to analyze anchor links (specifically their text
property) in PhantomJS.
The retrieval happens here:
// Ask the page for every <a> element and hand the resulting collection back to the script.
var list = page.evaluate(function() {
return document.getElementsByTagName('a');
});
This returns an object with a length property,
which is good (it is the same length
I get when running document.getElementsByTagName('a');
in the console). However, the vast majority of the elements in the object have the value null,
which is not good. I have no idea why this is happening.
I have tried converting the result to a real array with slice,
which did not help. I have tried different sites, with no difference. I have also rendered the page to a .png file to verify that the site loads properly, and it does.
This is obviously not the full script, but a minimal script that shows the problem on a well known public site ;)
How can I retrieve the full list of anchors from the loaded page ?
// Create the PhantomJS web page object that the rest of the script drives.
var page = require('webpage').create();
/**
 * Reports JavaScript errors raised inside the loaded page.
 * Writes the message, followed by one line per stack frame when a trace
 * is supplied, to console.error as a single newline-joined string.
 *
 * @param {string} msg - error message
 * @param {Array} trace - stack frames ({file, line, function}); may be empty or missing
 */
page.onError = function(msg, trace)
{
    var report = ['PAGE ERROR: ' + msg];

    // Renders one stack frame; the function name is appended only when known.
    var describeFrame = function(frame) {
        var location = ' -> ' + frame.file + ': ' + frame.line;
        if (frame.function) {
            return location + ' (in function "' + frame.function + '")';
        }
        return location;
    };

    var hasTrace = trace && trace.length;
    if (hasTrace) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            report.push(describeFrame(frame));
        });
    }

    console.error(report.join('\n'));
};
/**
 * Reports errors raised in the PhantomJS script itself, then terminates
 * with exit code 1. The message and, when present, one line per stack
 * frame are written to console.error as a single newline-joined string.
 *
 * @param {string} msg - error message
 * @param {Array} trace - stack frames ({file|sourceURL, line, function}); may be empty or missing
 */
phantom.onError = function(msg, trace)
{
    var report = ['PHANTOM ERROR: ' + msg];

    // Renders one stack frame; falls back to sourceURL when file is not set.
    var describeFrame = function(frame) {
        var origin = frame.file || frame.sourceURL;
        var location = ' -> ' + origin + ': ' + frame.line;
        if (frame.function) {
            return location + ' (in function ' + frame.function + ')';
        }
        return location;
    };

    var hasTrace = trace && trace.length;
    if (hasTrace) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            report.push(describeFrame(frame));
        });
    }

    console.error(report.join('\n'));
    phantom.exit(1);
};
/**
 * Opens `url` in the PhantomJS page, saves a screenshot to login.png,
 * fetches the page's anchor elements through page.evaluate and logs one
 * line per entry. Exits with code 0 after a successful run and with
 * code 1 when the page fails to load.
 *
 * @param {string} url - address of the page to load
 */
function start( url )
{
    page.open( url , function (status)
    {
        console.log( 'Loaded' , url , ': ' , status );
        if( status !== 'success' )
        {
            // phantom.exit() does not stop the current callback, so return
            // explicitly; otherwise render/evaluate would still run against
            // a page that never loaded. A non-zero code signals the failure.
            phantom.exit( 1 );
            return;
        }
        page.render( 'login.png');
        var list = page.evaluate(function() {
            return document.getElementsByTagName('a');
        });
        console.log( 'List length: ' , list.length );
        for( var i = 0 ; i < list.length ; i++ )
        {
            if( !list[i] )
            {
                console.log( i , typeof list[i] , list[i] === null , list[i] === undefined );
                //list[i] === null -> true for the problematic anchors
                continue;
            }
            console.log( i, list[i].innerText , ',' , list[i].text /*, JSON.stringify( list[i] ) */ );
        }
        //Exit with grace
        phantom.exit( 0 );
    });
}
// Run the script against the public site's front page.
start( 'http://data.stackexchange.com/' );
// Alternative target (the login page), currently disabled:
//start( 'http://data.stackexchange.com/account/login?returnurl=/' );
The current version of PhantomJS permits only JSON-serializable values (booleans, strings, numbers, arrays []
and plain objects {}
) to pass to and from the page context. Functions and other non-serializable objects are stripped, and DOM elements fall into that category. t.niese found the quote from the docs:
Note: The arguments and the return value to the evaluate function must be a simple primitive object. The rule of thumb: if it can be serialized via JSON, then it is fine.
Closures, functions, DOM nodes, etc. will not work!
You need to do a part of the work inside of the page context. If you want the innerText
property of every node, then you need to map it to a primitive type first:
// Collect the text of every anchor inside the page context, so that only
// an array of strings crosses back into the PhantomJS script.
var list = page.evaluate(function() {
    var anchors = document.getElementsByTagName('a');
    return Array.prototype.map.call(anchors, function(anchor) {
        return anchor.innerText;
    });
});
console.log(list[0]); // innerText
You can of course map multiple properties at the same time:
// Each anchor becomes a plain object carrying its text and its href.
return Array.prototype.map.call(document.getElementsByTagName('a'), function(a){
return { text: a.innerText, href: a.href };
});
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.