[英]Getting “raw” js and css code from server using Chrome/Chromium with WWW::Mechanize::Chrome
我正在嘗試使用WWW::Mechanize::Chrome下載css/js文件。 是的,還有其他方法可以獲取這些文件,但我的要求是必須使用WWW::Mechanize::Chrome。 我想知道這是否可行。
我可以對css或js文件執行$mech->get($url)。 然后它會顯示在瀏覽器窗口中,我可以使用$mech->content獲取其內容。 問題是內容會被編碼成HTML實體,而將它們解碼后得到的文件與原始文件不同(我測試過)。 這對js文件來說是個問題:解碼后的腳本無法正常運行。
您可以運行此測試腳本以查看編碼的文件。
use strict;
use warnings;
use Data::Dumper qw(Dumper);
use WWW::Mechanize::Chrome;

# Load a JavaScript file as if it were a page, then dump what content()
# reports for it so the entity-encoded text can be inspected.
my $script_url = 'https://www.nytimes.com/vi-assets/static-assets/vendor-454814a0340940dc9b42.js';

my $mech = WWW::Mechanize::Chrome->new();
$mech->get($script_url);

my $page_text = $mech->content;
print Dumper($page_text);
我想知道是否有某種解決方法可以直接從服務器獲取這些文件。 同樣,必須使用WWW::Mechanize::Chrome。
如果沒有別的,你可以注入一個腳本為你下載文件。
以下演示了使用Selenium::Chrome的這種方法,但該方法可以改用於WWW::Mechanize::Chrome。
use strict;
use warnings qw( all );
use FindBin qw( $RealBin );
use MIME::Base64 qw( decode_base64 );
use Selenium::Chrome qw( );
use Time::HiRes qw( sleep );
use Sub::ScopeFinalizer qw( scope_finalizer );
# nf = Non-fatal.
# Look up a single element without dying when it is simply absent.
#
# The first argument is the driver object; every remaining argument is
# forwarded to its find_element() method.
#
# Returns whatever find_element() returned, or undef when the lookup
# failed with one of the two "element not found" messages. Any other
# exception is re-thrown unchanged.
sub nf_find_element {
    my ($driver, @locator_args) = @_;

    # Run the lookup under eval so a failure can be inspected instead of
    # propagating straight to the caller.
    my $element;
    my $succeeded = eval {
        $element = $driver->find_element(@locator_args);
        1;
    };
    return $element if $succeeded;

    # Only a "not found" failure is downgraded to undef.
    my $not_found = $@ =~ /Unable to locate element|An element could not be located on the page using the given search parameters/;
    die($@) unless $not_found;

    return undef;
}
# Non-fatal variant of $web_driver->find_elements(...).
#
# Arguments:
#   $web_driver - object providing a find_elements() method.
#   @_ (rest)   - forwarded unchanged to find_elements().
#
# Returns:
#   On success, the matched nodes: a flat list in list context, or the
#   array reference produced by find_elements() in scalar context.
#   When the lookup fails with an "element not found" message: an empty
#   list in list context, undef in scalar context.
#
# Dies:
#   Re-throws any exception that is not an "element not found" message.
sub nf_find_elements {
    my $web_driver = shift;

    my $nodes;
    if (!eval {
        $nodes = $web_driver->find_elements(@_);
        return 1;    # No exception.
    }) {
        my $error = $@;
        die($error)
            if $error !~ /Unable to locate element|An element could not be located on the page using the given search parameters/;

        # Bare return: undef in scalar context but an EMPTY list in list
        # context. "return undef" handed list-context callers a bogus
        # one-element list containing undef.
        return;
    }

    return $nodes unless wantarray;

    # Guard the dereference in case the driver handed back nothing.
    return $nodes ? @$nodes : ();
}
# Look up a single child element without dying when it is simply absent.
#
# The first argument is the driver object; every remaining argument is
# forwarded to its find_child_element() method.
#
# Returns whatever find_child_element() returned, or undef when the
# lookup failed with one of the two "element not found" messages. Any
# other exception is re-thrown unchanged.
sub nf_find_child_element {
    my ($driver, @lookup_args) = @_;

    # Run the lookup under eval so a failure can be inspected instead of
    # propagating straight to the caller.
    my $child;
    my $succeeded = eval {
        $child = $driver->find_child_element(@lookup_args);
        1;
    };
    return $child if $succeeded;

    # Only a "not found" failure is downgraded to undef.
    my $not_found = $@ =~ /Unable to locate element|An element could not be located on the page using the given search parameters/;
    die($@) unless $not_found;

    return undef;
}
# Non-fatal variant of $web_driver->find_child_elements(...).
#
# Arguments:
#   $web_driver - object providing a find_child_elements() method.
#   @_ (rest)   - forwarded unchanged to find_child_elements().
#
# Returns:
#   On success, the matched nodes: a flat list in list context, or the
#   array reference produced by find_child_elements() in scalar context.
#   When the lookup fails with an "element not found" message: an empty
#   list in list context, undef in scalar context.
#
# Dies:
#   Re-throws any exception that is not an "element not found" message.
sub nf_find_child_elements {
    my $web_driver = shift;

    my $nodes;
    if (!eval {
        $nodes = $web_driver->find_child_elements(@_);
        return 1;    # No exception.
    }) {
        my $error = $@;
        die($error)
            if $error !~ /Unable to locate element|An element could not be located on the page using the given search parameters/;

        # Bare return: undef in scalar context but an EMPTY list in list
        # context. "return undef" handed list-context callers a bogus
        # one-element list containing undef.
        return;
    }

    return $nodes unless wantarray;

    # Guard the dereference in case the driver handed back nothing.
    return $nodes ? @$nodes : ();
}
# Count the severe JavaScript entries in the driver's 'browser' log.
# Returns that count, so the result is false when there are none.
# Warning: This clears the log.
sub has_js_failed {
    my ($web_driver) = @_;

    my $entries  = $web_driver->get_log('browser');
    my $failures = 0;

    for my $entry (@$entries) {
        # Entries may lack either field; compare quietly instead of warning.
        no warnings qw( uninitialized );
        next unless $entry->{level} eq 'SEVERE';
        next unless $entry->{source} eq 'javascript';
        ++$failures;
    }

    return $failures;
}
# Demo driver: start Chrome through chromedriver, open a page on the target
# origin, inject a script that fetch()es the wanted file and publishes the
# outcome as hidden <input> fields inside <form id="exit">, then poll for
# that form and print the decoded file to STDOUT.
#
# Dies when a severe JavaScript error is logged, when the form does not
# appear within $timeout_secs, when the form lacks the expected fields, or
# when the script reports anything other than "success".
{
    # Injected script. The payload travels back base64-encoded in the
    # "msg" field; "code" is either "success" or "error".
    my $js = <<'__EOS__';
var array_buffer_to_base64 = function(buf) {
let binary = '';
let bytes = new Uint8Array(buf);
for (let byte of bytes) {
binary += String.fromCharCode(byte);
}
return btoa(binary);
};
var set_response = function(code, msg) {
let code_node = document.createElement('input');
code_node.setAttribute('type', 'hidden');
code_node.setAttribute('name', 'code');
code_node.setAttribute('value', code);
let msg_node = document.createElement('input');
msg_node.setAttribute('type', 'hidden');
msg_node.setAttribute('name', 'msg');
msg_node.setAttribute('value', msg);
let form_node = document.createElement('form');
form_node.setAttribute('id', 'exit');
form_node.appendChild(code_node);
form_node.appendChild(msg_node);
document.body.appendChild(form_node);
};
var request = function(url) {
fetch(url)
.then(
response => {
if (!response.ok)
throw new Error("HTTP error: " + response.status);
return response.arrayBuffer();
}
)
.then(
buffer => set_response("success", array_buffer_to_base64(buffer)),
reason => set_response("error", reason),
);
};
request(...arguments);
__EOS__

    # Upper bound, in seconds, on how long to wait for the result form.
    # Without it the polling loop below would spin forever if the fetch
    # never settled.
    my $timeout_secs = 60;

    # The guard shuts the chromedriver binary down when this block is left,
    # including when it is left through die().
    my $web_driver;
    my $guard = scope_finalizer {
        if ($web_driver) {
            $web_driver->shutdown_binary();
            $web_driver = undef;
        }
    };

    $web_driver = Selenium::Chrome->new(
        binary => "$RealBin/chromedriver.exe",
    );

    # Open a page on the same site first, then run the fetch from there.
    $web_driver->get('https://www.nytimes.com/');
    $web_driver->execute_script($js, 'https://www.nytimes.com/vi-assets/static-assets/vendor-454814a0340940dc9b42.js');

    # Poll until the script has appended its result form.
    my $deadline = time() + $timeout_secs;
    my $exit_form_node;
    while (1) {
        if (has_js_failed($web_driver)) {
            die("JavaScript error detected.\n");
        }

        $exit_form_node = nf_find_element($web_driver, '/html/body/form[@id="exit"]')
            and last;

        die("Timed out after $timeout_secs seconds waiting for the download to finish.\n")
            if time() >= $deadline;

        sleep(0.250);
    }

    # nf_find_child_element() returns undef for a missing node, so check
    # before calling methods on the results.
    my $code_node = nf_find_child_element($web_driver, $exit_form_node, 'input[@name="code"]');
    my $msg_node  = nf_find_child_element($web_driver, $exit_form_node, 'input[@name="msg"]');
    if (!$code_node || !$msg_node) {
        die("Result form is missing its code or msg field.\n");
    }

    my $code = $code_node->get_value();
    my $msg  = $msg_node->get_value();
    if (!defined($code) || $code ne 'success') {
        $msg ||= "Unknown error";
        die("$msg\n");
    }

    # Emit the raw bytes untouched.
    my $doc = decode_base64($msg);
    binmode STDOUT;
    print $doc;
}
可能希望在輪詢循環中添加超時,以便在出現問題時不會永遠等待。
好的,這里有一些代碼來展示如何使用WWW::Mechanize::Chrome(WMC)執行此操作。 再次感謝@ikegami提供的想法和代碼,他展示了如何通過Selenium使用javascript完成此操作,下面的代碼正是以此為基礎。 這是一個巧妙的小解決方法。
下面的代碼稍微修改了他的js代碼示例以支持多個文件,並添加了一個會變為可見的元素,這樣$mech就可以檢測數據何時可以抓取並保存。
use MIME::Base64;
use WWW::Mechanize::Chrome;
# etc.
# Return the JavaScript helper library to be evaluated in the page.
#
# The script defines three globals:
#   array_buffer_to_base64(buf)     - base64-encode an ArrayBuffer.
#   set_response(code, msg, number) - append <form id="exit-N"> holding
#       hidden inputs "code-N" and "msg-N" plus a <span id="vis-N">.
#   request(url, number)            - fetch url and publish the outcome
#       through set_response under the given number.
#
# Takes no arguments. Returns the script source as one string.
#
# The rejection handler passes `number` on to set_response. Without it a
# failed download was published as "code-undefined"/"vis-undefined", which
# a caller waiting on "#vis-N" could never find.
sub js_here {
    return <<'JS';
var array_buffer_to_base64 = function(buf) {
let binary = '';
let bytes = new Uint8Array(buf);
for (let byte of bytes) {
binary += String.fromCharCode(byte);
}
return btoa(binary);
};
var set_response = function(code, msg, number) {
let code_node = document.createElement('input');
code_node.setAttribute('type', 'hidden');
code_node.setAttribute('id', 'code-' + number);
code_node.setAttribute('value', code);
let msg_node = document.createElement('input');
msg_node.setAttribute('type', 'hidden');
msg_node.setAttribute('id', 'msg-' + number);
msg_node.setAttribute('value', msg);
let vis_node = document.createElement('span');
vis_node.setAttribute('id', 'vis-' + number);
vis_node.setAttribute('value', '');
let form_node = document.createElement('form');
form_node.setAttribute('id', 'exit-' + number);
form_node.appendChild(code_node);
form_node.appendChild(msg_node);
form_node.appendChild(vis_node);
document.body.appendChild(form_node);
}
var request = function(url, number) {
fetch(url)
.then(
response => {
if (!response.ok)
throw new Error("HTTP error: " + response.status);
return response.arrayBuffer();
}
)
.then(
buffer => set_response("success", array_buffer_to_base64(buffer), number),
reason => set_response("error", reason, number),
);
};
JS
}
# Evaluate the JavaScript returned by js_here() in the page, so that later
# snippets can call the functions it defines.
{
    my $helper_source = js_here();
    $mech->eval_in_page($helper_source);
}
# Build the JavaScript statement that starts one download in the page.
#
# Arguments:
#   $url    - address of the file to fetch.
#   $number - caller-chosen identifier, handed to request() as its second
#             argument.
#
# Returns the source text of a request(...) call, ready to be evaluated in
# a page where request() has already been defined.
sub js_download {
    my ($url, $number) = @_;

    # JSON::PP ships with Perl; it is loaded here so the sub stays
    # self-contained.
    require JSON::PP;
    my $encoder = JSON::PP->new->allow_nonref;

    # Emit both arguments as JSON literals, which JavaScript accepts as
    # literals too. Interpolating the URL raw between single quotes let a
    # quote or backslash in it break the statement or inject script.
    my $js_url    = $encoder->encode($url);
    my $js_number = $encoder->encode($number);

    return "request($js_url, $js_number)";
}
請注意,這需要兩個參數。 文件的URL和標識文件的任意數字。
這是一個循環下載多個文件:
# Download every URL in @files through the request() helper in the page.
#
# For each file the page is asked to fetch it under a running number
# ($count); the script publishes the outcome in <form id="exit-N">. The
# loop waits for the "#vis-N" marker, reads "#code-N"/"#msg-N", and either
# saves the decoded payload or warns and moves on. A failure for one file
# never aborts the remaining downloads.

# Seconds to wait for one file's result before giving up on it.
# NOTE(review): relies on wait_until_visible() honouring "timeout" and
# croaking when it expires - confirm against the installed version.
my $wait_timeout = 60;

my $count = 1;
foreach my $file (@files) {
    $mech->clear_js_errors;

    # throw contents of file into a hidden field on the web page
    $mech->eval_in_page( js_download($file, $count) );

    # check for javascript errors; snapshot them once so the test and the
    # report are guaranteed to look at the same list
    my @js_errors = $mech->js_errors;
    if (@js_errors) {
        warn "A javascript error encountered while fetching $file. Skipping file.\n";
        foreach my $err (@js_errors) {
            my $msg = $err->{message} || '';
            warn "\t" . $msg . "\n";
        }
        next;
    }

    # wait for the result, but not forever
    my $appeared = eval {
        $mech->wait_until_visible(
            selector => "#vis-$count",
            timeout  => $wait_timeout,
        );
        1;
    };
    if (!$appeared) {
        my $why = $@ || "unknown error\n";
        warn "Gave up waiting for $file: $why";
        next;
    }

    # check for download errors: anything but an explicit "success"
    # (including a missing code) counts as a failure
    $mech->form_id( "exit-$count" );
    my $ret_code = $mech->value("#code-$count", one => 1);
    if ( !defined($ret_code) || $ret_code ne 'success' ) {
        my $reason = $mech->value("#msg-$count", one => 1);
        $reason = 'Unknown error' unless defined($reason) && length($reason);
        warn "Unable to download $file: \n";
        warn $reason . "\n";
        next;
    }

    # get the file's content and save it to the directory
    my $value = $mech->value("#msg-$count", one => 1);
    my $content = decode_base64($value);
    _save_file ($content, $file); # up to you how to implement
}
continue {
    # Runs after every iteration, including those left through "next", so
    # each file gets its own number.
    ++$count;
}
就是這樣。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.