簡體   English   中英

當我使用jsoup或htmlunit獲取頁面時,href字段丟失

[英]href field missing when I get the page using jsoup or htmlunit

我正在嘗試解析Google圖片搜索結果

我正在嘗試獲取元素的href屬性。我注意到以編程方式獲取頁面時,href字段丟失了(jsoup和htmlunit都會發生這種情況)。
比較通過java以編程方式獲得的頁面元素和實際瀏覽器加載的頁面元素,唯一的區別是,確實缺少了href字段 (其余部分相同)。

href屬性(IMAGE_LINK)如下: /imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=638&biw=1275&ved=0ahUKEwih9O2e88_OAhWMExoKHRLGAGQQMwg2KAMwAw&iact=mrc&uact=8

javascript引擎可能有問題嗎? 還是網站使用了某種算法反解析?

代碼段Java代碼:

// Headless HtmlUnit client that emulates Chrome and executes the page's JavaScript.
WebClient webClient = new WebClient(BrowserVersion.CHROME);
HtmlPage page1=null;

        try {
            // Get the first page
            page1 = webClient.getPage(URL);
            // Wait for background scripts only AFTER the page has been loaded:
            // calling this before getPage() returns immediately because no
            // JavaScript job has been scheduled yet. Timeout is in milliseconds.
            webClient.waitForBackgroundJavaScript(50000);
            // Dump the DOM as HtmlUnit sees it once the scripts had a chance to run.
            System.out.println(page1.asXml());
        } catch (FailingHttpStatusCodeException | IOException e) {
            // Error HTTP status, malformed URL (an IOException subtype) or a
            // network failure: report it; page1 keeps whatever value it had.
            e.printStackTrace();
        }

片段HTML代碼(真實瀏覽器):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);" href="IMAGE_LINK"> CONTENT... </a>

片段HTML代碼(以編程方式獲取頁面):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);"> CONTENT... </a>

謝謝。

對於每個搜索結果,都有一個<div class="rg_meta"> ,其中包含一個JSON對象,該對象還包含url。 使用類似於json-simple的JSON解析器來解析對象,以下代碼將打印圖像網址:

// Query to look up; blanks are turned into '+' when the search URL is assembled.
String searchTerm = "naruto shippuden";
String searchUrl = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1920&bih=955&q=" + searchTerm.replace(" ", "+") + "&gws_rd=cr";

try {
    // Fetch the result page with a desktop browser user agent and a Google referrer.
    Document doc = Jsoup
            .connect(searchUrl)
            .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
            .referrer("https://www.google.com/")
            .get();

    for (Element metaDiv : doc.select("div.rg_meta")) {
        // The text of each div.rg_meta is a JSON object; its "ou" entry is
        // read as the image url.
        JSONObject meta = (JSONObject) new JSONParser().parse(metaDiv.text());
        String imageUrl = (String) meta.get("ou");

        // Printing the url is only meant to demonstrate the approach.
        System.out.println("imageUrl: " + imageUrl);
    }
} catch (IOException | ParseException e) {
    // Either the download or the JSON parsing failed.
    e.printStackTrace();
}

輸出:

imageUrl: http://ib3.huluim.com/show_key_art/1603?size=1600x600&region=US
imageUrl: http://cdn.zonarutoppuden.com/ns/peliculas-naruto-shippuden.jpg
imageUrl: http://www.saiyanisland.com/news/wp-content/uploads2/2014/12/Naruto-Sasuke.jpg
...

更新

由於jsAction似乎不能很好地與htmlUnit配合使用,因此我建議使用phantomJs。只需為您的操作系統下載二進制文件並創建一個腳本文件。

創建一個page.js文件:

// PhantomJS script: opens a Google image search, keeps scrolling (and clicking
// the "#smb" button when nothing new shows up) until the last result tile stops
// changing, then writes the rendered DOM to output.html and exits.
// NOTE: PhantomJS executes ES5, so `var` and function expressions are intentional.
var page = require('webpage').create();
var fs = require('fs');

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

// NOTE(review): presumably zoomed out so that more tiles fit into one viewport - confirm.
page.zoomFactor = 0.1;

page.viewportSize = {
  width: 1920,
  height: 1080
};

var divCount="-1";        // data-ri of the last tile seen on the previous tick
var topPosition=0;        // current vertical scroll offset
var unchangedCounter=0;   // consecutive ticks without a new last tile

page.open('https://www.google.com/search?site=imghp&tbm=isch&source=hp&q=naruto+shippuden&gws_rd=cr', function(status) {
    console.log("Status: " + status);
    if(status !== "success") {
        // Exit explicitly: otherwise PhantomJS keeps running and a parent
        // process waiting for it would block forever.
        phantom.exit(1);
        return;
    }

    // Poll every 500 ms: read the last tile, scroll one step further down.
    window.setInterval(function() {

        var newDivCount = page.evaluate(function() {
            var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
            // No tiles (yet): report null instead of throwing on divs[-1].
            if(divs.length === 0) {
                return null;
            }
            return divs[divs.length-1].getAttribute("data-ri");
        });

        topPosition = topPosition + 1080;

        page.scrollPosition = {
            top: topPosition,
            left: 0
        };

        if(newDivCount===divCount){
            // Nothing new since the last tick: try the "#smb" button.
            page.evaluate(function() {
                var button = document.querySelector("#smb");
                // querySelector yields null (not undefined) when nothing matches.
                if(button !== null) {
                    button.click();
                    return true;
                }
                return false;
            });

            if(unchangedCounter===5){
                // Five unchanged ticks were already counted: treat the list as
                // complete and save the page.
                console.log(newDivCount);
                var path = 'output.html';
                fs.write(path, page.content, 'w');
                phantom.exit();
            }else{
                unchangedCounter=unchangedCounter+1;
            }
        }else{
            unchangedCounter=0;
        }
        divCount = newDivCount;

    }, 500);
});

現在,我們使用phantomJs執行腳本文件,並使用jsoup像以前一樣解析結果:

try {
    // Run the PhantomJS script (change the path to the phantomjs binary and your script file).
    // inheritIO() forwards the script's console output to this process, so the
    // child can never stall on an undrained output pipe.
    Process process = new ProcessBuilder("bin\\phantomjs", "page.js")
            .inheritIO()
            .start();
    int exitCode = process.waitFor();
    if (exitCode != 0) {
        // Do not go on to parse a missing or stale output file.
        throw new IOException("phantomjs exited with code " + exitCode);
    }

    // output.html is written by page.js
    Document doc = Jsoup.parse(new File("output.html"),"UTF-8");

    // Print the href of every result link and count them in the same pass.
    int resultCount = 0;
    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
        System.out.println(element.attr("href"));
        resultCount++;
    }
    System.out.println("Number of results: " + resultCount);
} catch (IOException | InterruptedException e) {
    e.printStackTrace();
}

輸出:

/imgres?imgurl=http%3A%2F%2Fib3.huluim.com%2Fshow_key_art%2F1603%3Fsize%3D1600x600%26region%3DUS&imgrefurl=http%3A%2F%2Fwww.hulu.com%2Fnaruto-shippuden&docid=OgW4j66rp7CKkM&tbnid=SElXvYDJj9cR6M%3A&w=1600&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwgzKAAwAA&iact=mrc&uact=8
/imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwg0KAEwAQ&iact=mrc&uact=8
...
Number of results: 463

更新:將url作為參數傳遞給腳本

腳本 page.js

// PhantomJS script, usage: phantomjs page.js <url> <searchParameter>
// Opens <url>, keeps scrolling (and clicking the "#smb" button when nothing new
// shows up) until the last result tile stops changing, then writes the rendered
// DOM to "<searchParameter>.html" and exits.
// NOTE: PhantomJS executes ES5, so `var` and function expressions are intentional.
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

var url = "";
var searchParameter = "";

// system.args[0] is the script name, followed by the two expected arguments.
if (system.args.length === 3) {
    url=system.args[1];
    searchParameter=system.args[2];
}

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

// NOTE(review): presumably zoomed out so that more tiles fit into one viewport - confirm.
page.zoomFactor = 0.1;

page.viewportSize = {
  width: 1920,
  height: 1080
};

var divCount="-1";        // data-ri of the last tile seen on the previous tick
var topPosition=0;        // current vertical scroll offset
var unchangedCounter=0;   // consecutive ticks without a new last tile

if(url==="" || searchParameter===""){
    // phantom.exit() does not stop the script at once, so the page is only
    // opened in the else branch.
    console.log("Usage: phantomjs page.js <url> <searchParameter>");
    phantom.exit(1);
}else{
    page.open(url, function(status) {
        console.log("Status: " + status);
        if(status !== "success") {
            phantom.exit(1);
            return;
        }

        // Poll every 500 ms: read the last tile, scroll one step further down.
        window.setInterval(function() {

            var newDivCount = page.evaluate(function() {
                var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                // No tiles (yet): report null instead of throwing on divs[-1].
                if(divs.length === 0) {
                    return null;
                }
                return divs[divs.length-1].getAttribute("data-ri");
            });

            topPosition = topPosition + 1080;

            page.scrollPosition = {
                top: topPosition,
                left: 0
            };

            if(newDivCount===divCount){
                // Nothing new since the last tick: try the "#smb" button.
                page.evaluate(function() {
                    var button = document.querySelector("#smb");
                    // querySelector yields null (not undefined) when nothing matches.
                    if(button !== null) {
                        button.click();
                        return true;
                    }
                    return false;
                });

                if(unchangedCounter===5){
                    // Five unchanged ticks were already counted: treat the list
                    // as complete and save the page.
                    var path = searchParameter+'.html';
                    fs.write(path, page.content, 'w');
                    phantom.exit();
                }else{
                    unchangedCounter=unchangedCounter+1;
                }
            }else{
                unchangedCounter=0;
            }
            divCount = newDivCount;

        }, 500);
    });
}

Java代碼

try {
    //change path to phantomjs binary and your script file
    String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";
    String scriptFile = "page.js";

    String searchTerm = "naruto+shippuden";
    String urlParameter = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&gws_rd=cr&q="+searchTerm;

    // Pass every argument separately instead of one concatenated command line,
    // which Runtime.exec(String) would split on whitespace. inheritIO()
    // forwards the script's console output, so the child can never stall on
    // an undrained output pipe.
    Process process = new ProcessBuilder(phantomJSPath, scriptFile, urlParameter, searchTerm)
            .inheritIO()
            .start();
    int exitCode = process.waitFor();
    if (exitCode != 0) {
        // Do not go on to parse a missing or stale result file.
        throw new IOException("phantomjs exited with code " + exitCode);
    }

    // page.js writes its result to "<searchTerm>.html"
    Document doc = Jsoup.parse(new File(searchTerm + ".html"),"UTF-8");

    // Print the href of every result link and count them in the same pass.
    int resultCount = 0;
    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
        System.out.println(element.attr("href"));
        resultCount++;
    }
    System.out.println("Number of results: " + resultCount);
} catch (IOException | InterruptedException e) {
    e.printStackTrace();
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM