簡體   English   中英

從HTML頁面中的JavaScript抓取數據

[英]Scraping data from JavaScript in an HTML page

下面的代碼段是HTML頁面的一部分。 我需要抓取數據,但不確定什麼是最可靠的方法。 最好的方法是JSON,但是我不確定是否可以將以下內容轉換為JSON。 正則表達式是我唯一的選擇嗎?

<script type="text/javascript"> 

    // Start with an empty global list; it is reassigned with the array built below.
    window.arMailRuMessages = [];

    // The list is produced by an immediately-invoked function so that the helpers
    // (k, u, m) stay local. Note that the entries are JavaScript expressions
    // (function calls, `"3" | 0`, unquoted keys), not plain JSON.
    arMailRuMessages = (function() {
        // k is not referenced anywhere else in this snippet.
        var k = 1024,
            // u aliases a helper defined outside this snippet; presumably it
            // decodes HTML entities such as &amp; — TODO confirm.
            u = ajs.Html.unescape,
            // m returns u(decodeURIComponent(data.text)); if either call throws,
            // the error is ignored and '' is returned instead.
            m = function(data) {
                try {
                    return u(decodeURIComponent(data.text));
                } catch (e) {}
                return '';
            };

        // Two message records; each one's prev/next holds the other record's id
        // (or "" where there is no neighbour).
        return [

            {
                id: "14412430340000000392",
                prev: "",
                next: "14412428590000000596",
                subject: u("hi"),
                date: "1441243034",
                // Bitwise OR with 0 coerces the string "3" to the integer 3.
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("firstname@gmail.com"),
                        avatars: {
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("firstname6000@mail.ru"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // Only the "text" property is read by m; "ntype" is ignored by it.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisaford"
                }),
                priority: 3
            }, {
                id: "14412428590000000596",
                prev: "14412430340000000392",
                next: "",
                subject: u("hi"),
                date: "1441242859",
                // Bitwise OR with 0 coerces the string "3" to the integer 3.
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("firstname@gmail.com"),
                        avatars: {
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("firstname@mail.ru"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // Only the "text" property is read by m; "ntype" is ignored by it.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisatest"
                }),
                priority: 3
            }
        ];
    })();
    // Sets a marker property on __log, an object defined outside this snippet.
    __log.letters_data_js = 1;
</script>

使用HtmlUnit,您可以使用 htmlPage.executeJavaScript ,它將返回一個可供操作的Object。

下面是一個完整的示例:

    // Evaluate the page's global `arMailRuMessages` through HtmlUnit and print,
    // for every entry, its id followed by the name of its first "from" correspondent.
    try (final WebClient browser = new WebClient(BrowserVersion.CHROME)) {
        final String pageUrl = "http://localhost/test.html";
        final HtmlPage page = browser.getPage(pageUrl);

        // The script result is cast to a Rhino NativeArray, as the expression names an array.
        final Object scriptValue = page.executeJavaScript("arMailRuMessages").getJavaScriptResult();
        final NativeArray messages = (NativeArray) scriptValue;

        for (int index = 0; index < messages.getLength(); index++) {
            final NativeObject message = (NativeObject) messages.get(index);

            final String messageId = (String) message.get("id");
            System.out.println(messageId);

            // Walk correspondents -> from[0] -> name.
            final NativeObject correspondents = (NativeObject) message.get("correspondents");
            final NativeArray senders = (NativeArray) correspondents.get("from");
            final NativeObject firstSender = (NativeObject) senders.get(0);
            System.out.println(firstSender.get("name"));
        }
    }

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM