简体   繁体   中英

Scraping data from JavaScript in an HTML page

The snippet below is part of an HTML page. I need to scrape the data, but I'm not sure what the most reliable way would be. Ideally I would parse it as JSON, but I'm not sure whether the following can be converted to JSON. Is a regular expression my only choice?

<script type="text/javascript"> 

    // Inline page script that publishes the message list as the global
    // `arMailRuMessages`. The global is first set to an empty array and then
    // replaced by the array returned from the immediately-invoked function below.
    //
    // Note: the returned value is a JavaScript array literal, not JSON: keys are
    // unquoted and several values are function calls (u(...), m(...)) or
    // expressions ("3" | 0), so the script has to be evaluated to obtain the data.
    window.arMailRuMessages = [];

    arMailRuMessages = (function() {
        // k is declared but never referenced anywhere in this snippet.
        var k = 1024,
            // Alias for a helper on the page-level `ajs` object, which is defined
            // outside this snippet. Presumably decodes HTML entities such as
            // "&amp;" — TODO confirm against the ajs source.
            u = ajs.Html.unescape,
            // Builds a snippet string: percent-decodes data.text, then passes the
            // result through u. Any exception (e.g. decodeURIComponent throwing
            // on a malformed escape sequence) is swallowed and '' is returned.
            m = function(data) {
                try {
                    return u(decodeURIComponent(data.text));
                } catch (e) {}
                return '';
            };

        // One object per message.
        return [

            {
                id: "14412430340000000392",
                // prev/next hold the ids of neighbouring entries; "" means none.
                // Here `next` matches the id of the second element below.
                prev: "",
                next: "14412428590000000596",
                subject: u("hi"),
                // NOTE(review): looks like a Unix timestamp in seconds, kept as a string — confirm.
                date: "1441243034",
                // Bitwise OR with 0 coerces the string to a 32-bit integer (3).
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("firstname@gmail.com"),
                        avatars: {
                            // Protocol-relative URL; the literal contains "&amp;" entities
                            // and percent-escapes before being passed through u.
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("firstname6000@mail.ru"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // m only reads the "text" property; "ntype" is ignored by m.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisaford"
                }),
                priority: 3
            }, {
                id: "14412428590000000596",
                // `prev` matches the id of the first element above.
                prev: "14412430340000000392",
                next: "",
                subject: u("hi"),
                date: "1441242859",
                // Bitwise OR with 0 coerces the string to a 32-bit integer (3).
                size: "3" | 0,
                folder: "0",
                correspondents: {
                    from: [{
                        name: u("firstname lastname"),
                        email: u("firstname@gmail.com"),
                        avatars: {
                            "default": u("\/\/filin.mail.ru\/pic?email=firstname%40gmail.com&amp;trust=true&amp;user=firstname%40mail.ru&amp;sign=CA0D4E8E74E806A459EA9C793CE8BC665EB2D049")
                        }
                    }],
                    to: [{
                        name: u(""),
                        email: u("firstname@mail.ru"),
                        avatars: {
                            "default": u("")
                        }
                    }],
                    cc: []
                },
                flags: {
                    spf: true,
                    unread: true,
                    flagged: false,
                    reply: false,
                    forward: false,
                    attach: false
                },
                // m only reads the "text" property; "ntype" is ignored by m.
                snippet: m({
                    "ntype": "letter",
                    "text": "thisisatest"
                }),
                priority: 3
            }
        ];
    })();
    // `__log` is defined outside this snippet; this sets its letters_data_js
    // property to 1 after the array has been assigned.
    __log.letters_data_js = 1;
</script>

With HtmlUnit, you can use `htmlPage.executeJavaScript`, which returns a result whose JavaScript value is an object you can manipulate.

Below is a complete example:

    // Fetch the page with HtmlUnit, evaluate the global `arMailRuMessages` in the
    // page's JavaScript context, and print each message's id followed by the
    // name of its first "from" correspondent.
    try (final WebClient browser = new WebClient(BrowserVersion.CHROME)) {
        final HtmlPage page = browser.getPage("http://localhost/test.html");

        // The script result is the JavaScript array itself, exposed as a NativeArray.
        final Object scriptValue = page.executeJavaScript("arMailRuMessages").getJavaScriptResult();
        final NativeArray messages = (NativeArray) scriptValue;

        int index = 0;
        while (index < messages.getLength()) {
            final NativeObject message = (NativeObject) messages.get(index);

            final String messageId = (String) message.get("id");
            System.out.println(messageId);

            // Drill down: message.correspondents.from[0].name
            final NativeObject people = (NativeObject) message.get("correspondents");
            final NativeArray senders = (NativeArray) people.get("from");
            final NativeObject firstSender = (NativeObject) senders.get(0);
            final Object senderName = firstSender.get("name");
            System.out.println(senderName);

            index++;
        }
    }

The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM