简体   繁体   中英

Webscraper in Node.js returns empty array with async and promise

I'm having trouble getting Node.js async/await and promises to work with a web scraper that uses a loop to visit several websites. After reading several posts and testing different solutions from Stack Overflow, I still can't get my async function to work properly. Thanks!

Code:

// Shared result object: scrape() adds one entry per parsed row title, and
// run() serializes it with JSON.stringify.
var data = {};

/**
 * Scrapes every configured link concurrently and returns the collected
 * results as a JSON string.
 *
 * The results are not taken from the value `scrape` resolves with; they are
 * read from the module-level `data` object after all `scrape` calls have
 * been awaited.
 *
 * @returns {Promise<string>} `JSON.stringify(data)` as it stands once
 *   `Promise.all` has settled.
 */
async function run() {
    console.log("Setup links..");
    const links = ['https://example1.com', 'https://example2.com'];

    // Start one scrape per link up front, then wait for all of them together.
    const pending = links.map(async (element) => {
        await scrape(element);
        const snapshot = JSON.stringify(data);
        console.log("After call in Promise: " + snapshot);
    });
    await Promise.all(pending);

    console.log("------------");
    const serialized = JSON.stringify(data);
    console.log(serialized);
    return serialized;
}

// Question's original scraper: requests `element` and, inside the request
// callback, walks every `tr td` cell and writes the parsed rows into the
// module-level `data` object.
//
// NOTE(review): nothing in this function awaits the callback or returns a
// Promise tied to it, so the Promise produced by `async` resolves as soon as
// `request(...)` has been started (assuming `request` invokes its callback
// asynchronously, which the logged output suggests). `await scrape(...)`
// therefore continues before "Scrape website..." is logged and before `data`
// has been filled in.
async function scrape(element) {
    request(element, function (error, response, html) {
        console.log("Scrape website...");
        // Errors and non-200 responses are skipped silently: nothing is
        // logged or reported back to the caller.
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);
            // Index of the current table row; row 0 is handled as the header.
            var rowCounter = 0;
            // Position of the current cell within its row.
            var columnCounter = 0;
            // Last four characters of each header cell (presumably the year -
            // TODO confirm). Collected but never read afterwards.
            var dates = [];
            // Value cells gathered for the row currently being read.
            var item = [];
            // Set when the current cell is one of the section headings below.
            var mainTitle = false;
            // Key under which the current row is stored in `data`.
            var title;

            $('tr td').each(function(i, elem) {
                var txt = $(elem).text().trim();
                if (rowCounter == 0) {
                    // Header row: skip the first cell, keep the rest as dates.
                    if (columnCounter != 0) {
                        dates.push(txt.substring(txt.length - 4, txt.length));
                    }
                } else {
                    if (txt == "Current Assets" || txt == "Current Liabilities" || txt == "Stockholders' Equity" || txt == "Revenue" || txt == "Operating Expenses" || txt == "Income from Continuing Operations" || txt == "Non-recurring Events" || txt == "Net Income") {
                        mainTitle = true;
                    } else {
                        if (columnCounter == 0) {
                            // First cell of a data row: its text with spaces
                            // removed becomes the key in `data`.
                            title = txt.split(' ').join('');
                            data[title] = {};
                        } else {
                            item.push(txt);
                        }
                    }
                }

                columnCounter++;

                // A section-heading cell does not count as a column: undo the
                // increment so the next cell starts a fresh row.
                if (mainTitle) {
                    columnCounter = 0;
                    mainTitle = false;
                }

                // Five cells (one title + four values) complete a row.
                if (columnCounter == 5) {
                    columnCounter = 0;
                    if (rowCounter != 0) {
                        data[title][0] = item[0];
                        data[title][1] = item[1];
                        data[title][2] = item[2];
                        data[title][3] = item[3];
                        item = [];
                    } 
                    rowCounter++;
                }
            });
        }
    });   
}

// CommonJS export so other modules can call run().
module.exports.run = run;

The code above in console:

Server started!
Route called
Setup links..
After call in Promise: {}
After call in Promise: {}
------------
{}
Scrape website...
Scrape website...

So it's a problem with the promise when using a loop.

I believe this is what you want (not tested, just hacked):

/**
 * Fetches one page and copies its table rows into the shared `data` object.
 *
 * `request` is callback-based, so it is adapted to a Promise here: the
 * Promise settles only after the response has been parsed, which lets a
 * caller `await scrape(url)` and then read `data`.
 *
 * @param {string} element - URL of the page to scrape (passed to `request`).
 * @returns {Promise<void>} Resolves with no value once every `tr td` cell has
 *   been processed. Rejects with the error reported by `request`, with an
 *   `Error` when the status code is not 200, or with whatever is thrown while
 *   parsing the page.
 */
function scrape(element) {
    // Single-cell rows that only introduce a section; they carry no figures.
    const sectionHeadings = new Set([
        "Current Assets",
        "Current Liabilities",
        "Stockholders' Equity",
        "Revenue",
        "Operating Expenses",
        "Income from Continuing Operations",
        "Non-recurring Events",
        "Net Income",
    ]);
    // A data row is one title cell followed by four value cells.
    const cellsPerRow = 5;
    const valuesPerRow = cellsPerRow - 1;

    return new Promise((resolve, reject) => {
        request(element, function (error, response, html) {
            if (error) {
                return reject(error);
            }
            if (response.statusCode !== 200) {
                return reject(new Error("Got HTTP code: " + response.statusCode));
            }

            // An exception thrown from inside this callback would not reach
            // the Promise by itself (it would leave the Promise pending), so
            // parsing failures are turned into rejections explicitly.
            try {
                console.log("Scrape website...");
                const $ = cheerio.load(html);
                let rowCounter = 0;      // row 0 is handled as the header row
                let columnCounter = 0;   // position of the cell within its row
                const dates = [];        // header values; collected, not stored in `data`
                let item = [];           // value cells of the row being read
                let mainTitle = false;   // current cell is a section heading
                let title;               // key of the current row in `data`

                $('tr td').each((_index, elem) => {
                    const txt = $(elem).text().trim();

                    if (rowCounter === 0) {
                        // Header row: skip the first cell, keep the last four
                        // characters of every other cell.
                        if (columnCounter !== 0) {
                            dates.push(txt.substring(txt.length - 4, txt.length));
                        }
                    } else if (sectionHeadings.has(txt)) {
                        mainTitle = true;
                    } else if (columnCounter === 0) {
                        // First cell of a data row names the entry.
                        title = txt.split(' ').join('');
                        data[title] = {};
                    } else {
                        item.push(txt);
                    }

                    columnCounter++;

                    // A section heading does not count as a column.
                    if (mainTitle) {
                        columnCounter = 0;
                        mainTitle = false;
                    }

                    if (columnCounter === cellsPerRow) {
                        columnCounter = 0;
                        if (rowCounter !== 0) {
                            for (let k = 0; k < valuesPerRow; k++) {
                                data[title][k] = item[k];
                            }
                            item = [];
                        }
                        rowCounter++;
                    }
                });

                resolve();
            } catch (parseError) {
                reject(parseError);
            }
        });
    });
}

I wrapped the code in a Promise, called resolve once parsing has finished, and reported errors with reject, but you know best how those errors should be handled.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM