简体   繁体   English

如何按顺序发出page.open()请求?

[英]How can I make my page.open() requests sequentially?

I coded this, but it's still non-sequential. 我编写了这段代码,但是它仍然不是顺序执行的。 I hoped that, with these functions, it would wait until the current request has finished before a new one is started ... but that doesn't work. 我希望使用这些函数后,它会等到当前请求完成之后再发起新的请求……但这行不通。

Problem 1: The page.open() calls are not sequential as you can see here: 问题1: page.open()调用不是顺序的,如下所示:

6       protocol: https:     type: Content
7       protocol: https:     type: Content
8       protocol: https:     type: Content
9       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/
10       protocol: https:     type: Content
11       protocol: https:     type: Content
12       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/anrichte/
LINE: https://www.roller.de/einrichten/arbeitsstuhl/
LINE: https://www.roller.de/einrichten/arbeitstisch/
LINE: https://www.roller.de/einrichten/armlehnstuehle/
LINE: https://www.roller.de/einrichten/badezimmermoebel
LINE: https://www.roller.de/einrichten/bistrostuehle/
LINE: https://www.roller.de/einrichten/buecherregal/
13       protocol: https:     type: Content
14       protocol: https:     type: Content
15       protocol: https:     type: Content
16       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/buerocontainer/
LINE: https://www.roller.de/einrichten/bueroregale/
17       protocol: https:     type: Content
18       protocol: https:     type: Content

The LINE: should only be printed once per request, but it appears several times without the page.open result, resulting in an early stream.atEnd() = true. LINE:每个请求仅应打印一次,但在没有page.open结果的情况下会出现几次,从而导致stream.atEnd()= true。 That should be impossible if it's sequential. 如果是顺序的,那应该是不可能的。

Problem 2: The last line is not processed: when I have a .txt file with 100 links (1 per line), 99 are printed and one is not. 问题2:最后一行未被处理:当我有一个包含100个链接(每行1个)的.txt文件时,打印了99个,有一个没有打印。

Problem 3: it crashes when I give it a list with 1000 urls 问题3:当我给它提供一个包含1000个网址的列表时,它会崩溃

Problem 4: 10 links = 10 prints, 100 links = 98 prints and stream.atEnd() does appear several times, 500 links = 497-498 prints + stream.atEnd() problem, 1000 links = Crash 问题 4:10个链接= 10个打印,100个链接= 98个打印和stream.atEnd()确实出现了几次,500个链接= 497-498打印+ stream.atEnd()问题,1000个链接=崩溃

// Script bootstrap: announce start-up, load the PhantomJS modules, open the
// URL list for reading and initialise the running output counter.
console.log('Hello, world!');

// PhantomJS modules used by the functions below.
var fs = require('fs');
var webPage = require('webpage');

// One URL per line is read from this stream by nextPage().
var stream = fs.open('100sitemap.txt', 'r');

// Running number printed in front of every result line.
var i = 1;

/**
 * Reads the next URL from the shared `stream` and passes it to getRequest().
 *
 * When the stream is already exhausted, only the end-of-file messages are
 * printed; nothing is read and no request is started.
 */
function nextPage() {
    if (stream.atEnd()) {
        console.log("STREAM END: " + stream.atEnd());
        console.log("FILE ENDS HERE");
    }

    // Guard clause: nothing left to read, so there is no request to start.
    if (stream.atEnd()) {
        return;
    }

    var url = stream.readLine();
    console.log("LINE: " + url);
    getRequest(url);
}

/**
 * Loads one URL in a fresh PhantomJS page, watches the page's outgoing
 * resource requests for the tracking call, prints the result and then asks
 * nextPage() for the following URL.
 *
 * All bookkeeping is kept in closure-local variables so that every request
 * starts with a clean state, and nextPage() is invoked exactly once per
 * call even when onLoadFinished fires repeatedly for the same page.
 *
 * @param {string} line2 - URL to open.
 */
function getRequest(line2) {
    var page = webPage.create();
    // Per-request state. Previously `hasFound` was an undeclared global that
    // was never reset, so it threw before the first match and stayed true
    // for every page afterwards.
    var hasFound = false;
    // onLoadFinished may fire more than once for a single page; without this
    // guard each extra event would start another nextPage() chain in parallel.
    var isFinished = false;

    page.settings.loadImages = false;

    page.onResourceRequested = function(requestData, request) {
        // Only the tracking request is of interest; the dot is escaped so it
        // matches a literal "." only.
        if (!/example\.com\/ca/.test(requestData.url)) {
            return;
        }
        hasFound = true;

        // The JSON payload sits URL-encoded inside the request URL: cut out
        // the span from the first "{" to the first "}" and decode it.
        var targetString = decodeURI(JSON.stringify(requestData.url));
        var klammerauf = targetString.indexOf("{");
        var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
        var t = JSON.parse(decodeURIComponent(jsonobjekt));

        console.log(i + "       " + t['groups'] + "     " + t['campID']);
        i++;

        // The original `request.abort;` only referenced the method; it has to
        // be called to actually cancel the request.
        request.abort();
    };

    page.onLoadFinished = function(status) {
        if (isFinished) {
            return;
        }
        isFinished = true;

        if (!hasFound) {
            console.log(i + " :NOT FOUND: " + line2);
            i++;
        }
        page.close();
        nextPage();
    };

    // Start loading only after both handlers are attached.
    page.open(line2, function() {});
}

// Start processing with the first URL; every further nextPage() call is made
// from the onLoadFinished handler installed by getRequest().
nextPage();

Now it works with this code. iframes seem to trigger onLoadFinished() twice, so I check for that with hasonLoadFinished to prevent the function body from running more than once per page (using multiple page.open() calls at once is a really bad idea in PhantomJS). 现在,使用这段代码可以正常工作。iframe似乎会触发两次onLoadFinished(),因此我使用hasonLoadFinished进行检查,以防止每个页面多次执行该函数体(在PhantomJS中同时使用多个page.open()是一个非常糟糕的主意)。

Be aware that 2.0 will crash with too many links / too many urls (in my case 120-180) for unknown reasons (most times no error message, rare times a "QThread::start: Failed to create thread ()". 请注意,由于未知原因(大多数情况下没有错误消息,很少出现“ QThread :: start:无法创建线程()”),2.0链接过多/ URL过多(在我的情况下为120-180)会崩溃。

To prevent that, use version 1.9.8 instead of 2.0; this seems to be a bug in 2.0, and I filed a crash report with dumps on GitHub. 为防止这种情况,请使用1.9.8版本而不是2.0版本;这似乎是2.0中的一个错误,我已在GitHub上提交了附带转储文件的崩溃报告。

/edit: with 1.9.8 it crashed without an error message after 3836 links ............... PHANTOM. /edit:使用1.9.8时,在处理了3836个链接之后崩溃,没有任何错误消息…… PHANTOM。

// Script bootstrap: announce start-up, load the PhantomJS modules, open the
// link list for reading and set up the shared counters and per-link flags.
console.log('Hello, world!');

// PhantomJS modules used by the functions below.
var fs = require('fs');
var webPage = require('webpage');

// One link per line is read from this stream by nextPage().
var stream = fs.open('linklist.de.txt', 'r');

// Running number printed in front of every result line.
var i = 1;

// Per-link flags, indexed by the link string itself.
var hasFound = [];
var hasonLoadFinished = [];

/**
 * Loads one link in a fresh PhantomJS page, watches the page's outgoing
 * resource requests for the tracking call, prints the result and then asks
 * nextPage() for the following link.
 *
 * The "found" and "finished" flags are closure-local, so they need no
 * clean-up, cannot collide when the same link appears twice in the list and
 * do not accumulate over thousands of links.
 *
 * @param {string} link - URL to open.
 */
function handle_page(link) {
    var page = webPage.create();
    var found = false;
    // onLoadFinished may fire more than once per page (e.g. for iframes);
    // only the first event may close the page and advance to the next link.
    var finished = false;

    page.settings.loadImages = false;

    page.onResourceRequested = function(requestData, request) {
        // Non-matching requests are left untouched. The original
        // `else { request.abort; }` was a no-op; really aborting here would
        // also cancel the main document, so the branch is dropped on purpose.
        if (!/example\.com\/searchmeI'maString/.test(requestData.url)) {
            return;
        }
        found = true;

        // The JSON payload sits URL-encoded inside the request URL: cut out
        // the span from the first "{" to the first "}" and decode it.
        var targetString = decodeURI(JSON.stringify(requestData.url));
        var klammerauf = targetString.indexOf("{");
        var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
        var payload = decodeURIComponent(jsonobjekt);
        var t = JSON.parse(payload);

        // Print the JSON text; concatenating the parsed object itself only
        // yields "[object Object]".
        console.log(i + "   " + payload + "       " + t['id']);

        // `request.abort;` merely referenced the method; it must be called
        // to cancel the tracking request.
        request.abort();
    };

    page.onLoadFinished = function(status) {
        if (finished) {
            return;
        }
        finished = true;

        if (!found) {
            console.log(i + " :NOT FOUND: " + link);
            console.log("");
        }
        i++;
        page.close();
        nextPage();
    };

    // Start loading only after both handlers are attached.
    page.open(link, function() {});
}

/**
 * Reads the next link from the shared `stream` and passes it to
 * handle_page(). When no link is left, prints the elapsed time, closes the
 * stream and terminates PhantomJS.
 *
 * Blank lines inside the list are skipped instead of being mistaken for the
 * end of the file.
 */
function nextPage() {
    var link = stream.readLine();
    // An empty string can be either a blank line or the end of the file;
    // keep reading until a link shows up or the stream is really exhausted.
    while (!link && !stream.atEnd()) {
        link = stream.readLine();
    }

    if (!link) {
        var end = Date.now();
        console.log("");
        console.log(((end - start) / 1000) + " Sekunden");
        console.log("FILE ENDS HERE!!!");
        stream.close();
        phantom.exit(0);
        // Return explicitly so the code below is never run with an empty
        // link once the exit has been requested.
        return;
    }

    // Reset the per-link bookkeeping flags before handing the link off.
    hasFound[link] = false;
    hasonLoadFinished[link] = false;
    handle_page(link);
}

// Remember when processing began (nextPage() reports the elapsed seconds at
// the end of the list), then start with the first link. Declared with `var`
// instead of being created as an implicit global by a bare assignment.
var start = Date.now();
nextPage();

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM