[英]How can i rewrite these JavaScript promises to be less complicated?
[英]How can I rewrite this with promises?
我正在為T恤網站構建內容管理器。
目標是通過一個硬編碼的網址進入網站: http://shirts4mike.com
然后,我將找到每個T恤的所有產品頁面,然后使用它的詳細信息創建一個對象。 然后將其添加到數組中。
當數組中裝滿了T恤對象之後,我將遍歷整個數組並將其寫入CSV文件。
現在,我在請求/響應和函數調用的時間方面遇到了一些麻煩。
如何確保在正確的時間調用下一個函數? 我知道現在的代碼不起作用,是因為請求是異步執行的。
如何在正確的時間調用secondScrape、lastScraper和convertJson2Csv,以便它們使用的變量不是未定義的?
我嘗試過使用諸如response.end()之類的東西,但這不起作用。
我假設我需要使用promises來使這項工作正常進行? 並且清晰可辨?
有任何想法嗎? 我的代碼如下:
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
// Entry point of the question's scraper: fetch the front page, follow every
// link whose href contains "shirt", and classify each linked page as either a
// product page (stored in urlSet) or a leftover page (stored in remainder).
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
// first non-product page encountered; secondScrape() requests it later
var remainder;
// NOTE(review): declared but never assigned in this snippet, although
// lastScraper() calls tshirtArray.push(...) on it.
var tshirtArray;
// Load front page of shirts4mike
request(url, function(error, response, html) {
// only parse the page when the request succeeded with HTTP 200
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link by appending the href to the hardcoded site root
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200) {
// this $ shadows the front-page $ and wraps the linked page instead
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(scrapeLink);
} else if(remainder === undefined) {
//if not a product page, keep it in remainder so that another scrape can be performed.
//only the first such page is kept; later ones are ignored
remainder = scrapeLink;
}
}
});
});
}
//call second scrape for remainder
// NOTE(review): this call is in the outer callback, right after the .each()
// loop has only *started* the nested requests. Their callbacks (which fill
// urlSet and remainder) have presumably not run yet, so remainder is likely
// still undefined here - this is the timing problem the question asks about.
secondScrape();
});
// Second pass: request the page stored in `remainder`, follow each of its
// "shirt" links, and add every linked page that has a submit button to urlSet.
// Takes no parameters and returns nothing; it reads the outer `remainder` and
// `url`, writes to the outer `urlSet`, and then calls lastScraper().
function secondScrape() {
request(remainder, function(error, response, html) {
// only parse the page when the request succeeded with HTTP 200
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link by appending the href to the hardcoded site root
var scrapeLink = url + a;
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if($('[type=submit]').length !== 0){
urlSet.add(scrapeLink);
}
}
});
});
}
});
// NOTE(review): the two statements below are outside the request callback, so
// they execute as soon as the request has been issued - presumably before any
// of the callbacks above have added anything to urlSet.
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
lastScraper();
};
// Final pass: request every product page collected in urlSet, read its price,
// image and title, and push one object per shirt onto tshirtArray; then call
// convertJson2Csv(). Takes no parameters and returns nothing.
function lastScraper(){
//scrape set, product pages
// NOTE(review): urlSet is created with `new Set()`. A Set exposes `size`
// rather than `length` and cannot be indexed with [i], so `urlSet.length` is
// undefined and this loop body never runs as written.
for(var i = 0; i < urlSet.length; i++){
// NOTE(review): this `var url` shadows the outer site-root `url` inside the
// function and is shared by all iterations, so the callbacks below would all
// see the value from the last iteration.
var url = urlSet[i];
request(url, function(error, response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
// slice(4) drops the first four characters of the heading text
// (presumably a price prefix - TODO confirm against the page markup)
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
// timestamp of when the page was scraped, formatted by moment
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
});
}
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
// NOTE(review): called synchronously after the loop, i.e. without waiting for
// any request callback above; convertJson2Csv is not defined in the visible source.
convertJson2Csv();
};
有一個名為request-promise的npm模塊。
只是:
var rp = require("request-promise");
在任何發出請求的地方,您都可以把request換成request-promise。
例如:
// Issue the request through request-promise: the fulfilment handler receives
// the resolved value, and any rejection is logged to the console.
rp(url)
  .then((value) => {
    // do whatever
  })
  .catch((err) => {
    console.log(err);
  });
我試着用async模塊改寫了你的代碼
希望這對你有用
async.waterfall的格式如下:
// Shape of an async.waterfall call: the tasks run one after another, and each
// task passes its result(s) to the next one through `callback`.
async.waterfall([
  function(callback) {
    // First task: receives only `callback`. Pass `null` as the error and the
    // value the next task should receive.
    var firstValue = 'value for the next task';
    callback(null, firstValue);
  },
  function(previousvalue, callback) {
    // Later tasks receive the previous task's value(s) followed by `callback`.
    // Every task must call `callback` exactly once; otherwise the final
    // callback below never runs.
    callback(null, previousvalue);
  }
], function(err, result) { //Final callback
  // Runs once: with `err` set as soon as any task reports an error, or with
  // the last task's value in `result` after all tasks have finished.
});
var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
// first non-product page found on the front page; scraped in stage 2
var remainder;
var tshirtArray = [];

/**
 * Requests `pageUrl` and hands a cheerio handle for its HTML to `done`.
 * Failed requests and non-200 responses are logged and reported as `null`
 * (never as an error), so one bad page does not abort the whole scrape.
 *
 * @param {string} pageUrl - page to download
 * @param {function(?Error, ?Function)} done - receives the cheerio handle or null
 */
function loadPage(pageUrl, done) {
  request(pageUrl, function(error, response, html) {
    if (error || response.statusCode !== 200) {
      console.log('Skipping ' + pageUrl + ': ' + (error || 'status ' + response.statusCode));
      return done(null, null);
    }
    done(null, cheerio.load(html));
  });
}

/**
 * Returns the URL (site root + href) of every link on the page whose href
 * contains "shirt".
 *
 * @param {Function} $ - cheerio handle of the page to inspect
 * @returns {string[]} link URLs in document order
 */
function shirtLinks($) {
  return $("a[href*=shirt]").map(function() {
    return url + $(this).attr('href');
  }).get();
}

/**
 * Loads `pageUrl`, then loads every "shirt" link found on it and calls
 * `onPage(link, $page)` for each linked page that could be loaded.
 * `done` is called exactly once, after all linked pages have been handled.
 *
 * @param {string} pageUrl - page whose links are followed
 * @param {function(string, Function)} onPage - called per successfully loaded link
 * @param {function(?Error)} done - completion callback
 */
function visitShirtLinks(pageUrl, onPage, done) {
  loadPage(pageUrl, function(err, $) {
    if ($ === null) {
      return done(null);
    }
    async.each(shirtLinks($), function(scrapeLink, next) {
      loadPage(scrapeLink, function(err, $page) {
        if ($page !== null) {
          onPage(scrapeLink, $page);
        }
        next();
      });
    }, function(err) {
      done(err);
    });
  });
}

async.waterfall([
  function(callback) {
    // Stage 1: classify every "shirt" link on the front page.
    visitShirtLinks(url, function(scrapeLink, $page) {
      //if page has a submit it must be a product page
      if ($page('[type=submit]').length !== 0) {
        urlSet.add(scrapeLink);
      } else if (remainder === undefined) {
        //if not a product page, keep it so that another scrape can be performed.
        remainder = scrapeLink;
      }
    }, callback);
  },
  function(callback) {
    // Stage 2: collect the remaining product pages from the leftover page.
    // Nothing to do when stage 1 found no such page.
    if (remainder === undefined) {
      return callback(null);
    }
    visitShirtLinks(remainder, function(scrapeLink, $page) {
      if ($page('[type=submit]').length !== 0) {
        urlSet.add(scrapeLink);
      }
    }, callback);
  },
  function(callback) {
    // Stage 3: scrape the details of every product page in the set.
    console.log(urlSet);
    // A Set has no length/index access, so iterate over a copy as an array.
    async.each(Array.from(urlSet), function(productUrl, next) {
      loadPage(productUrl, function(err, $) {
        if ($ !== null) {
          //add the object into the array of tshirts
          tshirtArray.push({
            price: $('.price').text(),
            img: $('.shirt-picture').find("img").attr("src"),
            title: $('body').find(".shirt-details > h1").text().slice(4),
            url: productUrl,
            date: moment().format('MMMM Do YYYY, h:mm:ss a')
          });
        }
        next();
      });
    }, function(err) {
      callback(err);
    });
  }
], function(err) {
  if (err) {
    console.log(err);
    return;
  }
  //call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
  convertJson2Csv();
});
您可以使用此示例轉換其余代碼示例。
// Wrap the callback-style `request` in a Promise that fulfils with the page
// HTML and rejects on a transport error or a non-200 response.
const promise = new Promise((resolve, reject) => {
  request("http://shirts4mike.com/", (err, response, html) => {
    // On a transport error `response` is undefined, so check `err` first.
    if (err) {
      reject(err);
      return;
    }
    if (response.statusCode !== 200) {
      // `err` is null here; reject with a real Error instead.
      reject(new Error("Unexpected status code: " + response.statusCode));
      return;
    }
    resolve(html);
  });
});
promise
  .then(html => {
    var $ = cheerio.load(html);
    // continue
  })
  .catch(err => {
    // Without a handler a failed request would be an unhandled rejection.
    console.error(err);
  });
您正確地將承諾確定為解決時間問題的方法。
為了能夠使用promise,您需要對request進行promisify(或者改用方法本身就返回promise的HTTP庫)。
你可以用promise來解決時間問題,但你也可以借此機會改進整體設計。 與其編寫幾乎相同的第一/第二/第三階段的獨立函數,不如編寫一個遞歸調用自身的函數。 如果寫得正確,這將確保目標站點中的每個頁面最多被訪問一次; 出於整體性能和目標服務器負載的考慮,應避免重複訪問同一頁面。
//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');
// Promisify `request` to make `request.getAsync()` available.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'));
Promise.promisifyAll(request);
//hardcoded url
var url = 'http://shirts4mike.com/';
// every URL that has already been requested (prevents revisits)
var urlSet = new Set();
var tshirtArray = [];
var maxLevels = 3; // limit the recursion to this number of levels.

/**
 * Recursively scrapes `url_`: if the page is a product page (it contains a
 * submit button) its details are pushed onto `tshirtArray`; then every link on
 * the page whose href contains "shirt" is scraped in turn.
 *
 * @param {string} url_ - absolute URL of the page to scrape
 * @param {number} levelCounter - current recursion depth (0 for the start page)
 * @returns {Promise} always fulfils, once this page and all pages reached from
 *     it have been handled; scraping errors are logged, not propagated
 */
function scrapePage(url_, levelCounter) {
  // Bale out if :
  // a) the target url_ has been visited already,
  // b) maxLevels has been reached.
  if(urlSet.has(url_) || levelCounter >= maxLevels) {
    return Promise.resolve();
  }
  urlSet.add(url_);
  return request.getAsync(url_).then(function(result) {
    // A promise settles with a single value, so the handler never gets a
    // second `html` argument. Depending on how `request` was promisified the
    // value is the response itself or a [response, body] array.
    var response = Array.isArray(result) ? result[0] : result;
    var $, links;
    if(response.statusCode !== 200) {
      throw new Error('statusCode was not 200'); // will be caught below
    }
    $ = cheerio.load(response.body);
    if($('[type=submit]').length > 0) {
      // yay, it's a product page.
      tshirtArray.push({
        price: $('.price').text(),
        img: $('.shirt-picture').find("img").attr("src"),
        title: $('body').find(".shirt-details > h1").text().slice(4),
        url: url_,
        date: moment().format('MMMM Do YYYY, h:mm:ss a')
      });
    }
    // find any shirt links on page represented by $ and resolve each href
    // against the current page, since hrefs may be relative.
    // cheerio's map callback receives (index, element), not the element alone.
    links = $("a[href*=shirt]").map(function(index, anchor) {
      return new URL($(anchor).attr('href'), url_).href;
    }).get();
    // visit each link in turn, and scrape.
    return Promise.all(links.map(function(link) {
      return scrapePage(link, levelCounter + 1);
    }));
  }).catch(function(e) {
    // ensure "success" even if scraping threw an error.
    console.log(e);
    return null;
  });
}
scrapePage(url, 0).then(convertJson2Csv);
如您所見,遞歸解決方案:
- 避免了重複的代碼;
- 可以向下挖掘任意多層,層數由變量maxLevels決定。
注意:這仍然不是一個好的解決方案。 這里有一個隱含的假設,就像在原始代碼中一樣,所有襯衫頁面都只能從網站的主頁上通過“襯衫”鏈接到達。 如果襯衫需要通過例如“衣服”>“襯衫”才能到達,那么上面的代碼將找不到任何襯衫。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.