![](/img/trans.png)
[英]Node JS Webscraping with Cheerio Problem to fetch the html on some websites
[英]Cheerio WebScraping Node JS
我想抓取一个网页,获取其中所有的链接,以及“a”标签内包含的缩略图或图像(如果存在)。
我能够获得链接,但不确定如何在当前迭代的标签中获取 img > src 值。
const cheerio = require('cheerio')
const request = require('request')
const throttledRequest = require('throttled-request')(request)
// Throttle settings handed to throttled-request: 18 requests per 1000 ms.
throttledRequest.configure({
  requests: 18,
  milliseconds: 1000
})

// Shared accumulator; the scraper appends { link, thumbnail } entries to linksOut.
const o = { linksOut: [] }
/**
 * Records every anchor found in an HTML document together with the image it
 * wraps, if any.
 *
 * @param {{linksOut: Array<{link: (string|undefined), thumbnail: (string|undefined)}>}} o
 *   Accumulator; one entry per <a> element is appended to `o.linksOut`.
 * @param {string} body - Markup to scan. A falsy body is ignored.
 * @returns {void}
 */
const scrapeLinksOut = (o, body) => {
  if (body) {
    const $ = cheerio.load(body)
    // .each() instead of .map(): the callback runs purely for its side effect.
    $('a').each(function () {
      const anchor = $(this)
      const link = anchor.attr('href')
      // src of the first <img> nested anywhere inside this anchor;
      // stays undefined when the anchor wraps no image.
      const thumbnail = anchor.find('img').attr('src')
      o.linksOut.push({
        link: link,
        thumbnail: thumbnail
      })
    })
  } else {
    // something else
  }
}
/**
 * Requests the IBM US home page through `throttledRequest` and passes the
 * response body to `scrapeLinksOut`, which fills the shared accumulator `o`.
 *
 * @returns {Promise<Object>} Resolves with `o` after the body has been scanned;
 *   rejects with the error reported by the request or thrown while scanning.
 */
const scrape = () => {
  return new Promise((resolve, reject) => {
    throttledRequest({
      url: 'https://www.ibm.com/us-en',
      followAllRedirects: true,
      timeout: 30000
    }, (err, res, body) => {
      // Propagate request failures instead of resolving with an empty result.
      if (err) {
        return reject(err)
      }
      // A throw inside this callback would otherwise escape the promise and
      // leave it pending forever.
      try {
        scrapeLinksOut(o, body)
      } catch (scanErr) {
        return reject(scanErr)
      }
      return resolve(o)
    })
  })
}
// Kick off the scrape, print each collected entry, and log any rejection.
scrape()
  .then(({ linksOut }) => {
    for (const entry of linksOut) {
      console.log(entry)
    }
  })
  .catch((failure) => console.log(failure))
我写了一个快速的解决方案,希望能解决你的问题。 我在我的机器上测试了它,它对我有用。
// Answer fragment: the `...` lines stand for the surrounding code of the
// existing $('a') callback, which is elided here.
$('a').map(function () {
...
// Take the anchor's inner HTML and parse it again into a list of nodes.
let thumbnail = $(this).html();
thumbnail = cheerio.parseHTML(thumbnail);
// Only the first parsed node is inspected: it must exist and carry `src`.
if (thumbnail!== null && thumbnail.length > 0 && // Ensure the link has an image
thumbnail[0].attribs !== undefined &&
thumbnail[0].attribs.src !== undefined) {
thumbnail = thumbnail[0].attribs.src; // The image URL
}
// NOTE(review): when the check fails, `thumbnail` still holds the parseHTML
// result rather than a URL.
...
});
/**
 * Records every anchor found in an HTML document together with the image it
 * wraps, using `null` as the thumbnail when the anchor has none.
 *
 * The image is looked up anywhere inside the anchor, so images preceded by
 * text or nested in other elements are found as well.
 *
 * @param {{linksOut: Array<{link: (string|undefined), thumbnail: (string|null)}>}} o
 *   Accumulator; one entry per <a> element is appended to `o.linksOut`.
 * @param {string} body - Markup to scan. A falsy body leaves `o` untouched.
 * @returns {void}
 */
const scrapeLinksOut = (o, body) => {
  if (!body) {
    // Nothing to parse. The previous code pushed an undeclared `links`
    // variable here, which threw a ReferenceError.
    return
  }
  const $ = cheerio.load(body)
  $('a').each(function () {
    const anchor = $(this)
    // attr() reads from the first matched <img>; undefined means no usable src.
    const src = anchor.find('img').attr('src')
    o.linksOut.push({
      link: anchor.attr('href'),
      thumbnail: src !== undefined ? src : null
    })
  })
}
这会输出以下日志:
null
null
null
[object Object]
[object Object]
[object Object]
[object Object]
[object Object]
[object Object]
[object Object]
[object Object]
null
null
null
null
null
null
null
这似乎奏效了。 链接和图像匹配。
/**
 * Appends a { link, thumbnail } entry to `o.linksOut` for each anchor in
 * `body` that wraps an image. When an anchor holds several images, the src of
 * the last one is recorded; anchors whose last image has no src are skipped.
 *
 * @param {{linksOut: Array<{link: (string|undefined), thumbnail: string}>}} o
 *   Accumulator that receives the entries.
 * @param {string} body - Markup to scan. A falsy body is ignored.
 * @returns {void}
 */
const scrapeLinksOut = (o, body) => {
  if (!body) {
    // something else
    return;
  }

  const $ = cheerio.load(body);
  $('a').each((_, anchor) => {
    const node = $(anchor);
    const thumbnail = node.find('img').last().attr('src');
    if (thumbnail === undefined) {
      return;
    }
    o.linksOut.push({ link: node.attr('href'), thumbnail });
  });
};
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.