I am trying to extract some content from a website using Node.js with cheerio. I want to extract the following content: the title text, the description text, and the image source.
Here is the HTML:
<body>
<div class="detail_loop">
<img class="imfast" data-original="http://www.example.com/wp-content/uploads/2017/03/imageurl-250x150.jpg" title=""
align="left" width="250" height="150"
src="http://www.example.com/wp-content/uploads/2017/03/imageurl-250x150.jpg" style="display: block;">
<h2>
<a href="http://www.example.com/2017/04/576487/" rel="bookmark">This is my titile text</a>
</h2>
Here will be my description content.
<div class="clear"></div>
<div class="send_loop" style="display: none;">
<a href="http://www.example.com/2017/04/576487//#respond" target="_blank">
<div class="send_com">
<div class="send_bubb">
<div class="count">
0
</div>
</div>
</div>
</a>
<a href="https://www.facebook.com/sendr.php?u=http://www.example.com/2017/04/576487/" target="_blank">
<div class="send_fb">
<div class="send_bubb">
<div class="count">
send
</div>
</div>
</div>
</a>
<a href="https://twitter.com/send?url=http://www.example.com/2017/04/576487/&text=this is sample title;hashtags=example"
target="_blank">
<div class="send_tt">
<div class="send_bubb">
<div class="count">
Tweet
</div>
</div>
</div>
</a>
<div class="clear"></div>
</div>
<div class="clear"></div>
<div class="detail_loop_dvd"></div>
<div class="clear"></div>
</div>
</body>
Is something like this what you were aiming for? You could, of course, simply pass the markup directly as a string, à la: cheerio.load('<html><body>…</html>')
Note: .text() returns the combined text of all descendants (other <div> elements, etc.), hence the filter, which returns true only for text nodes. –[ #20832910 ]
const cheerio = require('cheerio');
const fs = require('fs');
/**
 * Reads 'index.html' (resolved relative to the current working directory),
 * parses it with cheerio and, for every <div class="detail_loop"> entry,
 * prints three lines: the title text, the description text and the image
 * source.
 */
fs.readFile('index.html', { encoding: 'utf-8' }, (err, data) => {
  if (err) {
    // Report failures on stderr so they are not mistaken for scraped output.
    console.error(err);
    return;
  }

  const $ = cheerio.load(data);

  // Process each entry on its own: document-wide selectors would concatenate
  // the titles/descriptions of every entry and could pick an unrelated <img>.
  $('div.detail_loop').each((index, element) => {
    const entry = $(element);

    const title = entry.find('h2 a').text().trim();

    // .text() on the entry would also include the text of child elements
    // (share counters, link labels, ...), so keep only its direct text nodes.
    // Those include the whitespace-only nodes between tags, hence the
    // whitespace collapse and trim.
    const description = entry
      .contents()
      .filter((i, node) => node.type === 'text')
      .text()
      .replace(/\s+/g, ' ')
      .trim();

    const imageSource = entry.find('img').first().attr('src');

    console.log(title); // Title text
    console.log(description); // Description content (without child nodes--only text)
    console.log(imageSource); // Image source
  });
});
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.