简体   繁体   中英

Node.js fs cheerio read and write multiple files

I have the following code adapted from here that I am using with Node.js and Cheerio to read html files and split large source files into small chunks. The code is working well for a single file.

Now I need to read multiple large html files, split them one after the other, and output the resulting files in a folder. How can I read every file in a folder, split it, and write out the resulting chunks?

Here is the code:

var cheerio = require('cheerio'),
    fs = require('fs');

// Load the single source file as UTF-8 text; dataLoaded is called with (err, data).
fs.readFile('./sourceHtml2/testone.html', { encoding: 'utf8' }, dataLoaded);

/**
 * Callback for fs.readFile: splits the loaded HTML into one file per section.
 *
 * Every direct <div> child of #toplevel is serialized (outer HTML) and written
 * to ./output2/<div id>.html.
 *
 * @param {?Error} err - Read error reported by fs.readFile, if any.
 * @param {string} data - Contents of the source HTML file.
 */
function dataLoaded(err, data) {

  // Nothing to split when the source file could not be read.
  if (err) {
    console.error('Could not read source html: ' + err.message);
    return;
  }

  // Keep the Cheerio handle local; assigning a bare `$` creates an implicit
  // global (and throws in strict mode).
  var $ = cheerio.load(data);

  $('#toplevel > div').each(function (i, elem) {

    var id = $(elem).attr('id'),
        filename = id + '.html',
        content = $.html(elem);

    fs.writeFile('./output2/' + filename, content, function (writeErr) {

        // Only report success when the write actually succeeded.
        if (writeErr) {
          console.error('Failed to write ' + filename + ': ' + writeErr.message);
          return;
        }
        console.log('Written html to ' + filename);
    });
  });
}

Here is my sample source file

<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Lorem Ipsum</title>
  </head>
  <body>
    <div id="toplevel">
      <div id="1-1">
        <h1>HTML Ipsum Presents One</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong> senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.</p>

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
      <div id="1-2">
        <h1>HTML Ipsum Presents Two</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong> senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.</p>

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <blockquote>
          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
          at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
        </blockquote>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
      <div id="1-3">
        <h1>HTML Ipsum Presents Three</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong> senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.</p>

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <blockquote>
          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
          at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
        </blockquote>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
    </div>
  </body>
</html>

Your help will be greatly appreciated.

You need to list the files in the input directory and process each entry of the resulting array. You'll also want to prevent filename collisions in the output folder, because different source files can reuse the same div ids.

The code below solves both issues. HTML files (.htm and .html) are read from the 'input' subfolder and the generated files are written to the 'output' subfolder.

var cheerio = require('cheerio'),
    fs = require('fs');

// Enumerate the 'input' folder; findHtmlFiles receives (err, files) and
// picks out the HTML files to split.
fs.readdir('./input', { encoding: 'utf8' }, findHtmlFiles);

/**
 * Callback for fs.readdir: reads every '.htm' / '.html' file found in the
 * 'input' folder and hands its contents to dataLoaded.
 *
 * dataLoaded is called as dataLoaded(null, { file, data }), where `file` is
 * the source file name without its extension (used to keep output names
 * unique) and `data` is the file contents.
 *
 * @param {?Error} err - Directory listing error reported by fs.readdir, if any.
 * @param {string[]} files - Names of the entries in the 'input' folder.
 * @throws {Error} Rethrows directory-listing and file-read errors.
 */
function findHtmlFiles(err, files) {

    // Surface a failed listing instead of crashing on `files.length`.
    if (err)
        throw err;

    // Hoisted: the same extension pattern applies to every entry.
    var pattern = /\.[0-9a-z]{1,5}$/i;

    files.forEach(function (fullFilename) {
        var match = fullFilename.match(pattern);

        // Names without an extension (e.g. 'README') produce no match.
        if (!match)
            return;

        var ext = match[0],
            lowerExt = ext.toLowerCase();

        // only process '.htm' and '.html' files (any letter case)
        if (lowerExt !== '.htm' && lowerExt !== '.html')
            return;

        fs.readFile('./input/' + fullFilename, 'utf8', function (readErr, data) {
            if (readErr)
                throw readErr;

            // add the file name to prevent collisions
            // in the output folder
            dataLoaded(null, {
                file: fullFilename.slice(0, -ext.length),
                data: data
            });
        });
    });
}

/**
 * Splits one loaded source file into a separate output file per section.
 *
 * Every direct <div> child of #toplevel is serialized (outer HTML) and written
 * to ./output/<source name>_<div id>.html; the source-name prefix keeps files
 * generated from different sources from overwriting each other.
 *
 * @param {?Error} err - Error from loading the source file, if any.
 * @param {{file: string, data: string}} fd - Source file name without its
 *     extension, and the file's HTML contents.
 */
function dataLoaded(err, fd) {

    // Nothing to split when the source file could not be loaded.
    if (err) {
        console.error('Could not load source html: ' + err.message);
        return;
    }

    // Keep the Cheerio handle local; assigning a bare `$` creates an implicit
    // global (and throws in strict mode).
    var $ = cheerio.load(fd.data);

    $('#toplevel > div').each(function (i, elem) {

        var id = $(elem).attr('id'),
            filename = fd.file + '_' + id + '.html',
            content = $.html(elem);

        fs.writeFile('./output/' + filename, content, function (writeErr) {

            // Only report success when the write actually succeeded.
            if (writeErr) {
                console.error('Failed to write ' + filename + ': ' + writeErr.message);
                return;
            }
            console.log('Written html to ' + filename);
        });
    });
}

Sample console output (the order of the lines may vary, because the files are read and written asynchronously):

Written html to testone_1-1.html
Written html to testone_1-2.html
Written html to testone_1-3.html
Written html to testtwo_1-1.html
Written html to testtwo_1-2.html
Written html to testtwo_1-3.html

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM