简体   繁体   中英

Javascript memory leak in loop

I'm trying to write a parser for STEP-files in javascript, that will be used primarily in the browser, but also in Node, and for now I use Node to debug.

It's going quite well, and the parser runs fine for a while. But when I get to really large files with millions of lines (around 200 MB and more), it chokes and eventually crashes with a "JavaScript heap out of memory" error!

The files look something like this:

...
#10=ORGANIZATION('O0001','LKSoft','company');
#11=PRODUCT_DEFINITION_CONTEXT('part definition',#12,'manufacturing');
#12=APPLICATION_CONTEXT('mechanical design');
#13=APPLICATION_PROTOCOL_DEFINITION('','automotive_design',2003,#12);
#14=PRODUCT_DEFINITION('0',$,#15,#11);
#15=PRODUCT_DEFINITION_FORMATION('1',$,#16);
#16=PRODUCT('A0001','Test Part 1','',(#18));
#17=PRODUCT_RELATED_PRODUCT_CATEGORY('part',$,(#16));
#18=PRODUCT_CONTEXT('',#12,'');
...
#3197182=APPLIED_ORGANIZATION_ASSIGNMENT(#10,#20,(#16));
#3197183=ORGANIZATION_ROLE('id owner');

The files are a bit irregular, so I'm writing a fairly blunt parser that works character by character:

const fs = require('fs');

/**
 * Blunt parser for STEP/IFC data rows of the form
 * `#<id>=<ENTITY>(<arguments>);`.
 *
 * Constructing an instance parses `data` immediately. The result is kept on
 * `this.rows`: an object keyed by row id whose values look like
 * `{c: <entity name>, p: <array of lexed arguments>}`.
 */
class bigObject {
  /**
   * @param {string|Buffer} data Complete file content, or anything whose
   *   `toString()` yields it.
   * @throws {TypeError} When `data` cannot be converted to a string.
   */
  constructor(data) {
    this.rows = this.parse(data);
  }

  /**
   * Splits the argument list of one row into a flat array of values.
   *
   * Recognised tokens:
   *  - `#123`       -> the number 123 (reference to another row)
   *  - `$`          -> '' (empty property)
   *  - `'text'`     -> the text between the quotes, `''` escapes left as-is
   *  - `.AREAUNIT.` -> the identifier/enum text, unchanged
   *  - `-1000.00`   -> the numeric text, unchanged (kept as a string)
   * Every other character (commas, parentheses, `*`, ...) is skipped, so
   * nested lists are flattened.
   *
   * @param {string} row Argument text of a single row, without line breaks.
   * @returns {Array<string|number>} The lexed values in source order.
   */
  propertyLexer(row) {
    // Sticky (`y`) expressions match exactly at `lastIndex`, so tokens can be
    // read straight out of `row` without copying the remainder of the row.
    const refNrRE = /[-0-9]*/y;
    const charsRE = /[_a-zA-Z.]+/y;
    const floatNumberRE = /[.\-0-9E]+/y;
    const stringRE = /'((?:''|[^'])*)'/y;

    // Returns the text `re` matches at position `from`, or '' for no match.
    const scan = (re, from) => {
      re.lastIndex = from;
      const found = re.exec(row);

      return found === null ? '' : found[0];
    };

    const lexedRow = [];
    const rowLen = row.length;
    let current = 0;

    while (current < rowLen) {
      const char = row[current];

      if (char === '#') {
        // I.E. #32123 -- a lone '#' yields NaN.
        const digits = scan(refNrRE, current + 1);

        lexedRow.push(Number.parseInt(digits, 10));
        current += digits.length + 1;
      } else if (char === '$') {
        // Empty property
        lexedRow.push('');
        current++;
      } else if (char === ',') {
        // Skip to next property
        current++;
      } else if (char === "'") {
        // I.E. 'Comments, blabla (more comments)'
        stringRE.lastIndex = current;
        const quoted = stringRE.exec(row);

        if (quoted === null) {
          // Unterminated literal: keep what is left instead of crashing.
          lexedRow.push(row.slice(current + 1));
          current = rowLen;
        } else {
          lexedRow.push(quoted[1]);
          current += quoted[0].length;
        }
      } else {
        // Identifiers/enums (I.E. .AREAUNIT.) take precedence over numbers
        // (I.E. -1000.00); '' from both means "skip this character".
        const token = scan(charsRE, current) || scan(floatNumberRE, current);

        if (token === '') {
          current++;
        } else {
          lexedRow.push(token);
          current += token.length;
        }
      }
    }

    return lexedRow;
  }

  /**
   * Extracts every `#id=NAME(args);` row from `data` and lexes its arguments.
   *
   * Logs every 200000th row (starting with the first) as a progress marker.
   *
   * @param {string|Buffer} data Text to parse; non-strings are converted with
   *   `toString()`.
   * @returns {Object<string, {c: string, p: Array<string|number>}>} Parsed
   *   rows keyed by row id; empty when `data` contains no rows.
   * @throws {TypeError} When `data` cannot be converted to a string.
   */
  parse(data) {
    let text = data;

    if (typeof text !== 'string') {
      try {
        text = data.toString();
      } catch (e) {
        throw new TypeError(`Indata not string or not able to convert to string: ${e}`);
      }
    }

    // One global expression with three capture groups (id, entity name,
    // arguments). Walking it with exec() visits the rows one at a time instead
    // of first collecting every row into an array with String.prototype.match.
    // Entity names may contain underscores (I.E. PRODUCT_DEFINITION).
    const stepRowRE = /#(\d+)\s*=\s*([a-zA-Z0-9_]+)\s*\(([^)]*(?:\)(?!;)[^)]*)*)\);/g;
    const lineBreaksRE = /[\r\n]+/g;

    const rowsFromFile = {};
    let rowNumber = 0;
    let matching = stepRowRE.exec(text);

    while (matching !== null) {
      const parsedRow = {
        c: matching[2],
        p: this.propertyLexer(matching[3].replace(lineBreaksRE, ''))
      };

      rowsFromFile[matching[1]] = parsedRow;

      if (rowNumber % 200000 === 0) {
        console.log(`${rowNumber}::${JSON.stringify(parsedRow)}`);
      }

      rowNumber++;
      matching = stepRowRE.exec(text);
    }

    return rowsFromFile;
  }
}

//// Start here ////

// Entry point: read the whole file, then hand its content to the parser.
fs.readFile('./ifc-files/A-40-V-00252.ifc', (err, data) => {
  // On a failed read `data` is undefined; report the real I/O error here
  // instead of letting the parser fail on the missing input.
  if (err) {
    console.error(`Could not read STEP file: ${err.message}`);
    process.exitCode = 1;
    return;
  }

  const newObject = new bigObject(data);
});

I get this error:

<--- Last few GCs --->

[11348:000002D4A6E72260]    81407 ms: Mark-sweep 1403.2 (1458.8) ->
1403.2 (1458.8) MB, 2428.1 / 0.0 ms  allocation failure GC in old space requested [11348:000002D4A6E72260]    83836 ms: Mark-sweep
1403.2 (1458.8) -> 1403.2 (1428.8) MB, 2429.0 / 0.0 ms  last resort gc [11348:000002D4A6E72260]    86282 ms: Mark-sweep 1403.2 (1428.8) ->
1403.1 (1428.8) MB, 2446.3 / 0.0 ms  last resort gc


<--- JS stacktrace --->

==== JS stack trace =========================================

Security context: 00000384656C0D51 <JS Object>
    1: parse [C:\Users\user\Projects\parser\index.js:~95] [pc=000000525FB71B18](this=000001EE5F96DE19 <a bigObject with map 0000036221B1B7A9>,data=0000034357F04201 <Very long string[190322237]>)
    2: new bigObject [C:\Users\user\Projects\parser\index.js:8] [pc=000000525FB48737](this=000001EE5F96DE19 <a bigObject with map 0000036221B1B7...

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory

I've been trying to find the reason for this for days now but I can't see anything that looks like a memory leak or infinite loop.

My machine has 16 GB of memory and should easily be able to handle a 200 MB file, many times over!

Is there anybody who can help me with my problem? Thanks!

EDIT: Everything is working just fine if I use Firefox or even Edge(!), and also when I use --max_old_space_size=4096 flag to increase available memory for Chrome/Node (V8). But it's not likely that regular users will do this... So I still need to make it more memory efficient. But I have no clue how.

EDIT2: It's not the JSON.stringify call or the fact that I read the whole file at once that causes the problem. Reading the whole file will become a problem if I try to read an even larger file than I currently do, but for now the issue seems to be that I'm storing too much in memory.

Your application crashes before you get to anything complicated: the crash on line 95 occurs when you call data.toString().

Apparently Node.js doesn't like 200MB strings. This isn't particularly surprising; 200MB is a lot to ask of any String implementation.

Since your input file consists of newline-delimited records, I think the suggestion from mscdex is the right way to go: use readline, read the file line by line, and parse each line.

This code sample seems to do what you want.

The line-by-line approach has the added benefit that it won't block the event loop. Instead of doing one enormous task without any opportunity to interleave other events, you can easily structure your application to yield between each line event. readline might do this automatically for you, but maybe not.

Some related SO questions: this one , that one .

The technical post webpages of this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM