Split on new line, unless inside double quotes

Question

Update

Just to clarify, I'm specifically looking for a RegEx that will:

Split on newline(s)...unless the newline(s) are inside double quotes.

If the newline(s) are inside double quotes, it will:

Ignore the newlines inside the double quotes
Not include the outer double quotes inside the result
Convert any double-double quotes ( "" ) to single quotes inside the outer double quotes

I have a grid of data that looks like this.

When copied and pasted, this is the resulting text:

Data        
Data    Data    
Data    Data    Data
Data    Data    Data"
Data    Data    "Da
ta"
Data    Data    "Da
ta"""
Data    Data    Data""
Data    Data    """Da
ta"""
Data    Data    """Da

ta"""

The resulting text gets a little wonky because a newline inside of a cell causes some weird behavior:

The contents of the cell get surrounded by double quotes
Any existing double quote inside that cell are converted to double-double quotes ( "" ).

I want to be able to paste that text into a textarea and then recreate the original grid in a table in HTML even with the wonky behavior mentioned.

I have found and slightly modified this code, which I think is close, but I don't think the RegEx is quite right so I've also added the RegEx in this answer as an option (which I've commented out since it causes an "out of memory" exception:

function splitOnNewlineExceptInDoubleQuotes(string) {
    //The parenthesis in the regex creates a captured group within the quotes
    var myRegexp = /[^\n"]+|"([^"]*)"/gim;
    //var myRegexp = /(\n)(?=(?:[^\"]|\"[^\"]*\")*$)/m;
    var myString = string
    var myArray = [];

    do {
        //Each call to exec returns the next regex match as an array
        var match = myRegexp.exec(myString);
        if (match != null)
        {
            //Index 1 in the array is the captured group if it exists
            //Index 0 is the matched text, which we use if no captured group exists
            myArray.push(match[1] ? match[1] : match[0]);
        }
    } while (match != null);

    return myArray
}

So, I think this is possible with a regular expression (as opposed to a full blown state machine) but I'm not quite sure how to do it.

Answer 1

Parsing all the data

Here is a regular expression that will match each component of your source, one by one into numbered capture groups:

Tab separator
End of Row / New line
Quoted data
Unquoted data

This will work on a single row of data or all rows at once. Also handles CLRF ( \\r\\n ) and RF ( \\n ) line endings.

Expression

/(?:(\t)|(\r?\n)|"((?:[^"]+|"")*)"|([^\t\r\n]+))/

Visualisation

Example Usage

Here we use the captured group to indicate what to do.

This outputs an array of the rows in the console.

 var str = 'Data ' + "\\r\\n" + 'Data Data ' + "\\r\\n" + 'Data Data Data' + "\\r\\n" + 'Data Data Data"' + "\\r\\n" + 'Data Data "Da' + "\\r\\n" + 'ta"' + "\\r\\n" + 'Data Data "Da' + "\\r\\n" + 'ta"""' + "\\r\\n" + 'Data Data Data""' + "\\r\\n" + 'Data Data """Da' + "\\r\\n" + 'ta"""' + "\\r\\n" + 'Data Data """Da' + "\\r\\n" + '' + "\\r\\n" + 'ta"""'; var myregexp = /(?:(\\t)|(\\r?\\n)|"((?:[^"]+|"")*)"|([^\\t\\r\\n]+))/ig; var match = myregexp.exec(str); var emptyRow = []; var row = emptyRow.slice(); var rows = []; var prevTab = false; while (match != null) { if (match[4]) { // Unquoted data row.push(match[4]); prevTab = false; } else if (match[3]) { // Quoted data (replace escaped double quotes with single) row.push(match[3].replace(/""/g, "'")); prevTab = false; } else if (match[1]) { // Tab seperator if (prevTab) { // Two tabs means empty data row.push(''); } prevTab = true; } else if (match[2]) { // End of the row if (prevTab) { // Previously had a tab, so include the empty data row.push(''); } prevTab = false; rows.push(row); // Here we are ensuring the new empty row doesn't reference the old one. row = emptyRow.slice(); } match = myregexp.exec(str); } // Handles missing new line at end of string if (row.length) { if (prevTab) { // Previously had a tab, so include the empty data row.push(''); } rows.push(row); } console.log('rows', rows);

Commented Regular Expression

// (?:(\t)|(\r?\n)|"((?:[^"]+|"")*)"|([^\t\r\n]+))
// 
// Options: Case insensitive; ^$ don’t match at line breaks
// 
// Match the regular expression below «(?:(\t)|(\r?\n)|"((?:[^"]+|"")*)"|([^\t\r\n]+))»
//    Match this alternative (attempting the next alternative only if this one fails) «(\t)»
//       Match the regex below and capture its match into backreference number 1 «(\t)»
//          Match the tab character «\t»
//    Or match this alternative (attempting the next alternative only if this one fails) «(\r?\n)»
//       Match the regex below and capture its match into backreference number 2 «(\r?\n)»
//          Match the carriage return character «\r?»
//             Between zero and one times, as many times as possible, giving back as needed (greedy) «?»
//          Match the line feed character «\n»
//    Or match this alternative (attempting the next alternative only if this one fails) «"((?:[^"]+|"")*)"»
//       Match the character “"” literally «"»
//       Match the regex below and capture its match into backreference number 3 «((?:[^"]+|"")*)»
//          Match the regular expression below «(?:[^"]+|"")*»
//             Between zero and unlimited times, as many times as possible, giving back as needed (greedy) «*»
//             Match this alternative (attempting the next alternative only if this one fails) «[^"]+»
//                Match any character that is NOT a “"” «[^"]+»
//                   Between one and unlimited times, as many times as possible, giving back as needed (greedy) «+»
//             Or match this alternative (the entire group fails if this one fails to match) «""»
//                Match the character string “""” literally «""»
//       Match the character “"” literally «"»
//    Or match this alternative (the entire group fails if this one fails to match) «([^\t\r\n]+)»
//       Match the regex below and capture its match into backreference number 4 «([^\t\r\n]+)»
//          Match any single character NOT present in the list below «[^\t\r\n]+»
//             Between one and unlimited times, as many times as possible, giving back as needed (greedy) «+»
//             The tab character «\t»
//             The carriage return character «\r»
//             The line feed character «\n»

Answer 2

While it might hypothetically be possible to define your parser using an extremely complex regular expression (and I'm not even convinced it is possible with this grammar), it will be a lot easier and more maintainable to use a parser generator to define your grammar in a more readable format.

Using PEG.js , you can define the simple grammar below for the plaintext Excel table format:

Table
  = row: Row '\n' table: Table { return [row, ...table] }
  / row: Row { return [row] }

Row
  = cell: Cell '\t' row: Row { return [cell, ...row] }
  / cell: Cell { return [cell] }

Cell
  = '"' value: Value '"' { return value }
  / $ [^\t\n]*

Value
  = escaped: $ Escaped
  { return escaped.replace(/""/g, '"') }

Escaped
  = multiline: $ ([^"\t]+ / '""')+
  & { return multiline.includes('\n') }

Here's a demo with your input:

 window.excelTableParser=function(){"use strict";function n(r,t,e,u){this.message=r,this.r=t,this.t=e,this.e=u,this.name="SyntaxError","function"==typeof Error.captureStackTrace&&Error.captureStackTrace(this,n)}return function(n,r){function t(){this.constructor=n}t.prototype=r.prototype,n.prototype=new t}(n,Error),nu=function(n,r){var t={o:function(n){return'"'+u(ni)+'"'},f:function(n){var r,t="";for(r=0;r<nclength;r++)t+=nc[r]instanceof Array?o(nc[r][0])+"-"+o(nc[r][1]):o(nc[r]);return"["+(ns?"^":"")+t+"]"},a:function(n){return"any character"},l:function(n){return"end of input"},x:function(n){return n.description}};function e(n){return n.charCodeAt(0).toString(16).toUpperCase()}function u(n){return n.replace(/\\\\/g,"\\\\\\\\").replace(/"/g,'\\\\"').replace(/\\0/g,"\\\\0").replace(/\\t/g,"\\\\t").replace(/\\n/g,"\\\\n").replace(/\\r/g,"\\\\r").replace(/[\\x00-\\x0F]/g,function(n){return"\\\\x0"+e(n)}).replace(/[\\x10-\\x1F\\x7F-\\x9F]/g,function(n){return"\\\\x"+e(n)})}function o(n){return n.replace(/\\\\/g,"\\\\\\\\").replace(/\\]/g,"\\\\]").replace(/\\^/g,"\\\\^").replace(/-/g,"\\\\-").replace(/\\0/g,"\\\\0").replace(/\\t/g,"\\\\t").replace(/\\n/g,"\\\\n").replace(/\\r/g,"\\\\r").replace(/[\\x00-\\x0F]/g,function(n){return"\\\\x0"+e(n)}).replace(/[\\x10-\\x1F\\x7F-\\x9F]/g,function(n){return"\\\\x"+e(n)})}return"Expected "+function(n){var r,e,u,o=new Array(n.length);for(r=0;r<n.length;r++)o[r]=(u=n[r],t[ug](u));if(o.sort(),o.length>0){for(r=1,e=1;r<o.length;r++)o[r-1]!==o[r]&&(o[e]=o[r],e++);o.length=e}switch(o.length){case 1:return o[0];case 2:return o[0]+" or "+o[1];default:return o.slice(0,-1).join(", ")+", or "+o[o.length-1]}}(n)+" but "+function(n){return n?'"'+u(n)+'"':"end of input"}(r)+" found."},{v:n,parse:function(r,t){t=void 0!==t?t:{};var e,u={},o={d:I},i=I,f="\\n",c=q("\\n",!1),s=function(n,r){return[n,...r]},a=function(n){return[n]},l="\\t",x=q("\\t",!1),g=function(n,r){return[n,...r]},v=function(n){return[n]},d='"',h=q('"',!1),p=function(n){return n},y=/^[^\\t\\n]/,w=z(["\\t","\\n"],!0,!1),F=function(n){return n.replace(/""/g,'"')},E=/^[^"\\t]/,m=z(['"',"\\t"],!0,!1),P='""',C=q('""',!1),b=function(n){return n.includes("\\n")},A=0,S=[{h:1,p:1}],R=0,T=[],j=0,k={};if("startRule"in t){if(!(ty in o))throw new Error("Can't start parsing from rule \\""+t.y+'".');i=o[ty]}function q(n,r){return{g:"literal",i:n,ignoreCase:r}}function z(n,r,t){return{g:"class",c:n,s:r,ignoreCase:t}}function B(n){var t,e=S[n];if(e)return e;for(t=n-1;!S[t];)t--;for(e={h:(e=S[t]).h,p:ep};t<n;)10===r.charCodeAt(t)?(e.h++,ep=1):e.p++,t++;return S[n]=e,e}function D(n,r){var t=B(n),e=B(r);return{w:{F:n,h:th,p:tp},l:{F:r,h:eh,p:ep}}}function G(n){A<R||(A>R&&(R=A,T=[]),T.push(n))}function H(r,t,e){return new n(nu(r,t),r,t,e)}function I(){var n,t,e,o,i=5*A+0,l=k[i];return l?(A=lm,lP):(n=A,(t=J())!==u?(10===r.charCodeAt(A)?(e=f,A++):(e=u,0===j&&G(c)),e!==u&&(o=I())!==u?n=t=s(t,o):(A=n,n=u)):(A=n,n=u),n===u&&(n=A,(t=J())!==u&&(t=a(t)),n=t),k[i]={m:A,P:n},n)}function J(){var n,t,e,o,i=5*A+1,f=k[i];return f?(A=fm,fP):(n=A,(t=K())!==u?(9===r.charCodeAt(A)?(e=l,A++):(e=u,0===j&&G(x)),e!==u&&(o=J())!==u?n=t=g(t,o):(A=n,n=u)):(A=n,n=u),n===u&&(n=A,(t=K())!==u&&(t=v(t)),n=t),k[i]={m:A,P:n},n)}function K(){var n,t,e,o,i=5*A+2,f=k[i];if(f)return A=fm,fP;if(n=A,34===r.charCodeAt(A)?(t=d,A++):(t=u,0===j&&G(h)),t!==u&&(e=function(){var n,t,e,o=5*A+3,i=k[o];return i?(A=im,iP):(n=A,t=A,(t=(e=function(){var n,t,e,o,i,f=5*A+4,c=k[f];if(c)return A=cm,cP;if(n=A,t=A,e=[],o=[],E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m)),i!==u)for(;i!==u;)o.push(i),E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m));else o=u;if(o===u&&(r.substr(A,2)===P?(o=P,A+=2):(o=u,0===j&&G(C))),o!==u)for(;o!==u;){if(e.push(o),o=[],E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m)),i!==u)for(;i!==u;)o.push(i),E.test(r.charAt(A))?(i=r.charAt(A),A++):(i=u,0===j&&G(m));else o=u;o===u&&(r.substr(A,2)===P?(o=P,A+=2):(o=u,0===j&&G(C)))}else e=u;return(t=e!==u?r.substring(t,A):e)!==u&&(e=(e=b(t))?void 0:u)!==u?n=t=[t,e]:(A=n,n=u),k[f]={m:A,P:n},n}())!==u?r.substring(t,A):e)!==u&&(t=F(t)),n=t,k[o]={m:A,P:n},n)}())!==u?(34===r.charCodeAt(A)?(o=d,A++):(o=u,0===j&&G(h)),o!==u?n=t=p(e):(A=n,n=u)):(A=n,n=u),n===u){for(n=A,t=[],y.test(r.charAt(A))?(e=r.charAt(A),A++):(e=u,0===j&&G(w));e!==u;)t.push(e),y.test(r.charAt(A))?(e=r.charAt(A),A++):(e=u,0===j&&G(w));n=t!==u?r.substring(n,A):t}return k[i]={m:A,P:n},n}if((e=i())!==u&&A===r.length)return e;throw e!==u&&A<r.length&&G({g:"end"}),H(T,R<r.length?r.charAt(R):null,R<r.length?D(R,R+1):D(R,R))}}}(); console.log(excelTableParser.parse(`Data Data Data Data Data Data Data Data Data" Data Data "Da ta" Data Data "Da ta""" Data Data Data"" Data Data """Da ta""" Data Data """Da ta"""`));

Answer 3

Here is a regex solution that will split the text into cells (including cells with newlines). It doesn't deal with all the complexities but should make it much easier to parse now that you can handle each cell separately.

Warning: As Patrick Roberts pointed out in the comments, this only works if multiline cells are only found in the last column.

 const input = `Data Data Data Data Data Data Data Data Data" Data Data "Da ta" Data Data "Da ta""" Data Data Data"" Data Data """Da ta""" Data Data """Da ta"""`; const s = (input + '\\n') .replace(/(("[^"]*")*)\\n/g, '$1\\r') .trim('\\r'); const cells = s .split('\\r') .map(row => row.split('\\t')); console.log(cells);

Answer 4

I wasn't able to come up with a regex solution that is robust but here is a solution that works.

Note: I changed the input slightly to test for multiline cells that aren't in the last column.

 const input = `Data Data Data "Da ta" Data Data Data Data Data" Data Data "Da ta" Data Data "Da ta""" Data Data Data"" Data Data """Da ta""" Data Data """Da ta"""`; const columnCount = (input.split('\\n')[0].match(/\\t/g) || []).length + 1; //parse input into cells and undo wonkiness const parts = input.split(/[\\t\\n]/); const cells = []; let next = ''; for (let part of parts) { next += part + '\\n'; const quoteCount = (next.match(/"/g) || []).length; if (!next.startsWith('"') || quoteCount % 2 === 0) { let cell = next.trim('\\n'); if (cell.startsWith('"') && cell.endsWith('"')) { cell = cell.replace(/^"([^]*)"$/, '$1'); cell = cell.replace(/""/g, '"'); } cells.push(cell); next = ''; } } //rearrange cells into rows const rows = []; let row = []; for (let cell of cells) { row.push(cell); if (row.length === columnCount) { rows.push(row); row = []; } } //display results in table const tableElem = document.getElementById('table'); for (let row of rows) { let trElem = document.createElement('tr'); for (let cell of row) { let tdElem = document.createElement('td'); tdElem.innerHTML = cell.replace('\\n', '<br/>'); trElem.appendChild(tdElem); } tableElem.appendChild(trElem); }

 <style> table, th, td { border: 1px solid black; border-collapse: collapse; padding: 2px; } </style> <table id="table"></table>

Split on new line, unless inside double quotes

Question

Update

4 answers

solution1
3 ACCPTED 2019-07-19 03:20:10

Parsing all the data

Expression

Visualisation

Example Usage

Commented Regular Expression

solution2
2 2019-07-18 22:01:44

solution3
2 2019-07-18 22:40:23

solution4
1 2019-07-19 00:52:07

Split on new line, unless inside double quotes

Question

Update

4 answers

solution1 3 ACCPTED 2019-07-19 03:20:10

Parsing all the data

Expression

Visualisation

Example Usage

Commented Regular Expression

solution2 2 2019-07-18 22:01:44

solution3 2 2019-07-18 22:40:23

solution4 1 2019-07-19 00:52:07

solution1
3 ACCPTED 2019-07-19 03:20:10

solution2
2 2019-07-18 22:01:44

solution3
2 2019-07-18 22:40:23

solution4
1 2019-07-19 00:52:07