javascript and string manipulation w/ utf-16 surrogate pairs

Question

I'm working on a twitter app and just stumbled into the world of utf-8(16). It seems the majority of javascript string functions are as blind to surrogate pairs as I was. I've got to recode some stuff to make it wide character aware.

I've got this function to parse strings into arrays while preserving the surrogate pairs. Then I'll recode several functions to deal with the arrays rather than strings.

function sortSurrogates(str){
  var cp = [];                 // array to hold code points
  while(str.length){           // loop till we've done the whole string
    if(/[\uD800-\uDFFF]/.test(str.substr(0,1))){ // test the first character
                               // High surrogate found low surrogate follows
      cp.push(str.substr(0,2)); // push the two onto array
      str = str.substr(2);     // clip the two off the string
    }else{                     // else BMP code point
      cp.push(str.substr(0,1)); // push one onto array
      str = str.substr(1);     // clip one from string 
    }
  }                            // loop
  return cp;                   // return the array
}

My question is, is there something simpler I'm missing? I see so many people reiterating that javascript deals with utf-16 natively, yet my testing leads me to believe, that may be the data format, but the functions don't know it yet. Am I missing something simple?

EDIT: To help illustrate the issue:

var a = "0123456789"; // U+0030 - U+0039 2 bytes each
var b = "𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"; // U+1D7D8 - U+1D7E1 4 bytes each
alert(a.length); // javascript shows 10
alert(b.length); // javascript shows 20

Twitter sees and counts both of those as being 10 characters long.

Answer 1

Javascript uses UCS-2 internally, which is not UTF-16. It is very difficult to handle Unicode in Javascript because of this, and I do not suggest attempting to do so.

As for what Twitter does, you seem to be saying that it is sanely counting by code point not insanely by code unit.

Unless you have no choice, you should use a programming language that actually supports Unicode, and which has a code-point interface, not a code-unit interface. Javascript isn't good enough for that as you have discovered.

It has The UCS-2 Curse, which is even worse than The UTF-16 Curse, which is already bad enough. I talk about all this in OSCON talk, 🔫 Unicode Support Shootout: 👍 The Good, the Bad, & the (mostly) Ugly 👎 .

Due to its horrible Curse, you have to hand-simulate UTF-16 with UCS-2 in Javascript, which is simply nuts.

Javascript suffers from all kinds of other terrible Unicode troubles, too. It has no support for graphemes or normalization or collation, all of which you really need. And its regexes are broken, sometimes due to the Curse, sometimes just because people got it wrong. For example, Javascript is incapable of expressing regexes like [𝒜-𝒵] . Javascript doesn't even support casefolding, so you can't write a pattern like /ΣΤΙΓΜΑΣ/i and have it correctly match στιγμας .

You can try to use the XRegEXp plugin , but you won't banish the Curse that way. Only changing to a language with Unicode support will do that, and 𝒥𝒶𝓋𝒶𝓈𝒸𝓇𝒾𝓅𝓉 just isn't one of those.

Answer 2

I've knocked together the starting point for a Unicode string handling object. It creates a function called UnicodeString() that accepts either a JavaScript string or an array of integers representing Unicode code points and provides length and codePoints properties and toString() and slice() methods. Adding regular expression support would be very complicated, but things like indexOf() and split() (without regex support) should be pretty easy to implement.

 var UnicodeString = (function() { function surrogatePairToCodePoint(charCode1, charCode2) { return ((charCode1 & 0x3FF) << 10) + (charCode2 & 0x3FF) + 0x10000; } function stringToCodePointArray(str) { var codePoints = [], i = 0, charCode; while (i < str.length) { charCode = str.charCodeAt(i); if ((charCode & 0xF800) == 0xD800) { codePoints.push(surrogatePairToCodePoint(charCode, str.charCodeAt(++i))); } else { codePoints.push(charCode); } ++i; } return codePoints; } function codePointArrayToString(codePoints) { var stringParts = []; for (var i = 0, len = codePoints.length, codePoint, offset, codePointCharCodes; i < len; ++i) { codePoint = codePoints[i]; if (codePoint > 0xFFFF) { offset = codePoint - 0x10000; codePointCharCodes = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)]; } else { codePointCharCodes = [codePoint]; } stringParts.push(String.fromCharCode.apply(String, codePointCharCodes)); } return stringParts.join(""); } function UnicodeString(arg) { if (this instanceof UnicodeString) { this.codePoints = (typeof arg == "string") ? stringToCodePointArray(arg) : arg; this.length = this.codePoints.length; } else { return new UnicodeString(arg); } } UnicodeString.prototype = { slice: function(start, end) { return new UnicodeString(this.codePoints.slice(start, end)); }, toString: function() { return codePointArrayToString(this.codePoints); } }; return UnicodeString; })(); var ustr = UnicodeString("f𝌆𝌆bar"); document.getElementById("output").textContent = "String: '" + ustr + "', length: " + ustr.length + ", slice(2, 4): " + ustr.slice(2, 4);

 <div id="output"></div>

Answer 3

Here are a couple scripts that might be helpful when dealing with surrogate pairs in JavaScript:

ES6 Unicode shims for ES3+ adds the String.fromCodePoint and String.prototype.codePointAt methods from ECMAScript 6. The ES3/5 fromCharCode and charCodeAt methods do not account for surrogate pairs and therefore give wrong results.
Full 21-bit Unicode code point matching in XRegExp with \\u{10FFFF} allows matching any individual code point in XRegExp regexes.

Answer 4

Javascript string iterators can give you the actual characters instead of the surrogate code points:

>>> [..."0123456789"]
["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"]
["𝟘", "𝟙", "𝟚", "𝟛", "𝟜", "𝟝", "𝟞", "𝟟", "𝟠", "𝟡"]
>>> [..."0123456789"].length
10
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"].length
10

Answer 5

This is along the lines of what I was looking for. It needs better support for the different string functions. As I add to it I will update this answer.

function wString(str){
  var T = this; //makes 'this' visible in functions
  T.cp = [];    //code point array
  T.length = 0; //length attribute
  T.wString = true; // (item.wString) tests for wString object

//member functions
  sortSurrogates = function(s){  //returns array of utf-16 code points
    var chrs = [];
    while(s.length){             // loop till we've done the whole string
      if(/[\uD800-\uDFFF]/.test(s.substr(0,1))){ // test the first character
                                 // High surrogate found low surrogate follows
        chrs.push(s.substr(0,2)); // push the two onto array
        s = s.substr(2);         // clip the two off the string
      }else{                     // else BMP code point
        chrs.push(s.substr(0,1)); // push one onto array
        s = s.substr(1);         // clip one from string 
      }
    }                            // loop
    return chrs;
  };
//end member functions

//prototype functions
  T.substr = function(start,len){
    if(len){
      return T.cp.slice(start,start+len).join('');
    }else{
      return T.cp.slice(start).join('');
    }
  };

  T.substring = function(start,end){
    return T.cp.slice(start,end).join('');
  };

  T.replace = function(target,str){
    //allow wStrings as parameters
    if(str.wString) str = str.cp.join('');
    if(target.wString) target = target.cp.join('');
    return T.toString().replace(target,str);
  };

  T.equals = function(s){
    if(!s.wString){
      s = sortSurrogates(s);
      T.cp = s;
    }else{
        T.cp = s.cp;
    }
    T.length = T.cp.length;
  };

  T.toString = function(){return T.cp.join('');};
//end prototype functions

  T.equals(str)
};

Test results:

// plain string
var x = "0123456789";
alert(x);                    // 0123456789
alert(x.substr(4,5))         // 45678
alert(x.substring(2,4))      // 23
alert(x.replace("456","x")); // 0123x789
alert(x.length);             // 10

// wString object
x = new wString("𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡");
alert(x);                    // 𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡
alert(x.substr(4,5))         // 𝟜𝟝𝟞𝟟𝟠
alert(x.substring(2,4))      // 𝟚𝟛
alert(x.replace("𝟜𝟝𝟞","x")); // 𝟘𝟙𝟚𝟛x𝟟𝟠𝟡
alert(x.length);             // 10

javascript and string manipulation w/ utf-16 surrogate pairs

Question

5 answers

solution1
21 ACCPTED 2011-07-30 22:05:03

solution2
8 2011-07-31 00:19:36

solution3
5 2012-05-28 07:28:53

solution4
4 2016-06-05 17:12:54

solution5
3 2011-07-31 12:59:23

javascript and string manipulation w/ utf-16 surrogate pairs

Question

5 answers

solution1 21 ACCPTED 2011-07-30 22:05:03

solution2 8 2011-07-31 00:19:36

solution3 5 2012-05-28 07:28:53

solution4 4 2016-06-05 17:12:54

solution5 3 2011-07-31 12:59:23

solution1
21 ACCPTED 2011-07-30 22:05:03

solution2
8 2011-07-31 00:19:36

solution3
5 2012-05-28 07:28:53

solution4
4 2016-06-05 17:12:54

solution5
3 2011-07-31 12:59:23