JavaScript 與字符串操作：處理 UTF-16 代理對

Question

我正在開發一個推特應用程序，偶然發現了utf-8（16）的世界。 似乎大多數javascript字符串函數對代理對都是盲目的。 我必須重新編碼一些東西才能讓它具有廣泛的字符意識。

我有這個函數來解析字符串到數組，同時保留代理對。 然后我將重新編碼幾個函數來處理數組而不是字符串。

/**
 * Splits a string into an array with one element per code point, keeping
 * each UTF-16 surrogate pair together as a single two-unit string.
 *
 * A pair is formed only by a high surrogate (U+D800-U+DBFF) immediately
 * followed by a low surrogate (U+DC00-U+DFFF). A lone or out-of-order
 * surrogate is emitted as its own one-unit element instead of swallowing
 * the code unit that follows it.
 *
 * @param {string} str - The string to split.
 * @returns {string[]} The code points of `str`, in order.
 */
function sortSurrogates(str){
  var cp = [];                 // array to hold code points
  var i = 0;                   // index of the next unread code unit
  var unit, next;
  // Walk by index rather than repeatedly re-slicing the string, so the
  // whole pass stays linear in the length of the input.
  while(i < str.length){
    unit = str.charCodeAt(i);
    next = str.charCodeAt(i + 1); // NaN past the end, so the range test fails
    if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
      cp.push(str.substr(i, 2)); // well-formed pair: keep both units together
      i += 2;
    }else{
      cp.push(str.charAt(i));    // BMP code point or unpaired surrogate
      i += 1;
    }
  }
  return cp;
}

我的問題是：是否有更簡單的方法被我忽略了？ 我看到很多人反覆強調JavaScript原生就能處理UTF-16，但我的測試讓我相信，UTF-16可能只是內部的數據格式，而字符串函數本身並不識別代理對。 我是不是漏掉了什麼簡單的東西？

編輯：幫助說明問題：

// Ten ASCII digits: every character is a single UTF-16 code unit.
var a = "0123456789"; // U+0030 - U+0039 2 bytes each
// Ten mathematical double-struck digits: these lie outside the BMP, so each
// one is stored as a surrogate pair (two UTF-16 code units).
var b = "𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"; // U+1D7D8 - U+1D7E1 4 bytes each
// String length counts UTF-16 code units, not code points, hence 10 vs 20.
alert(a.length); // javascript shows 10
alert(b.length); // javascript shows 20

Twitter看到並計算這兩個長度為10個字符。

Answer 1

Javascript內部使用UCS-2，而不是UTF-16。 因此，在Javascript中處理Unicode非常困難，我不建議嘗試這樣做。

至於Twitter的做法，你似乎是說它合理地按代碼點（code point）計數，而不是荒唐地按代碼單元（code unit）計數。

除非你別無選擇，否則你應該使用一種實際支持Unicode的編程語言，它具有代碼點接口，而不是代碼單元接口。 正如你所發現的，Javascript還不夠好。

它受到UCS-2詛咒，這比本來就已經夠糟的UTF-16詛咒還要糟。 我在OSCON的演講《🔫 Unicode支持槍戰：👍好的、壞的和（大多是）醜的👎》中談到了這一切。

由於它的可怕詛咒，你必須用Javascript中的UCS-2手工模擬UTF-16，這簡直就是瘋了。

Javascript也遭受各種其他可怕的Unicode問題。 它不支持字素（grapheme）、標準化（normalization）或排序規則（collation），而這些都是你真正需要的。 它的正則表達式也有缺陷，有時是因為上述詛咒，有時只是因為實現者弄錯了。 例如，Javascript無法表達像[𝒜-𝒵]這樣的正則表達式。 Javascript甚至不支持大小寫折疊（casefolding），所以你無法寫出像/ΣΤΙΓΜΑΣ/i這樣的模式並讓它正確匹配στιγμας 。

您可以嘗試使用XRegEXp插件，但不會以這種方式消除詛咒。 只有改為使用Unicode支持的語言才能做到這一點，而𝒥𝒶𝓋𝒶𝓈𝒸𝓇𝒾𝓅𝓉只是不是其中之一。

Answer 2

我把Unicode字符串處理對象的起點拼湊在了一起。 它創建了一個名為UnicodeString()的函數，它接受JavaScript字符串或表示Unicode代碼點的整數數組，並提供length和codePoints屬性以及toString()和slice()方法。 添加正則表達式支持會非常復雜，但是indexOf()和split() （沒有正則表達式支持）之類的東西應該很容易實現。

 /**
  * UnicodeString: a minimal code-point-aware string wrapper.
  *
  * Accepts either a JavaScript string or an array of integer code points and
  * exposes `codePoints` (array of integers), `length` (number of code
  * points), `slice(start, end)` (returns a new UnicodeString) and
  * `toString()` (returns a JavaScript string). It may be called with or
  * without `new`.
  */
 var UnicodeString = (function() {
   // Combines a high/low surrogate pair into the supplementary code point.
   function surrogatePairToCodePoint(charCode1, charCode2) {
     return ((charCode1 & 0x3FF) << 10) + (charCode2 & 0x3FF) + 0x10000;
   }

   // True for U+D800-U+DBFF (first half of a pair). NaN compares false.
   function isHighSurrogate(charCode) {
     return charCode >= 0xD800 && charCode <= 0xDBFF;
   }

   // True for U+DC00-U+DFFF (second half of a pair). NaN compares false.
   function isLowSurrogate(charCode) {
     return charCode >= 0xDC00 && charCode <= 0xDFFF;
   }

   // Decodes a JavaScript string into an array of code points. Only a high
   // surrogate immediately followed by a low surrogate is combined; an
   // unpaired surrogate is kept as its own code unit value so that no
   // neighbouring character is lost and toString() round-trips the input.
   function stringToCodePointArray(str) {
     var codePoints = [], i = 0, charCode, nextCharCode;
     while (i < str.length) {
       charCode = str.charCodeAt(i);
       nextCharCode = str.charCodeAt(i + 1); // NaN when past the end
       if (isHighSurrogate(charCode) && isLowSurrogate(nextCharCode)) {
         codePoints.push(surrogatePairToCodePoint(charCode, nextCharCode));
         i += 2;
       } else {
         codePoints.push(charCode);
         ++i;
       }
     }
     return codePoints;
   }

   // Encodes an array of code points back into a JavaScript string, emitting
   // a surrogate pair for every code point above U+FFFF.
   function codePointArrayToString(codePoints) {
     var stringParts = [];
     for (var i = 0, len = codePoints.length, codePoint, offset; i < len; ++i) {
       codePoint = codePoints[i];
       if (codePoint > 0xFFFF) {
         offset = codePoint - 0x10000;
         stringParts.push(String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)));
       } else {
         stringParts.push(String.fromCharCode(codePoint));
       }
     }
     return stringParts.join("");
   }

   function UnicodeString(arg) {
     if (this instanceof UnicodeString) {
       this.codePoints = (typeof arg === "string") ? stringToCodePointArray(arg) : arg;
       this.length = this.codePoints.length;
     } else {
       // Called without `new`: construct and return an instance anyway.
       return new UnicodeString(arg);
     }
   }

   UnicodeString.prototype = {
     // Restore the link that replacing the prototype object would drop.
     constructor: UnicodeString,
     slice: function(start, end) {
       return new UnicodeString(this.codePoints.slice(start, end));
     },
     toString: function() {
       return codePointArrayToString(this.codePoints);
     }
   };

   return UnicodeString;
 })();

 var ustr = UnicodeString("f𝌆𝌆bar");

 // Demo output: only runs where a DOM with an #output element is available,
 // so the definitions above can also be loaded outside a browser.
 if (typeof document !== "undefined" && document.getElementById("output")) {
   document.getElementById("output").textContent = "String: '" + ustr + "', length: " + ustr.length + ", slice(2, 4): " + ustr.slice(2, 4);
 }

 <div id="output"></div>

Answer 3

以下是一些在JavaScript中處理代理項對時可能有用的腳本：

適用於ES3+的ES6 Unicode墊片（shim）：添加了ECMAScript 6中的String.fromCodePoint和String.prototype.codePointAt方法。ES3/5的fromCharCode和charCodeAt方法不考慮代理對，因此會給出錯誤的結果。
XRegExp中完整的21位Unicode代碼點匹配：通過\\u{10FFFF}語法，可以在XRegExp正則表達式中匹配任何單個代碼點。

Answer 4

Javascript字符串迭代器可以為您提供實際字符而不是代理代碼點：

>>> [..."0123456789"]
["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"]
["𝟘", "𝟙", "𝟚", "𝟛", "𝟜", "𝟝", "𝟞", "𝟟", "𝟠", "𝟡"]
>>> [..."0123456789"].length
10
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"].length
10

Answer 5

這與我所尋找的一致。 它需要更好地支持不同的字符串函數。 當我添加它時，我將更新這個答案。

/**
 * wString: a "wide" string whose indices and length count code points
 * rather than UTF-16 code units.
 *
 * Instance members:
 *   cp        - array with one string element per code point
 *   length    - number of code points
 *   wString   - always true; lets callers test `item.wString`
 *   substr(start, len)    - like String.prototype.substr, in code points
 *   substring(start, end) - slice of code points [start, end), joined
 *   replace(target, str)  - String.prototype.replace on the joined string;
 *                           both arguments may be wString instances
 *   equals(s)             - ASSIGNS a new value (string or wString) to this
 *                           object; despite the name it does not compare
 *   toString()            - the plain JavaScript string
 *
 * Methods are closures over the instance, so they keep working when detached
 * from the object.
 *
 * @param {string|wString} str - Initial value; null/undefined gives "".
 */
function wString(str){
  var T = this; //makes 'this' visible in functions
  T.cp = [];    //code point array
  T.length = 0; //length attribute
  T.wString = true; // (item.wString) tests for wString object

//private helper
  // Declared with `var` so it stays local: assigning an undeclared name
  // would leak a global and throws in strict mode / ES modules.
  var sortSurrogates = function(s){  //returns array of utf-16 code points
    var chrs = [];
    var i = 0;
    var unit, next;
    while(i < s.length){
      unit = s.charCodeAt(i);
      next = s.charCodeAt(i + 1);    // NaN past the end, so the test fails
      // Pair only a high surrogate with a following low surrogate; an
      // unpaired surrogate stays a single element.
      if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
        chrs.push(s.substr(i, 2));
        i += 2;
      }else{
        chrs.push(s.charAt(i));
        i += 1;
      }
    }
    return chrs;
  };
//end private helper

//member functions
  T.substr = function(start,len){
    var from = start || 0;
    // A negative start counts back from the end, as in String#substr.
    if(from < 0) from = Math.max(T.cp.length + from, 0);
    // Test for "omitted" explicitly: a length of 0 must yield "".
    if(len === undefined){
      return T.cp.slice(from).join('');
    }
    // Clamp so a negative length cannot turn into a from-the-end index.
    return T.cp.slice(from, from + Math.max(len, 0)).join('');
  };

  T.substring = function(start,end){
    return T.cp.slice(start,end).join('');
  };

  T.replace = function(target,str){
    //allow wStrings as parameters
    var replacement = (str && str.wString) ? str.cp.join('') : str;
    var pattern = (target && target.wString) ? target.cp.join('') : target;
    return T.toString().replace(pattern,replacement);
  };

  T.equals = function(s){
    if(s && s.wString){
      T.cp = s.cp.slice();       // copy so the two objects do not share state
    }else{
      T.cp = sortSurrogates(s == null ? '' : String(s));
    }
    T.length = T.cp.length;
  };

  T.toString = function(){return T.cp.join('');};
//end member functions

  T.equals(str);
}

測試結果：

// Side-by-side check: the same calls on a native string and on a wString.
// The trailing comment on each line is the output reported for that call.

// plain string
var x = "0123456789";
alert(x);                    // 0123456789
alert(x.substr(4,5))         // 45678
alert(x.substring(2,4))      // 23
alert(x.replace("456","x")); // 0123x789
alert(x.length);             // 10

// wString object
// Every digit below is a surrogate pair, yet indices and length are
// reported in code points, matching the plain-string results above.
x = new wString("𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡");
alert(x);                    // 𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡
alert(x.substr(4,5))         // 𝟜𝟝𝟞𝟟𝟠
alert(x.substring(2,4))      // 𝟚𝟛
alert(x.replace("𝟜𝟝𝟞","x")); // 𝟘𝟙𝟚𝟛x𝟟𝟠𝟡
alert(x.length);             // 10

javascript和字符串操作w / utf-16代理對

問題描述

5 個解決方案

解決方案1
21 已采納 2011-07-30 22:05:03

解決方案2
8 2011-07-31 00:19:36

解決方案3
5 2012-05-28 07:28:53

解決方案4
4 2016-06-05 17:12:54

解決方案5
3 2011-07-31 12:59:23

javascript和字符串操作w / utf-16代理對

問題描述

5 個解決方案

解決方案1 21 已采納 2011-07-30 22:05:03

解決方案2 8 2011-07-31 00:19:36

解決方案3 5 2012-05-28 07:28:53

解決方案4 4 2016-06-05 17:12:54

解決方案5 3 2011-07-31 12:59:23

解決方案1
21 已采納 2011-07-30 22:05:03

解決方案2
8 2011-07-31 00:19:36

解決方案3
5 2012-05-28 07:28:53

解決方案4
4 2016-06-05 17:12:54

解決方案5
3 2011-07-31 12:59:23