I have a JavaScript string that contains characters whose char codes are
greater than 255.
I want to be able to encode/decode that string into another string that has all its charCode
less than or equal to 255.
There is no restriction on the characters (ex: can be non-printable).
I want a solution that is as fast as possible and that produces a string as small as possible.
It must also work for any UTF-8 character.
I found out that encodeURI does exactly that, but its output seems to take a lot of space.
encodeURI('ĉ') === "%C4%89" // true: 6 bytes of output for a single input character
Is there anything better than encodeURI?
What you want to do is encode your string as UTF-8. Searching for how to do that in JavaScript, I found http://monsur.hossa.in/2012/07/20/utf-8-in-javascript.html , which gives:
/**
 * Encodes a string as UTF-8 and returns the result as a "byte string":
 * one character per UTF-8 byte, so every char code is between 0 and 255.
 *
 * @param {string} s - The text to encode.
 * @returns {string} The UTF-8 bytes of `s`, one character per byte.
 * @throws {URIError} If `s` contains a lone surrogate (raised by
 *     encodeURIComponent).
 */
function encode_utf8( s ) {
  // encodeURIComponent writes each escaped character as the %XX escapes of
  // its UTF-8 bytes. A literal "%" in the input is itself escaped as "%25",
  // so every "%" in the escaped text starts a genuine two-digit escape.
  // Converting each escape to a single character does the job of the
  // deprecated unescape() without depending on it.
  return encodeURIComponent( s ).replace( /%([0-9A-F]{2})/g, function ( match, hex ) {
    return String.fromCharCode( parseInt( hex, 16 ) );
  } );
}
/**
 * Decodes a UTF-8 "byte string" (one character per byte, char codes 0-255)
 * back into an ordinary JavaScript string.
 *
 * @param {string} s - The UTF-8 bytes, one character per byte.
 * @returns {string} The decoded text.
 * @throws {URIError} If `s` contains a char code above 255 or is not valid
 *     UTF-8.
 */
function decode_utf8( s ) {
  // Write every byte as a %xx escape and let decodeURIComponent do the UTF-8
  // decoding. This does the job of the deprecated escape() without
  // depending on it.
  var escaped = "";
  for ( var i = 0; i < s.length; i++ ) {
    var code = s.charCodeAt( i );
    if ( code > 0xFF ) {
      // escape() would emit %uXXXX here, which decodeURIComponent rejects
      // with a URIError; keep that failure mode.
      throw new URIError( "decode_utf8: char code above 255 at index " + i );
    }
    escaped += ( code < 0x10 ? "%0" : "%" ) + code.toString( 16 );
  }
  return decodeURIComponent( escaped );
}
or in short, almost exactly what you found already, plus unescaping the '%xx' codes to a byte.
You can get the char code (the UTF-16 code unit value) of a character with .charCodeAt(position).
Using it, you can split one character into several characters.
First, loop through the string and get the char code of every character. For each character, create a temporary empty string, and while the char code is greater than 255, subtract 255 from it and append a ÿ
(char code 255, the last character of the extended ASCII / Latin-1 table) to the temporary string. Once the char code is 255 or less, use String.fromCharCode(charCode)
to convert it to a character, put it at the end of the temporary string, and finally replace the original character with this string.
/**
 * Encodes a string so that every character of the output has a char code of
 * 255 or less.
 *
 * Each input character becomes one group: zero or more "ÿ" (char code 255)
 * followed by one final character, such that the char codes in the group add
 * up to the original char code. The groups are joined with ",".
 *
 * @param {string} string - The text to encode.
 * @returns {string} The comma-separated groups.
 */
function encode(string) {
  const groups = Array.from({ length: string.length }, (unused, index) => {
    const code = string.charCodeAt(index);
    // Number of times 255 must be subtracted before the code is 255 or less.
    const prefixCount = code > 255 ? Math.ceil(code / 255) - 1 : 0;
    const remainder = code - 255 * prefixCount;
    return "ÿ".repeat(prefixCount) + String.fromCharCode(remainder);
  });
  return groups.join(",");
}
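The encoder above puts a comma between the groups. This can cause problems when decoding, because a group may itself end in a comma, so the decoder has to tell separator commas apart from data commas; the ,(?!,)
regex matches the last comma in a run of several commas.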
/**
 * Reverses encode(): turns a string of comma-separated groups back into the
 * original text. The char codes of the characters in a group add up to the
 * char code of the character that group stands for.
 *
 * The string is scanned from right to left, because that direction is
 * unambiguous: the last character of a group is always data (even when it
 * is itself a comma), and the nearest comma before the group is always the
 * separator. Splitting on /,(?!,)/ is not enough: it merges groups whenever
 * the data contains a comma ("a,,,b" must decode to "a,b"), and it turns the
 * empty string into "\0".
 *
 * @param {string} string - Output of encode().
 * @returns {string} The decoded text ("" for an empty input).
 */
function decode(string) {
  const characters = [];
  let i = string.length - 1;
  while (i >= 0) {
    // Final character of the group: always data, never a separator.
    let charCode = string.charCodeAt(i--);
    // Everything before it, back to the previous comma, belongs to the group.
    while (i >= 0 && string.charAt(i) !== ",") {
      charCode += string.charCodeAt(i--);
    }
    characters.push(String.fromCharCode(charCode));
    i--; // Skip the separator comma (or step past the start of the string).
  }
  // Groups were collected last-to-first.
  return characters.reverse().join("");
}
UTF-8 is already an encoding for Unicode text that uses 8-bit code units (one to four bytes per character). You can simply send the UTF-8 string over the wire.
Generally, JavaScript strings consist of UTF-16 code units.
For such strings, you can either encode each UTF-16 code unit as two 8-bit characters or use a variable-length encoding such as UTF-8.
If you have many non-ASCII characters, the first might produce smaller results.
// See http://monsur.hossa.in/2012/07/20/utf-8-in-javascript.html for the
// unescape()/escape() technique behind the two UTF-8 helpers. The versions
// below return the same results without those deprecated functions.

/**
 * Encodes a string as UTF-8, one output character (char code 0-255) per byte.
 *
 * @param {string} s - The text to encode.
 * @returns {string} The UTF-8 bytes of `s`, one character per byte.
 * @throws {URIError} If `s` contains a lone surrogate.
 */
function encode_utf8(s) {
  // A literal "%" in the input is escaped as "%25", so every "%" in the
  // escaped text starts a genuine two-digit escape.
  return encodeURIComponent(s).replace(/%([0-9A-F]{2})/g, (match, hex) =>
    String.fromCharCode(parseInt(hex, 16))
  );
}

/**
 * Decodes a UTF-8 byte string (one character per byte) back into text.
 *
 * @param {string} s - The UTF-8 bytes, one character per byte.
 * @returns {string} The decoded text.
 * @throws {URIError} If `s` has a char code above 255 or is not valid UTF-8.
 */
function decode_utf8(s) {
  let escaped = "";
  for (let i = 0; i < s.length; ++i) {
    const code = s.charCodeAt(i);
    if (code > 0xFF) {
      throw new URIError("decode_utf8: char code above 255 at index " + i);
    }
    escaped += (code < 0x10 ? "%0" : "%") + code.toString(16);
  }
  return decodeURIComponent(escaped);
}

/**
 * Builds a string from an array of char codes.
 *
 * String.fromCharCode.apply() passes one argument per code, and engines limit
 * how many arguments a call may take, so the array is converted in chunks.
 *
 * @param {number[]} codes - Char codes to convert.
 * @returns {string} The resulting string.
 */
function string_from_char_codes(codes) {
  const chunkSize = 0x2000;
  let result = "";
  for (let start = 0; start < codes.length; start += chunkSize) {
    result += String.fromCharCode.apply(undefined, codes.slice(start, start + chunkSize));
  }
  return result;
}

/**
 * Encodes every UTF-16 code unit of `s` as two characters: the high byte
 * followed by the low byte. The output is always twice as long as the input.
 *
 * @param {string} s - The text to encode.
 * @returns {string} A string whose char codes are all between 0 and 255.
 */
function encode_fixed_length(s) {
  const bytes = [];
  for (let i = 0; i < s.length; ++i) {
    const code = s.charCodeAt(i);
    bytes.push(code >> 8, code & 0xFF);
  }
  return string_from_char_codes(bytes);
}

/**
 * Reverses encode_fixed_length(): combines each pair of characters (high
 * byte, low byte) into one UTF-16 code unit.
 *
 * If the input has an odd length, the final character has no low byte; its
 * char code evaluates to NaN, which String.fromCharCode turns into "\0".
 *
 * @param {string} s - Output of encode_fixed_length().
 * @returns {string} The decoded text.
 */
function decode_fixed_length(s) {
  const chars = [];
  for (let i = 0; i < s.length; i += 2) {
    chars.push((s.charCodeAt(i) << 8) + s.charCodeAt(i + 1));
  }
  return string_from_char_codes(chars);
}

// Sample inputs, chosen to match the expected outputs shown below.
const string_1 = "\u0000\u000F\u00FF";
const string_2 = "\u00FF\u0FFF\uFFFF";

console.log(encode_fixed_length(string_1)); // "\x00\x00\x00\x0F\x00\xFF"
console.log(encode_fixed_length(string_2)); // "\x00\xFF\x0F\xFF\xFF\xFF"
console.log(encode_utf8(string_1)); // "\x00\x0F\xC3\xBF"
console.log(encode_utf8(string_2)); // "\xC3\xBF\xE0\xBF\xBF\xEF\xBF\xBF"
Performance comparison: See https://jsfiddle.net/r0d9pm25/1/
Results for 500000 iterations in Firefox 47:
encode_fixed_length()
encode_utf8()
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.