繁体   English   中英

将包含 UTF8 字符串的变量转换为包含 latin1 字符串的变量 - 用于 Javascript 中的浏览器

[英]Convert a variable containing UTF8 string to a variable containing latin1 string - for browser in Javascript

Bash 替代方案(shell 设置为 UTF8):

输入:

in.json

$ file -I in.json
in.json: text/plain; charset=utf-8

{"it-it":"Città"}

我需要下面这条 Bash 命令的 JS 替代方案:

$ iconv -f utf8 -t latin1 in.json > out.json

out.json

$ file -I out.json
out.json: text/plain; charset=iso-8859-1

{"it-it":"Citt?"}

当从输入 type="file" 读取为 base64 时,Javascript 在浏览器中看到 in.json 的内容(尽管内容类型和脚本编码设置为 utf8):

{"it-it":"CittÃ "}

Javascript 在浏览器中看到的 out.json 是什么:

{"it-it":"Città"}

问题 - 我怎样才能以最原生的 Javascript 方式让大多数现代浏览器转换这个 utf8 字符串

({"it-it":"CittÃ "} as latin1 and {"it-it":"Città"} as utf8) 

到 latin1 字符串?

我更喜欢本机解决方案,或者最坏的情况是 JQuery,请尽量不要用 npm + node 依赖地狱来解决它。

Ps:我只需要支持最现代的浏览器,这是一个仅限管理员的页面。

下面,我用iso-8859-1版本的CittÃ创建了一个数组,然后使用TextDecoder对其进行解码。

因此,如果您可以获取JSON的二进制版本,则应该可以为您进行转换。

 // Bytes of "CittÃ" in ISO-8859-1: 195 (0xC3) is "Ã", which is also the
 // first byte of the UTF-8 encoding of "à".
 var latinSource = new Uint8Array([67, 105, 116, 116, 195]);
 // The "iso-8859-1" label selects a single-byte decoder, so every input byte
 // yields exactly one character.
 var tc = new TextDecoder("iso-8859-1");
 console.log(tc.decode(latinSource));

对我来说,'new TextDecoder("iso-8859-1")' 不起作用......

1.

 // Every possible byte value, 0 through 255, in ascending order.
 var latinSource = Uint8Array.from({ length: 256 }, (_, byteValue) => byteValue);
 var tc = new TextDecoder("iso-8859-1");
 // The logged text is a windows-1252 string.
 console.log(tc.decode(latinSource));

从结果可以看出,它不是 latin1 字符串,因为其中包含字符 '€'。

2.

 // Inspect which encoding the decoder actually reports for the
 // "iso-8859-1" label.
 console.log('new TextDecoder("iso-8859-1")', new TextDecoder("iso-8859-1"));
 // Output observed by the author:
 // new TextDecoder("iso-8859-1") {
 //   "encoding": "windows-1252",
 //   "fatal": false,
 //   "ignoreBOM": false,
 //   "decode": function decode() { [native code] }
 // }

  3. 编码解码latin-1的工作方式:

 /**
  * Decode a Latin1 string (ISO-8859-1, one character per byte) into bytes.
  *
  * Each UTF-16 code unit is stored directly as a byte. Code units above 0xFF
  * are not valid Latin1; the typed-array store keeps only their low 8 bits.
  *
  * @param {string} iso_8859_1 - String whose code units are all <= 0xFF.
  * @returns {Uint8Array} One byte per code unit of the input.
  */
 function Latin1ToUint8Array(iso_8859_1) {
   const uInt8Arr = new Uint8Array(iso_8859_1.length);
   for (let i = 0; i < iso_8859_1.length; i++) {
     uInt8Arr[i] = iso_8859_1.charCodeAt(i);
   }
   return uInt8Arr;
 }

 /**
  * Encode bytes into a Latin1 string (ISO-8859-1): byte N becomes the
  * character with code N, so bytes 0x80-0x9F stay C1 control characters
  * instead of being remapped the way windows-1252 does.
  *
  * @param {Uint8Array} Uint8Arr - Bytes to convert.
  * @returns {string} String with one character per input byte.
  */
 function Uint8ToLatin1Str(Uint8Arr) {
   let iso_8859_1_string = '';
   for (let i = 0; i < Uint8Arr.length; i++) {
     iso_8859_1_string += String.fromCharCode(Uint8Arr[i]);
   }
   return iso_8859_1_string;
 }

 // Every possible byte value, 0 through 255, in ascending order.
 var latinSource = Uint8Array.from({ length: 256 }, (_, byteValue) => byteValue);
 console.log(Uint8ToLatin1Str(latinSource)); // valid latin1 string (iso-8859-1)

  4. 最后,Windows-1252 转换:

 function Windows1252EncodeDecode( cp1252 //string (to encode into bytes), or Uint8Array (to decode into string) ){ var replaceCharCodesForLatin1 = { //_______________________________________________________________________ //|"windows-1252"| iso-8859-1 | //Unicode | //|'character' | charcode, | //charcode(commented), | //|______________|______________________|_______________________________| '€' : 128, //8364, '‚' : 130, //8218, 'ƒ' : 131, //402, '„' : 132, //8222, '…' : 133, //8230, '†' : 134, //8224, '‡' : 135, //8225, 'ˆ' : 136, //710, '‰' : 137, //8240, 'Š' : 138, //352, '‹' : 139, //8249, 'Œ' : 140, //338, 'Ž' : 142, //381, ''' : 145, //8216, ''' : 146, //8217, '“' : 147, //8220, '”' : 148, //8221, '•' : 149, //8226, '–' : 150, //8211, '—' : 151, //8212, '˜' : 152, //732, '™' : 153, //8482, 'š' : 154, //353, '›' : 155, //8250, 'œ' : 156, //339, 'ž' : 158, //382, 'Ÿ' : 159, //376 }; if(typeof cp1252 === 'string'){ //if that was been string to encode to bytes var resultUint8 = new Uint8Array(cp1252.length); for(var i = 0; i<cp1252.length; i++){ var charCode = cp1252[i].charCodeAt(0); resultUint8[i] = ((charCode>256) ? replaceCharCodesForLatin1[cp1252[i]] : charCode); } return resultUint8; //return Uint8Array }else if(cp1252 instanceof Uint8Array){ //else if that was been Uint8Array to decode to string var resultString = ""; for(var i = 0; i<cp1252.length; i++){ var charCode = (Object.keys(replaceCharCodesForLatin1).find(key => replaceCharCodesForLatin1[key] === cp1252[i])); charCode = (typeof charCode === 'undefined') ? 
String.fromCharCode(cp1252[i]) : charCode; resultString += charCode; } return resultString; //return Uint8Array } } var latinSource = new Uint8Array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]); var windows1252 = new TextDecoder("iso-8859-1").decode(latinSource); //windows-1252 string on output console.log('new TextDecoder("iso-8859-1").decode(latinSource)', (new TextDecoder("iso-8859-1").decode(latinSource))) var bytesBack = Windows1252EncodeDecode(windows1252); console.log('bytesBack', bytesBack.toString()); var Windows1252StringBack = Windows1252EncodeDecode(bytesBack) console.log('string back', Windows1252StringBack); console.log('Compare with TextDecoder', (Windows1252StringBack === windows1252 ));

  5. 修改 latin-1 函数,来自 3:

 /**
  * Check whether every UTF-16 code unit of a string fits in one byte, i.e.
  * the string can be represented in ISO-8859-1.
  *
  * @param {string} str - String to inspect.
  * @returns {boolean} true when no code unit is above U+00FF.
  */
 function isLatin1String(str) {
   // The range must start at \u0000; without that lower bound the class
   // degenerates to "anything except '-' and 'ÿ'".
   return !/[^\u0000-\u00FF]/.test(str);
 }

 /**
  * Convert a string to bytes: one byte per character when the string is
  * Latin1, otherwise its UTF-8 encoding.
  *
  * @param {string} str - Latin1 or arbitrary Unicode string.
  * @returns {Uint8Array} The resulting bytes.
  */
 function StringToUint8Array(str) {
   if (!isLatin1String(str)) {
     // TextEncoder always produces UTF-8 and takes no encoding argument.
     return new TextEncoder().encode(str);
   }
   // Latin1 string: each char code is already the byte value.
   const uInt8Arr = new Uint8Array(str.length);
   for (let i = 0; i < str.length; i++) {
     uInt8Arr[i] = str.charCodeAt(i);
   }
   return uInt8Arr;
 }

 /**
  * Convert bytes to a Latin1 string: byte N becomes the character with
  * code N.
  *
  * @param {Uint8Array} Uint8Arr - Bytes to convert.
  * @returns {string} String with one character per input byte.
  */
 function Uint8ToStr(Uint8Arr) {
   let iso_8859_1_string = '';
   for (let i = 0; i < Uint8Arr.length; i++) {
     iso_8859_1_string += String.fromCharCode(Uint8Arr[i]);
   }
   return iso_8859_1_string;
 }

 /**
  * Reinterpret a Latin1 string whose characters are really UTF-8 bytes
  * (e.g. "CittÃ\u00A0") as the UTF-8 text those bytes encode ("Città").
  *
  * @param {string} latin1str - String holding UTF-8 bytes as Latin1 characters.
  * @returns {string} The decoded text.
  */
 function latin1ToUtf8(latin1str) {
   return new TextDecoder("utf-8").decode(StringToUint8Array(latin1str));
 }

 console.log('StringToUint8Array("CittÃ")', StringToUint8Array("CittÃ")); // Latin1
 console.log('StringToUint8Array("Città€")', StringToUint8Array("Città€")); // utf-8
 console.log('Uint8ToStr(StringToUint8Array("CittÃ"))', Uint8ToStr(StringToUint8Array("CittÃ"))); // latin1
 console.log('Uint8ToStr(StringToUint8Array("Città€"))', Uint8ToStr(StringToUint8Array("Città€"))); // utf-8
 console.log('latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))', latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))); // utf-8

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM