I was doing some web crawling and noticed that I was getting some weird documents containing characters like "�".
I visited the problematic website, but there was no visible problem with the document encoding itself.
I took the buffer that was being displayed incorrectly and started testing, and the problem seems to be in node js?
// Reproduction: a 5-byte buffer received from a crawler is decoded as UTF-8
// and compared, byte by byte, with the UTF-16 code units of the expected text.
// Buffer.from() replaces the deprecated `new Buffer()` constructor.
const actual = Buffer.from([0x50, 0x72, 0x65, 0xe7, 0x6f]); // this is the buffer I got
const correct = 'Preço'; // This is what I expected to be displayed
console.log('Correct: ', correct);
// 0xe7 followed by 0x6f is not a valid UTF-8 sequence, so the decoder emits
// U+FFFD (the replacement character) in its place.
console.log('Actual:', actual.toString('utf8'));
// Test code per code
// Buffer elements are already numbers, so no parseInt() is needed.
for (let i = 0; i < actual.length; i++) {
  console.log(`${correct.charCodeAt(i)}=${actual[i]}`);
}
Outputs:
Correct: Preço
Actual: Pre�o
80=80
114=114
101=101
231=231
111=111
As you can see, every byte corresponds to the same char code! How could they be yielding different results?
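The buffer is not UTF-8: it is in a single-byte legacy encoding in which the byte 0xe7 is "ç", and that byte on its own is not a valid UTF-8 sequence. Try converting it to UTF-8 with iconv: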
// Converts the crawler's single-byte-encoded buffer to UTF-8 with the
// third-party `iconv` package before turning it into a string.
// Buffer.from() replaces the deprecated `new Buffer()` constructor.
const actual = Buffer.from([0x50, 0x72, 0x65, 0xe7, 0x6f]); // this is the buffer I got
const correct = 'Preço'; // This is what I expected to be displayed
console.log('Correct: ', correct);
console.log('Actual:', actual.toString('utf8'));
const iconv = require('iconv');
// windows-1252 is the Western European code page (covers Portuguese).
// windows-1250 is Central European: it also maps 0xe7 to "ç", but would turn
// 0xe3 ("ã") into "ă" and 0xf5 ("õ") into "ő".
// NOTE(review): prefer the charset the page itself declares, if it has one.
const converter = new iconv.Iconv('windows-1252', 'utf8');
const data = converter.convert(actual).toString();
console.log('iconv: ', data);
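Use this code sample to get a 2-byte (UTF-16) representation of the characters in a string. A Buffer built from an array of numbers keeps only the low byte of each value, trimming the higher byte, as in your example above.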
/**
 * Stores every UTF-16 code unit of `str` as a 16-bit integer, logs the input,
 * the resulting Uint16Array and the string rebuilt from it, and returns the
 * backing buffer.
 *
 * Example input: '€uro' ("€" is the single 2-byte code unit 0x20AC).
 *
 * @param {string} str - Text to convert.
 * @returns {ArrayBuffer} Buffer of `str.length * 2` bytes, one 16-bit element
 *   per code unit (byte order is that of Uint16Array on the running platform).
 */
function SpecialCharsTest (str)
{
  console.log('InStr: ', str);

  const buf = new ArrayBuffer(str.length * 2); // 2 bytes for each char
  const bufView = new Uint16Array(buf);
  // Index by code unit (not by code point) so surrogate pairs occupy two slots.
  for (let i = 0; i < str.length; i++) {
    bufView[i] = str.charCodeAt(i);
  }
  console.log('InStr to bufView Array (2-byte): ', bufView);

  // String.fromCharCode.apply() passes one argument per code unit; applying it
  // to the whole array throws a RangeError on long strings, so decode in chunks.
  const CHUNK_SIZE = 0x8000;
  let roundTripped = '';
  for (let start = 0; start < bufView.length; start += CHUNK_SIZE) {
    roundTripped += String.fromCharCode.apply(null, bufView.subarray(start, start + CHUNK_SIZE));
  }
  console.log('InStr to buf back to String (2-byte): ' + roundTripped);

  return buf;
}
Result:
InStr: €uro
InStr to bufView Array (2-byte): Uint16Array(4) [8364, 117, 114, 111]
InStr to buf back to String (2-byte): €uro
The technical post webpages of this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.