简体   繁体   中英

Java does not convert from Latin-1 to UTF-8 properly

I select data from MySQL; the database is not in UTF-8 (Unicode text is saved as Latin-1; for example, the Unicode string Đỗ Tiến (correct form) is saved as Äá»— Tiến ). If I use PHP to echo it to HTML and just set <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />, the web page displays correctly. If I do not set the meta tag, Chrome detects the page as windows-1258 encoded when opening it; after I manually change the encoding to Unicode (UTF-8), the web page displays correctly.

The problem is that when I select the data from MySQL using JDBC, I convert it like this:

    // Attempt 1/2: turn the garbled text back into bytes using two candidate
    // single-byte charsets (Cp1258 and ISO-8859-1)...
    byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
    byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
    // ...then decode those bytes as UTF-8, hoping to recover the original text.
    String unicode1 = new String(asciiBytes1, "UTF-8");
    String unicode2 = new String(asciiBytes2, "UTF-8");
    // The trailing comments show the output the asker observed: neither charset
    // recovers the whole string.
    System.out.println(unicode1);//�?ỗ tiến
    System.out.println(unicode2);//Đ�? tiến

As a result, Java does not convert properly. I tried many encodings from http://docs.oracle.com/javase/1.4.2/docs/guide/intl/encoding.doc.html , not only Cp1258 and ISO-8859-1, but none works. The two simple ways to convert are: use an HTML file containing the Äá»— tiến string as I mentioned before, or use Notepad++: set the encoding to ANSI, paste the Äá»— tiến string, then change the encoding to UTF-8, and it displays Đỗ Tiến (the correct string I want).

That's kind of complicated: the text is in a modified Windows-1252 in which 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which are normally unassigned, are mapped to the respective C1 control characters. It seems Java doesn't take this into account by default when using Windows-1252.

It is easiest to just fix your database and use UTF-8 everywhere.

Here's the code anyway

/**
 * Encodes a string as "modified" Windows-1252 bytes.
 *
 * <p>The mapping is standard Windows-1252 except that the five byte values it
 * leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) are mapped to the C1 control
 * characters with the same code (U+0081, U+008D, U+008F, U+0090, U+009D).
 * Every UTF-16 code unit of the input produces exactly one output byte; code
 * units that have no byte in this mapping are replaced with '?' (0x3F).
 *
 * <p>Typical use: text that was UTF-8 but got decoded as this charset can be
 * turned back into the original UTF-8 bytes with this method.
 *
 * @param str the text to encode; must not be null
 * @return a new array of {@code str.length()} bytes
 * @throws NullPointerException if {@code str} is null
 */
public static byte[] getBytesModifiedW1252( String str ) {
    // Characters assigned to bytes 0x80..0x9F, in byte order. Every other byte
    // value maps to the character with the same code, so this 32-entry block is
    // the only part of the table that has to be spelled out. Being a compile-time
    // constant, it is not rebuilt on each call.
    final String highBlock =
            "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F"
          + "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178";
    final byte replacement = (byte) 0x3F; // '?'

    final int length = str.length();
    final byte[] ret = new byte[length];

    for ( int i = 0; i < length; ++i ) {
        final char c = str.charAt(i);
        if ( c < 0x80 || ( c >= 0xA0 && c <= 0xFF ) ) {
            // ASCII and the upper Latin-1 range encode as themselves.
            ret[i] = (byte) c;
        } else {
            // Either one of the 0x80..0x9F characters or not encodable at all.
            final int offset = highBlock.indexOf(c);
            ret[i] = offset >= 0 ? (byte) (0x80 + offset) : replacement;
        }
    }

    return ret;
}

/**
 * Demo: repairs a mojibake string by re-encoding it with the modified
 * Windows-1252 mapping and decoding the resulting bytes as UTF-8.
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    // The complete mojibake form of the target text, written with Unicode escapes
    // so that the invisible U+0090 after the first letter and the garbled tail
    // survive copy/paste and do not depend on the source file's encoding.
    String garbled = "\u00C4\u0090\u00E1\u00BB\u2014 ti\u00E1\u00BA\u00BFn";
    byte[] bytes = getBytesModifiedW1252( garbled );
    System.out.println(new String(bytes, "UTF-8"));
    //Đỗ tiến
}

Here's the opposite:

/**
 * Decodes bytes as "modified" Windows-1252 text.
 *
 * <p>The mapping is standard Windows-1252 except that the five byte values it
 * leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) decode to the C1 control
 * characters with the same code. Every input byte therefore yields exactly one
 * character and no byte value is rejected.
 *
 * @param bytes the bytes to decode; must not be null
 * @return a string of {@code bytes.length} characters
 * @throws NullPointerException if {@code bytes} is null
 */
public static String getStringModifiedW1252( byte[] bytes ) {

    // Characters assigned to bytes 0x80..0x9F, in byte order. All other byte
    // values decode to the character with the same code, so no full 256-entry
    // table is needed. As a compile-time constant it is not rebuilt per call.
    final String highBlock =
            "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F"
          + "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178";

    final StringBuilder ret = new StringBuilder(bytes.length);

    for ( final byte b : bytes ) {
        // Java bytes are signed; mask to get the unsigned value 0..255.
        final int unsigned = b & 0xFF;
        final boolean inHighBlock = unsigned >= 0x80 && unsigned < 0xA0;
        ret.append( inHighBlock ? highBlock.charAt(unsigned - 0x80) : (char) unsigned );
    }

    return ret.toString();

}

/**
 * Demo of the reverse direction: takes correct text, encodes it as UTF-8 and
 * misreads those bytes with the modified Windows-1252 mapping, reproducing the
 * mojibake form.
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    byte[] utf8 = "Đỗ tiến".getBytes("UTF-8");
    System.out.println(getStringModifiedW1252( utf8 ));
    // Prints the garbled form: Äá»— tiáº¿n (with an invisible U+0090 right after the Ä)
}

You probably want to stash the map and array somewhere instead of creating them when the methods are called

try this

// Encode the garbled text with a charset (Cp1258 / ISO-8859-1)...
byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
// ...and decode the bytes again with the SAME charset that produced them.
// NOTE(review): a same-charset round trip presumably just reproduces the input
// (apart from characters the charset cannot encode), so this looks unlikely to
// repair the text - confirm before relying on it.
String unicode1 = new String(asciiBytes1, "Cp1258");
String unicode2 = new String(asciiBytes2, "ISO-8859-1");
// NOTE(review): the trailing output comments are identical to the ones in the
// question's snippet; they may not reflect what this variant prints - verify.
System.out.println(unicode1);//�?ỗ tiến
System.out.println(unicode2);//Đ�? tiến

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM