简体   繁体   中英

How to escape utf-8 and convert utf-8 code to bytes

I need to convert a string (char[]) into a string with Unicode escapes (in the format \u0105). I wrote code that converts a string read from a file, for example:

"ąćżźóÓŻŹĆĄŚś ƐƑ ƁƂ ЁЂ" => "\u0105\u0107\u017C\u017A\u00F3\u00D3\u017B\u0179\u0106\u0104\u015A\u015B \u0190\u0191 \u0181\u0182 \u0401\u0402".

Now I need to write the reverse, so for example: "\u0105" => "ą" (char[] = {0xC4, 0x85}).

How can I do this (using only C)?
Let's say I have the Unicode code point in uint32_t code = 0x0105;

Here is my code to convert a string to Unicode escapes:

/*
 * Base code point contributed by a UTF-8 lead byte. The code that consumes
 * this table indexes it with (lead byte & 0x3F), so entry i belongs to the
 * lead byte 0xC0 + i:
 *   - entries  0..31 (lead 0xC0..0xDF) advance in steps of 0x40,
 *   - entries 32..47 (lead 0xE0..0xEF) start at 0x0800 and then advance in
 *     steps of 0x1000.
 */
static const uint32_t unicode[48] = {
    /* lead 0xC0..0xC7 */ 0x0000, 0x0040, 0x0080, 0x00C0, 0x0100, 0x0140, 0x0180, 0x01C0,
    /* lead 0xC8..0xCF */ 0x0200, 0x0240, 0x0280, 0x02C0, 0x0300, 0x0340, 0x0380, 0x03C0,
    /* lead 0xD0..0xD7 */ 0x0400, 0x0440, 0x0480, 0x04C0, 0x0500, 0x0540, 0x0580, 0x05C0,
    /* lead 0xD8..0xDF */ 0x0600, 0x0640, 0x0680, 0x06C0, 0x0700, 0x0740, 0x0780, 0x07C0,
    /* lead 0xE0..0xE7 */ 0x0800, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000,
    /* lead 0xE8..0xEF */ 0x8000, 0x9000, 0xA000, 0xB000, 0xC000, 0xD000, 0xE000, 0xF000,
};

...

    /*
     * Reads utf.txt byte by byte and builds `result`, a NUL-terminated string
     * in which printable ASCII (0x20..0x7E) is copied verbatim and every
     * 2- or 3-byte UTF-8 sequence is replaced by a \uXXXX escape.
     * Any other byte (control characters, stray continuation bytes, 4-byte
     * lead bytes, truncated sequences) stops the conversion with a diagnostic;
     * `result` then holds what was converted up to that point.
     * NOTE(review): overlong forms (lead 0xC0/0xC1) and surrogate code points
     * are not rejected.
     */
    FILE* fh = fopen("utf.txt", "r");
    char* result;
    char* tmpMemoryBuffer = NULL;
    size_t currentSize = 255, currentIndex = 0;
    result = (char*) calloc(currentSize, sizeof(char));

    if (result == NULL && fh != NULL)
    {
        /* Without an output buffer nothing can be converted: release the
         * file and skip the conversion below. */
        fprintf(stderr, "Out of memory while allocating the result buffer\n");
        fclose(fh);
        fh = NULL;
    }

    if (fh != NULL)
    {
        /* getc() returns an int; keeping it in an int is the only way to
         * tell EOF apart from a real 0xFF byte. */
        int c2, c = getc(fh);
        uint32_t tmp = 0;

        while (c != EOF)
        {
            /* The largest write in one iteration is a 6-character escape plus
             * the terminating NUL that snprintf appends, so keep 7 bytes free. */
            if (currentIndex + 7 > currentSize)
            {
                tmpMemoryBuffer = (char*) realloc(result, sizeof(char) * (currentSize + 255));
                if (tmpMemoryBuffer == NULL)
                {
                    /* result is untouched by a failed realloc and is still valid. */
                    fprintf(stderr, "Out of memory while growing the result buffer\n");
                    break;
                }
                result = tmpMemoryBuffer;
                tmpMemoryBuffer = NULL;
                currentSize += 255;
            }

            if (c >= 0x20 && c <= 0x7E)
            {
                //Is normal char
                printf("Normal:\t%c\n", c);
                result[currentIndex++] = (char) c;
            }
            else if (c >= 0xC0 && c <= 0xEF)
            {
                //Is unicode
                /* 110xxxxx starts a 2-byte sequence (5 payload bits),
                 * 1110xxxx starts a 3-byte sequence (4 payload bits). */
                int pending = (c >= 0xE0) ? 2 : 1;
                tmp = (uint32_t) (c & ((c >= 0xE0) ? 0x0F : 0x1F));

                while (pending > 0)
                {
                    c2 = getc(fh);
                    /* Every continuation byte must look like 10xxxxxx. */
                    if (c2 == EOF || (c2 & 0xC0) != 0x80)
                    {
                        break;
                    }
                    tmp = (tmp << 6) | (uint32_t) (c2 & 0x3F);
                    pending--;
                }

                if (pending > 0)
                {
                    /* Truncated or malformed sequence. */
                    printf("Wrong format for 0x%X\n", (unsigned) c);
                    break;
                }

                /* tmp is at most 0xFFFF here, so the escape is exactly 6 characters. */
                snprintf(result + currentIndex, currentSize - currentIndex, "\\u%04X", (unsigned) tmp);
                currentIndex += 6;
                printf("Unicode:\t%04X\n", (unsigned) tmp);

            }
            else
            {
                printf("Wrong format for 0x%X\n", (unsigned) c);
                break;
            }
            c = getc(fh);
        }

        result[currentIndex] = '\0';
        fclose(fh);
...
        free(result);     

Is there a better way to do this?

In case anyone searches for this, I am posting this as a continuation of the code in the question:

char result[] = "\ą\ć\ż\ź\ó";

char* resultStr = (char*)malloc(sizeof(char) * currentIndex + 1);
size_t reIndex = 0;

for (size_t i = 0; i < strlen(result); i++) 
{
    if (result[i] == '\\')
    {
        if (result[i + 1] != '\0')
        {
            i++;
            switch (result[i])
            {
                case 'u':
                    if (result[i + 1] != '\0' && result[i + 2] != '\0' && result[i + 3] != '\0' && result[i + 4] != '\0')
                    {
                        const char hexstring[5] = {result[i + 1], result[i + 2], result[i + 3], result[i + 4], '\0'};
                        uint32_t code = (uint32_t)strtol(hexstring, NULL, 16);
                        printf ("Code = 0x%X\n", code);
                        uint8_t firstByte = 47;
                        uint8_t secondByte = 0;

                        for (size_t i = 1; i < 48; i++)
                        {
                            if (unicode[i] > code)
                            {
                                firstByte = i - 1;
                                secondByte = (uint8_t)(code - unicode[i - 1]);
                                break;
                            }
                        }

                        firstByte |= 0xC0;
                        secondByte |= 0x80;
                        resultStr[reIndex++] = (char)firstByte;
                        resultStr[reIndex++] = (char)secondByte;
                        i += 4;
                    }
                break;
            }
        }
        else
        {
            //Error
        }
    }
    else
    {
        resultStr[reIndex++] = result[i];
    }
}

resultStr[reIndex] = '\0';
printf("Result = %s\n", resultStr);

This requires refactoring and some additional features, such as handling '\\n', '\\t', '\\r', but it is light and fast.

Does anyone have better ideas?

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM