繁体   English   中英

如何转义utf-8并将utf-8代码转换为字节

[英]How to escape utf-8 and convert utf-8 code to bytes

我需要将字符串(char[])转换为带有 Unicode 转义的字符串(格式为:\u0105)。我编写了从文件读取并转换字符串的代码,示例:

“ąćżźóÓŻŹĆĄŚśƐƑƁƂЁЂ”=>“\u0105\u0107\u017C\u017A\u00F3\u00D3\u017B\u0179\u0106\u0104\u015A\u015B\u0190\u0191\u0181\u0182\u0401\u0402”。

现在我需要实现反向转换,例如:“\u0105”=>“ą”(char[] = {0xC4, 0x85})。

怎么做(只使用C)?
假设我已经把代码值保存在变量中:uint32_t code = 0x0105;(这个值实际上是 Unicode 码点)。

下面是我将字符串转换为 Unicode 转义的代码:

/*
 * Base code points for a lookup keyed on (UTF-8 lead byte & 0x3F).
 *
 *   entries  0..31 : i * 0x40            (0x0000 .. 0x07C0)
 *   entry    32    : 0x0800
 *   entries 33..47 : (i - 32) * 0x1000   (0x1000 .. 0xF000)
 *
 * NOTE(review): the upper rows only give a coarse base for lead bytes
 * 0xE0..0xEF; a three-byte sequence needs both continuation bytes to be
 * decoded exactly - confirm how callers use these entries.
 */
static const uint32_t unicode[48] = {
    /*  0.. 7 */ 0x0000, 0x0040, 0x0080, 0x00C0, 0x0100, 0x0140, 0x0180, 0x01C0,
    /*  8..15 */ 0x0200, 0x0240, 0x0280, 0x02C0, 0x0300, 0x0340, 0x0380, 0x03C0,
    /* 16..23 */ 0x0400, 0x0440, 0x0480, 0x04C0, 0x0500, 0x0540, 0x0580, 0x05C0,
    /* 24..31 */ 0x0600, 0x0640, 0x0680, 0x06C0, 0x0700, 0x0740, 0x0780, 0x07C0,
    /* 32..39 */ 0x0800, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000,
    /* 40..47 */ 0x8000, 0x9000, 0xA000, 0xB000, 0xC000, 0xD000, 0xE000, 0xF000,
};

...

    /*
     * Read "utf.txt" as UTF-8 and build in `result` a NUL-terminated copy in
     * which printable ASCII (0x20..0x7E) is kept as-is and every two- or
     * three-byte UTF-8 sequence is replaced by a six-character \uXXXX escape.
     * Any other byte (control characters, malformed or four-byte sequences)
     * prints "Wrong format" and stops the conversion.
     */
    FILE* fh = fopen("utf.txt", "r");
    char* result;
    char* tmpMemoryBuffer;
    size_t currentSize = 255, currentIndex = 0;
    /* calloc hands back zero-filled storage, replacing malloc + memset. */
    result = calloc(currentSize, sizeof(char));


    if (fh != NULL)
    {
        /*
         * getc() returns an int so that EOF stays distinguishable from every
         * byte value; narrowing it to uint8_t makes the EOF test meaningless.
         * If the buffer could not be allocated, skip the loop entirely.
         */
        int c = (result != NULL) ? getc(fh) : EOF;
        uint32_t tmp = 0;

        if (result == NULL)
        {
            printf("Out of memory\n");
        }

        while (c != EOF)
        {
            /*
             * Keep room for the longest item written per iteration (a
             * six-character escape) plus the terminating NUL. The invariant
             * currentIndex < currentSize makes the subtraction safe.
             */
            if (currentSize - currentIndex < 7)
            {
                /* realloc preserves the contents; keep the old pointer on failure. */
                tmpMemoryBuffer = realloc(result, sizeof(char) * (currentSize + 255));
                if (tmpMemoryBuffer == NULL)
                {
                    printf("Out of memory\n");
                    break;
                }
                result = tmpMemoryBuffer;
                currentSize += 255;
            }

            if (c >= 0x20 && c <= 0x7E)
            {
                //Is normal char
                printf("Normal:\t%c\n", c);
                result[currentIndex++] = (char) c;
            }
            else if (c >= 0xC2 && c <= 0xEF)
            {
                //Is unicode
                /*
                 * 0xC2..0xDF start a two-byte sequence (5 payload bits),
                 * 0xE0..0xEF a three-byte sequence (4 payload bits).
                 * 0xC0 and 0xC1 could only encode overlong forms, so they
                 * fall through to the error branch.
                 */
                const int isThreeByte = (c >= 0xE0);
                int continuationLeft = isThreeByte ? 2 : 1;
                int wellFormed = 1;

                tmp = (uint32_t) c & (isThreeByte ? 0x0Fu : 0x1Fu);

                while (continuationLeft-- > 0)
                {
                    const int next = getc(fh);

                    /* Every continuation byte must look like 10xxxxxx. */
                    if (next == EOF || (next & 0xC0) != 0x80)
                    {
                        wellFormed = 0;
                        break;
                    }
                    tmp = (tmp << 6) | ((uint32_t) next & 0x3Fu);
                }

                /* A three-byte form below U+0800 is an overlong encoding. */
                if (!wellFormed || (isThreeByte && tmp < 0x0800))
                {
                    printf("Wrong format for 0x%X\n", (unsigned) c);
                    break;
                }

                /* tmp <= 0xFFFF here, so the escape is exactly six characters. */
                snprintf(result + currentIndex, currentSize - currentIndex, "\\u%04X", (unsigned) tmp);
                currentIndex += 6;
                printf("Unicode:\t%04X\n", (unsigned) tmp);

            }
            else
            {
                printf("Wrong format for 0x%X\n", (unsigned) c);
                break;
            }
            c = getc(fh);
        }

        if (result != NULL)
        {
            result[currentIndex] = '\0';
        }
        fclose(fh);
...
        free(result);     

还有更好的方法吗?

如果以后有人搜索到这里:我写了下面这段代码,作为对这个问题的延续:

char result[] = "\ą\ć\ż\ź\ó";

char* resultStr = (char*)malloc(sizeof(char) * currentIndex + 1);
size_t reIndex = 0;

for (size_t i = 0; i < strlen(result); i++) 
{
    if (result[i] == '\\')
    {
        if (result[i + 1] != '\0')
        {
            i++;
            switch (result[i])
            {
                case 'u':
                    if (result[i + 1] != '\0' && result[i + 2] != '\0' && result[i + 3] != '\0' && result[i + 4] != '\0')
                    {
                        const char hexstring[5] = {result[i + 1], result[i + 2], result[i + 3], result[i + 4], '\0'};
                        uint32_t code = (uint32_t)strtol(hexstring, NULL, 16);
                        printf ("Code = 0x%X\n", code);
                        uint8_t firstByte = 47;
                        uint8_t secondByte = 0;

                        for (size_t i = 1; i < 48; i++)
                        {
                            if (unicode[i] > code)
                            {
                                firstByte = i - 1;
                                secondByte = (uint8_t)(code - unicode[i - 1]);
                                break;
                            }
                        }

                        firstByte |= 0xC0;
                        secondByte |= 0x80;
                        resultStr[reIndex++] = (char)firstByte;
                        resultStr[reIndex++] = (char)secondByte;
                        i += 4;
                    }
                break;
            }
        }
        else
        {
            //Error
        }
    }
    else
    {
        resultStr[reIndex++] = result[i];
    }
}

resultStr[reIndex] = '\0';
printf("Result = %s\n", resultStr);

这段代码还需要重构并补充一些功能(例如处理 '\n'、'\t'、'\r' 等转义),但它轻量且快速。

谁有更好的想法?

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM