簡體   English   中英

從 UTF-8 轉換為 unicode C++

[英]Convert from UTF-8 to unicode c++

如何在 C++ 應用程序中轉換字符 ú:應用程序接收到的是該字符的 UTF-8 編碼 %C3%BA,需要將其存儲為對應的 Unicode 碼位 %FA。我只想知道該如何編寫代碼來完成這個轉換過程。

我昨天剛剛寫了一些代碼來做到這一點......

我並不是說這是做到這一點的“完美”方式,但它似乎適用於我運行過的所有測試用例(為了測試,我把編碼和解碼兩個方向都寫了)。

把 "%NN" 轉換成整數值的部分,就留給你自己處理了。

#include <iostream>
#include <deque>

/// Encodes a single code point as a sequence of UTF-8 byte values.
///
/// @param charcode  Code point to encode.
/// @return The encoded bytes, lead byte first, one byte value per element.
///         Values below 128 (including negative ones) are returned unchanged
///         as a single element.
std::deque<int> unicode_to_utf8(int charcode)
{
    std::deque<int> bytes;

    // Single-element case: the value is stored as-is.
    if (charcode < 0x80)
    {
        bytes.push_back(charcode);
        return bytes;
    }

    const int payload_mask = 0x3F;  // low six bits carried by each trailing byte
    int lead_marker = 0xC0;         // grows by one high bit per extra trailing byte

    // Peel six bits at a time off the low end until the remainder fits into
    // the payload area of the lead byte; that area shrinks by one bit for
    // every trailing byte emitted.
    for (int lead_capacity = 6; charcode >= (1 << lead_capacity); --lead_capacity)
    {
        bytes.push_front(0x80 | (charcode & payload_mask));
        charcode >>= 6;
        lead_marker |= 1 << lead_capacity;
    }

    // Whatever is left goes into the lead byte, in front of the trailing bytes.
    bytes.push_front(lead_marker | charcode);
    return bytes;
}


/// Decodes one UTF-8 sequence from the front of `coded`.
///
/// The bytes that make up the decoded sequence are removed from `coded`, so
/// the function can be called repeatedly to walk through a longer buffer.
///
/// Besides the 1-4 byte forms, the legacy 5- and 6-byte forms are accepted
/// because unicode_to_utf8() in this file can emit them for values above
/// 0x10FFFF. Overlong encodings and surrogate code points are NOT rejected.
///
/// @param coded  Byte values (0..255), lead byte first. Modified in place.
/// @return The decoded code point, or -1 when the input is malformed:
///         empty input, a byte value outside 0..255, a stray continuation
///         byte, a 0xFE/0xFF lead byte, a truncated sequence, or a trailing
///         byte that is not of the form 10xxxxxx. On error the lead byte has
///         been consumed, together with any continuation bytes that were
///         valid; the offending byte itself is left in `coded`.
int utf8_to_unicode(std::deque<int> &coded)
{
    const int invalid = -1;

    // front()/pop_front() on an empty deque is undefined behaviour.
    if (coded.empty())
    {
        return invalid;
    }

    const int lead = coded.front();
    coded.pop_front();

    if (lead < 0 || lead > 0xFF)
    {
        return invalid;
    }
    if (lead < 0x80)
    {
        return lead;  // plain ASCII
    }

    // Every leading 1 bit after the first announces one trailing byte and
    // takes one bit away from the payload stored in the lead byte.
    int trailing = 0;
    int lead_mask = 0x3F;
    for (int probe = lead; (probe & 0xC0) == 0xC0; probe = (probe << 1) & 0xFF)
    {
        ++trailing;
        lead_mask >>= 1;
    }

    // trailing == 0: stray continuation byte (10xxxxxx) used as a lead byte.
    // trailing  > 5: 0xFE / 0xFF, which would need more than 31 payload bits.
    if (trailing == 0 || trailing > 5)
    {
        return invalid;
    }

    int charcode = lead & lead_mask;
    for (int i = 0; i < trailing; ++i)
    {
        if (coded.empty())
        {
            return invalid;  // sequence was cut short
        }
        const int next = coded.front();
        if ((next & ~0x3F) != 0x80)
        {
            return invalid;  // not a 10xxxxxx continuation byte
        }
        coded.pop_front();
        charcode = (charcode << 6) | (next & 0x3F);
    }
    return charcode;
}

// Interactive round-trip demo: reads decimal code points from standard input,
// prints their UTF-8 bytes in hex, then decodes the bytes again and prints the
// result in decimal and hex. Stops at end of input or on non-numeric input.
int main()
{
    int charcode = 0;

    for (;;)
    {
        std::cout << "Enter unicode value:" << std::endl;
        // Without this check a failed extraction (EOF, non-numeric text)
        // leaves the stream in a failed state and the loop spins forever.
        if (!(std::cin >> charcode))
        {
            break;
        }

        auto x = unicode_to_utf8(charcode);
        for (auto c : x)
        {
            std::cout << "\\x" << std::hex << c << " ";
        }
        std::cout << std::endl;

        // utf8_to_unicode consumes the bytes it decodes from x.
        int c = utf8_to_unicode(x);
        std::cout << "reversed:" << std::dec << c << std::hex << " in hex:" << c << std::endl;
    }
    return 0;
}

這實際上在標准庫中:

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert


// Converter between UTF-8 encoded byte strings and char32_t strings.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;

// Minimal example: decode a UTF-8 string into a string of char32_t values.
int main() {
    const std::string encoded = "ú";

    // from_bytes turns the byte string into a std::u32string.
    const std::u32string code_points = conv_utf8_utf32.from_bytes(encoded);

    return 0;
}

反方向的轉換(從 Unicode 碼位轉回 UTF-8 字節)則使用 conv_utf8_utf32.to_bytes 完成。

使用 printf 以%hex格式打印的示例:

#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale>  // for std::wstring_convert
#include <cstdio>


// Converter between UTF-8 encoded byte strings and char32_t strings.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;


// Prints a UTF-8 string first as percent-encoded bytes, then as
// percent-encoded code points after decoding it with conv_utf8_utf32.
int main() {

    std::string utf8_bytes = "ú";
    // print the bytes in %hex format
    for (const char byte : utf8_bytes) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended,
        // and zero-pad ("%02X") so values below 0x10 print as e.g. %0A
        // instead of "% A".
        std::printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
    }
    std::printf("\n");


    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    // print the code points in %hex format
    for (const char32_t chr : unicode_codepoints) {
        // %X expects an unsigned int argument, so convert explicitly.
        std::printf("%%%02X", static_cast<unsigned int>(chr));
    }
    std::printf("\n");


    return 0;
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM