[英]Convert from UTF-8 to unicode c++
如何在 C++ 應用程序中轉換字符 ú?應用程序收到的是該字符的 UTF-8 編碼 %C3%BA,需要將其存儲為對應的 Unicode 碼位 %FA。我只想知道該如何編寫代碼來完成這一轉換過程。
我昨天剛剛寫了一些代碼來做到這一點......
我並不是說這是做到這一點的“完美”方式,但它似乎適用於我運行過的所有測試用例(兩個方向的轉換我都寫了)。
把 "%NN" 轉換成整數值的部分就留給你自己完成。
#include <iostream>
#include <deque>
/**
 * Encode a single code point as a sequence of UTF-8 byte values.
 *
 * Values below 128 are returned unchanged as a one-element sequence. Larger
 * values are split into 6-bit groups, least significant first; each group
 * becomes a continuation byte (10xxxxxx) and the remaining high bits are
 * merged into the lead byte, whose marker grows by one bit per continuation.
 *
 * No range validation is performed: values above 0x1FFFFF yield sequences
 * longer than four bytes.
 *
 * @param charcode code point to encode
 * @return byte values of the encoding, lead byte first
 */
std::deque<int> unicode_to_utf8(int charcode)
{
    constexpr int kPayloadBits = 6;
    constexpr int kPayloadMask = (1 << kPayloadBits) - 1;
    constexpr int kContinuationTag = 0x80;

    if (charcode < 128)
    {
        return {charcode};
    }

    std::deque<int> bytes;
    int lead_marker = 0xC0;

    // lead_room is the number of payload bits the lead byte can still hold;
    // it shrinks by one every time a continuation byte is emitted.
    for (int lead_room = 6; charcode >= (1 << lead_room); --lead_room)
    {
        bytes.push_front(kContinuationTag | (charcode & kPayloadMask));
        charcode >>= kPayloadBits;
        lead_marker |= 1 << lead_room;
    }

    bytes.push_front(lead_marker | charcode);
    return bytes;
}
/**
 * Decode one UTF-8 encoded code point from the front of `coded`.
 *
 * The elements that make up the decoded sequence are removed from `coded`,
 * so the function can be called repeatedly to walk through a byte stream.
 *
 * An element below 128 is returned unchanged. Otherwise the lead byte
 * determines how many continuation bytes (10xxxxxx) follow, and their 6-bit
 * payloads are combined with the payload bits of the lead byte.
 *
 * Overlong encodings, surrogates and values above 0x10FFFF are not rejected.
 *
 * @param coded byte values to decode from; consumed elements are popped
 * @return the decoded code point, or -1 when `coded` is empty, the lead
 *         element is not a valid lead byte (0x80-0xBF, or above 0xFD), or
 *         the sequence is truncated or contains a non-continuation element.
 *         On such an error the elements consumed so far stay removed, and an
 *         offending non-continuation element is left in place.
 */
int utf8_to_unicode(std::deque<int> &coded)
{
    const int other_bits = 6;
    const int other_mask = (1 << other_bits) - 1;

    // front() on an empty deque is undefined behaviour, so check first.
    if (coded.empty())
    {
        return -1;
    }

    int t = coded.front();
    coded.pop_front();
    if (t < 128)
    {
        return t;
    }

    // 0x80-0xBF is a stray continuation byte. 0xFE and 0xFF would request
    // six or more continuation bytes and shift past the width of int.
    if (t < 0xC0 || t > 0xFD)
    {
        return -1;
    }

    int charcode = 0;
    int high_bit_mask = other_mask; // loses one bit per continuation byte
    int high_bit_shift = 0;         // how far the lead byte has been shifted
    int total_bits = 0;             // payload bits gathered from continuations

    // Each leading 1 bit after the first announces one continuation byte.
    // Shifting the lead byte left exposes the next marker bit in position 6.
    while ((t & 0xC0) == 0xC0)
    {
        if (coded.empty())
        {
            return -1; // sequence is truncated
        }
        const int next = coded.front();
        if (next < 0x80 || next > 0xBF)
        {
            return -1; // not a continuation byte; leave it for the caller
        }
        coded.pop_front();

        t = (t << 1) & 0xff;
        total_bits += other_bits;
        high_bit_mask >>= 1;
        high_bit_shift++;
        charcode = (charcode << other_bits) | (next & other_mask);
    }

    // Undo the shifts to recover the lead byte's payload and place it above
    // the bits collected from the continuation bytes.
    charcode |= ((t >> high_bit_shift) & high_bit_mask) << total_bits;
    return charcode;
}
/**
 * Interactive round-trip demo.
 *
 * Repeatedly reads a decimal code point from standard input, prints its
 * UTF-8 encoding as "\x.." hex bytes, then decodes those bytes again and
 * prints the result in decimal and hex. Ends when input can no longer be
 * read as an integer (end of input or non-numeric text).
 */
int main()
{
    int charcode;
    for(;;)
    {
        std::cout << "Enter unicode value:" << std::endl;

        // A failed extraction leaves the stream in a failed state, so every
        // later read would fail as well and the loop would never terminate.
        if (!(std::cin >> charcode))
        {
            break;
        }

        std::deque<int> encoded = unicode_to_utf8(charcode);
        for (int byte_value : encoded)
        {
            std::cout << "\\x" << std::hex << byte_value << " ";
        }
        std::cout << std::endl;

        // Decoding pops the consumed elements from `encoded`.
        int decoded = utf8_to_unicode(encoded);
        std::cout << "reversed:" << std::dec << decoded << std::hex << " in hex:" << decoded << std::endl;
    }
    return 0;
}
這個功能其實已經包含在標準庫中:
#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale> // for std::wstring_convert
// Converter between UTF-8 encoded byte strings and UTF-32 code point strings.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;

// Decodes a UTF-8 string into its sequence of Unicode code points.
int main() {
    // NOTE: an ordinary string literal holds UTF-8 bytes only when the
    // compiler's execution character set is UTF-8.
    const std::string utf8_bytes{"ú"};

    // One char32_t element per decoded code point.
    const std::u32string unicode_codepoints{conv_utf8_utf32.from_bytes(utf8_bytes)};

    return 0;
}
反方向的轉換(從 Unicode 碼位轉回 UTF-8 字節)可以使用 conv_utf8_utf32.to_bytes 完成。
下面是使用 printf 以 %hex 格式打印的示例:
#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale> // for std::wstring_convert
#include <cstdio>
// Converter between UTF-8 encoded byte strings and UTF-32 code point strings.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;

// Prints a UTF-8 string in percent-hex form twice: first byte by byte, then
// code point by code point after decoding it to UTF-32.
int main() {
    // NOTE: an ordinary string literal holds UTF-8 bytes only when the
    // compiler's execution character set is UTF-8.
    std::string utf8_bytes = "ú";

    // print the bytes in %hex format
    for (char byte : utf8_bytes) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended,
        // and zero-pad so values below 0x10 print as "%0A" rather than "% A".
        std::printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
    }
    std::printf("\n");

    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    // print the code points in %hex format
    for (char32_t chr : unicode_codepoints) {
        // %X expects an unsigned int argument, so convert explicitly.
        std::printf("%%%02X", static_cast<unsigned int>(chr));
    }
    std::printf("\n");
    return 0;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.