[英]How to correctly convert cp1251 text to utf-8 in C++? (Linux)
我正在尝试将 cp1251 文本转换为 utf-8。 我在这里所做的是在 cp1251 中创建给定符号的十六进制数的缓冲区,以便稍后将这些十六进制符号转换为 utf-8。 问题是有时转换后的字符串最后会有一些垃圾符号。
多次转换同一字符串的 output (203 ñòåï ÒÖÍÐ.466219.007 Èíòåðàêòèâíûé êîìïëåêñ NextPanel 43/NAUO1):
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1омплекс NextPanelВ 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO14V
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1sгцf¤№
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1п}тУї0bЊ.Z¶ї¬ЁЌ€/ГїаА Om›Ґї
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1sгцf¤№
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1™™™™Щ?
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
theNamePrefix = 203 степ ТЦНР.466219.007 Интерактивный комплекс NextPanel 43/NAUO1
char *ConvertWindows1251ToUtf8(string stringToConvert)
{
// string tmpString = stringToConvert.ToCString();
const char *tmpCharArray = stringToConvert.c_str();
vector<char> charBuffer;
char *buffer = new char[char_traits<char>::length(tmpCharArray)];
int latter = 0;
for (int i = 0; i < std::char_traits<char>::length(tmpCharArray); i++)
{
string tmpHexLatter;
int hexLatter = 0xFF & tmpCharArray[i];
stringstream ss;
ss << hex << hexLatter;
tmpHexLatter = ss.str();
if (hexLatter != 0xc3)
{
if (hexLatter != 0x30 && hexLatter != 0x31 && hexLatter != 0x32 && hexLatter != 0x33 && hexLatter != 0x34 && hexLatter != 0x35 && hexLatter != 0x36 && hexLatter != 0x37 && hexLatter != 0x38 && hexLatter != 0x39 && hexLatter != 0x61 && hexLatter != 0x62 && hexLatter != 0x63 && hexLatter != 0x64 && hexLatter != 0x65 && hexLatter != 0x66 && hexLatter != 0x67 && hexLatter != 0x68 && hexLatter != 0x69 && hexLatter != 0x6A && hexLatter != 0x6B && hexLatter != 0x6C && hexLatter != 0x6D && hexLatter != 0x6E && hexLatter != 0x6F && hexLatter != 0x70 && hexLatter != 0x71 && hexLatter != 0x72 && hexLatter != 0x73 && hexLatter != 0x74 && hexLatter != 0x75 && hexLatter != 0x76 && hexLatter != 0x77 && hexLatter != 0x78 && hexLatter != 0x79 && hexLatter != 0x7A && hexLatter != 0x41 && hexLatter != 0x42 && hexLatter != 0x43 && hexLatter != 0x44 && hexLatter != 0x45 && hexLatter != 0x46 && hexLatter != 0x47 && hexLatter != 0x48 && hexLatter != 0x49 && hexLatter != 0x4A && hexLatter != 0x4B && hexLatter != 0x4C && hexLatter != 0x4D && hexLatter != 0x4E && hexLatter != 0x4F && hexLatter != 0x50 && hexLatter != 0x51 && hexLatter != 0x52 && hexLatter != 0x53 && hexLatter != 0x54 && hexLatter != 0x55 && hexLatter != 0x56 && hexLatter != 0x57 && hexLatter != 0x58 && hexLatter != 0x59 && hexLatter != 0x5A && hexLatter != 0x2E)
hexLatter += 64;
if (hexLatter == 0x60)
hexLatter = 0xA0;
if (hexLatter == 0x6F)
hexLatter = 0x2F;
stringstream ss;
ss << hex << hexLatter;
string tmpHex = ss.str();
tmpHexLatter = "0x" + tmpHex;
latter = stoi(tmpHexLatter, {}, 16);
charBuffer.push_back((char)latter);
}
}
for (int i = 0; i < charBuffer.size(); i++)
{
buffer[i] = charBuffer[i];
}
return g_convert(buffer, -1, "utf-8", "Windows-1251", NULL, NULL, NULL);
/*string tmpStr = stringToConvert.ToCString();
std::unique_ptr<gchar, void (*)(gpointer)> p(g_convert(tmpStr.c_str(), -1, "utf-8", "Windows-1251", NULL, NULL, NULL), g_free);
return TCollection_AsciiString(p.get());*/
}
您根本不需要buffer
,您可以将charBuffer.data()
甚至stringToConvert.c_str()
传递给g_convert()
。
但是,更重要的是, buffer
和charBuffer
是null-terminated ,并且您不会将最终长度传递给g_convert()
,因此g_convert()
最终会超出范围,或者尝试转换未初始化的数据导致未定义行为的方式,这就是为什么您在结果末尾看到垃圾的原因。
附带说明一下,使用base=16
调用std::stoi()
时不需要"0x"
前缀。
另外,为什么要返回char*
而不是std::string
? 谁负责分配和释放 memory? 你真的应该让std::string
处理它。
如果您使用的是 linux,则可以使用iconv(3)
API在字符编码之间进行转换。 不幸的是,它是一个 C 接口,因此从 C++ 使用它可能有点难看,但它仍然比您尝试将代码点转换为十六进制字符串的任何操作要好。
我从 C++(C++17 或更高版本)挖出了一个旧包装器 class 我写过一次以使 iconv 更易于使用:
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <string>
#include <string_view>
#include <iconv.h>
// Wrap iconv API in a RAII-styled class for ease of use
class iconv_wrapper {
private:
iconv_t cd;
public:
iconv_wrapper(std::string_view from, std::string_view to = "UTF-8")
: cd{iconv_open(to.data(), from.data())} {
if (cd == reinterpret_cast<iconv_t>(-1)) {
// Need better error handling
std::cerr << "Unable to open converter from " << from << " to " << to
<< ": " << std::strerror(errno) << '\n';
std::exit(EXIT_FAILURE);
}
}
~iconv_wrapper() noexcept { iconv_close(cd); }
std::string convert(std::string_view);
};
std::string iconv_wrapper::convert(std::string_view input) {
// Work out the maximum output size (Assuming converting from a
// single-byte encoding to UTF-8) and allocate a buffer and the
// other args needed for iconv
std::size_t insize = input.size();
std::size_t outsize = insize * 4;
std::size_t orig_outsize = outsize;
auto outbuf = std::make_unique<char[]>(outsize);
char *indata = const_cast<char *>(&input[0]);
char *outdata = &outbuf[0];
// Convert the input argument
std::size_t ret = iconv(cd, &indata, &insize, &outdata, &outsize);
if (ret == static_cast<std::size_t>(-1)) {
// Need better error handling
std::cerr << "Couldn't convert input data: " << std::strerror(errno)
<< '\n';
std::exit(EXIT_FAILURE);
}
// And return it
return std::string(outbuf.get(), orig_outsize - outsize);
}
int main(int argc, char **argv) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " cp1251-encoded-file\n";
return EXIT_FAILURE;
}
std::ifstream in{argv[1], std::ios_base::in | std::ios_base::binary};
if (!in) {
std::cerr << "Unable to open " << argv[1] << " for reading!\n";
return EXIT_FAILURE;
}
iconv_wrapper conv{"CP1251"};
std::string input{std::istreambuf_iterator<char>{in},
std::istreambuf_iterator<char>{}};
std::cout << conv.convert(input);
return 0;
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.