繁体   English   中英

C ++将UTF-8字符串作为UTF-16输出到std :: cout

[英]C++ Output UTF-8 strings as UTF-16 to std::cout

我有很多使用C ++ 03,STL和Boost 1.54基于UTF-8编写的代码。
所有代码都通过 std::cout 和 std::cerr 将数据输出到控制台。
我不想在我的代码库中引入新的库或切换到C ++ 11,但是我想将代码移植到Windows。
我不想为了改用 std::wcout / std::wcerr 而重写所有使用 std::cout / std::cerr 的代码,但我仍然希望控制台上的所有内容都以UTF-16显示。
有没有一种方法,可以在 std::cout 和 std::cerr 把数据写入控制台之前,将所有基于char的数据(采用UTF-8编码)转换为基于wchar_t的数据(采用UTF-16编码)?
很高兴看到仅使用C ++ 03,STL和Boost 1.54的解决方案。
我发现Boost Locale具有针对单个字符串的转换功能,并且在Boost Spirit中有一个UTF-8到UTF-32迭代器可用,但是我没有找到任何可以在不使用其他库、也不切换到C++11的前提下将UTF-8转换为UTF-16的codecvt facet。

提前致谢。

PS:我知道可以用类似这样的方法来实现,但我希望在这里找到一个更好的解决方案。

我没有想出一个比已经暗示的更好的解决方案。
因此,我将在这里向任何对此感兴趣的人分享基于streambuf的解决方案。 希望有人会提出更好的解决方案并在这里分享。

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <string>


/* Character type of the argv elements handed to test_main(): wchar_t when
 * building for Windows with _UNICODE on an MSVC/MSVCRT toolchain, plain
 * char everywhere else.
 */
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#define TEST_ARG_TYPE wchar_t
#else /* not windows, unicode */
#define TEST_ARG_TYPE char
#endif /* windows, unicode */


/* Fallback definition of the UTF-16 text mode flag that main() passes to
 * _setmode(), for toolchain headers that do not provide it.
 */
#ifndef _O_U16TEXT
#define _O_U16TEXT 0x20000
#endif


/**
 * Counts how many leading bytes of a buffer can be converted without
 * splitting a multi-byte UTF-8 character.
 *
 * The buffer is walked one sequence at a time. The length of each sequence
 * is derived from its lead byte (the legacy 5- and 6-byte forms are
 * accepted); the continuation bytes inside a sequence are not validated.
 * Scanning stops at the first NUL byte, at the end of the buffer, or in
 * front of a sequence whose announced length exceeds the remaining bytes.
 *
 * A run of stray continuation bytes (10xxxxxx without a lead byte) is
 * counted byte by byte so that it is passed on instead of blocking the
 * scan; the run ends at the first byte that is not a continuation byte.
 *
 * @param buf   Start of the UTF-8 data; must be readable for size bytes.
 * @param size  Number of bytes available in buf.
 * @return      Number of bytes, counted from buf, that end on a sequence
 *              boundary.
 */
static size_t countValidUtf8Bytes(const unsigned char * buf, const size_t size) {
    size_t i = 0;
    const unsigned char * src = buf;
    while (i < size && (*src) != 0) {
        const unsigned char lead = *src;
        size_t charSize = 0;
        if (lead >= 0xFC) {
            charSize = 6;
        } else if (lead >= 0xF8) {
            charSize = 5;
        } else if (lead >= 0xF0) {
            charSize = 4;
        } else if (lead >= 0xE0) {
            charSize = 3;
        } else if (lead >= 0xC0) {
            charSize = 2;
        } else if (lead >= 0x80) {
            /* Stray continuation bytes (should never happen): consume the
             * current byte, then every directly following continuation
             * byte. Lead bytes and ASCII bytes end the run so they are
             * classified on their own in the next iteration.
             */
            do {
                ++charSize;
            } while ((i + charSize) < size && (src[charSize] & 0xC0) == 0x80);
        } else {
            /* ASCII character. */
            charSize = 1;
        }
        /* The sequence is cut off by the end of the buffer: do not count it. */
        if ((i + charSize) > size) break;
        i += charSize;
        src += charSize;
    }
    return i;
}


#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#include <locale>
#include <streambuf>
#include <boost/locale.hpp>

/* C runtime / Win32 headers and hand-written CRT declarations used by the
 * Windows-specific main() of this file.
 */
extern "C" {
#include <fcntl.h>
#include <io.h>
#include <windows.h>

/* Passed to __wgetmainargs() by main() as its fourth argument. As a global
 * without an initializer it starts out as 0.
 * NOTE(review): presumably this doubles as the runtime's wildcard-expansion
 * switch of the same name -- confirm against the toolchain in use.
 */
int _CRT_glob;
/* CRT routine main() calls to obtain the wide-character argument vector.
 * NOTE(review): the prototype is declared by hand here; verify the parameter
 * types and the void return type against the targeted CRT version.
 */
extern void __wgetmainargs(int *, wchar_t ***, wchar_t ***, int, int *);
}


/**
 * Output stream buffer that stages narrow (UTF-8) characters in a fixed-size
 * buffer and hands them to overflow()/sync(), which write them to a C FILE
 * stream.
 *
 * The object owns its staging buffer but not the FILE it writes to; the
 * FILE is neither flushed nor closed by the destructor.
 */
class Utf8ToUtf16Buffer : public std::basic_streambuf< char, std::char_traits<char> > {
private:
    char * outBuf; /* owned staging buffer of BUFFER_SIZE bytes */
    FILE * outFd;  /* destination stream, supplied by the caller */

    /* Not copyable: a copy would share outBuf and delete it twice.
     * Declared but intentionally never defined (C++03 idiom).
     */
    Utf8ToUtf16Buffer(const Utf8ToUtf16Buffer &);
    Utf8ToUtf16Buffer & operator=(const Utf8ToUtf16Buffer &);
public:
    static const size_t BUFFER_SIZE = 1024;
    typedef std::char_traits<char> traits_type;
    typedef traits_type::int_type int_type;
    typedef traits_type::pos_type pos_type;
    typedef traits_type::off_type off_type;

    /**
     * Creates a buffer that writes to the given FILE stream.
     *
     * @param o  Destination stream; must stay open while this object is in use.
     */
    explicit Utf8ToUtf16Buffer(FILE * o) : outBuf(new char[BUFFER_SIZE]), outFd(o) {
        /* Initialize the put pointer. Overflow won't get called until this
         * buffer is filled up, so we need to use valid pointers. The put
         * area ends one byte early so that overflow() always has room to
         * store the character it was called with.
         */
        this->setp(outBuf, outBuf + BUFFER_SIZE - 1);
    }

    /** Releases the staging buffer; pending data is not written. */
    ~Utf8ToUtf16Buffer() {
        delete[] outBuf;
    }
protected:
    /** Writes the staged data and stores c; defined out of line. */
    virtual int_type overflow(int_type c);
    /** Writes the staged data; defined out of line. */
    virtual int_type sync();
};


/**
 * Writes the staged UTF-8 data to the output FILE as wide characters.
 *
 * The character c (unless it is eof) is appended to the staged data first.
 * Only the part that ends on a UTF-8 sequence boundary, as reported by
 * countValidUtf8Bytes(), is converted and written; an incomplete trailing
 * sequence is moved to the front of the buffer and completed by later
 * writes.
 *
 * @param c  Character that did not fit into the put area, or eof when the
 *           call is only meant to flush (see sync()).
 * @return   A value different from eof on success; eof if writing failed or
 *           if unconvertible data had to be discarded.
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::overflow(Utf8ToUtf16Buffer::int_type c) {
    char * const iBegin = this->outBuf;
    char * iEnd = this->pptr();
    /* End of the put area: one byte short of the allocation, exactly as in
     * the constructor, so the append below never leaves the buffer.
     */
    char * const putEnd = this->outBuf + BUFFER_SIZE - 1;
    int_type result = traits_type::not_eof(c);

    /* Store the character that triggered the overflow in the byte that the
     * put area keeps in reserve.
     */
    if ( ! traits_type::eq_int_type(c, traits_type::eof()) ) {
        *iEnd = traits_type::to_char_type(c);
        iEnd++;
    }

    /* Calculate output data length. */
    const size_t iLen = static_cast<size_t>(iEnd - iBegin);
    const size_t iLenU8 = countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(iBegin), iLen);

    /* Convert string to UTF-16 and write to defined file descriptor. The
     * text is passed as an argument of a fixed format so that '%' in the
     * data is printed verbatim instead of being parsed as a conversion.
     */
    const std::wstring wide = boost::locale::conv::utf_to_utf<wchar_t>(std::string(iBegin, iBegin + iLenU8));
    if (fwprintf(this->outFd, L"%ls", wide.c_str()) < 0) {
        /* Failed to write data to output file descriptor. */
        result = traits_type::eof();
    }

    /* Reset the put pointers to indicate that the buffer is free. */
    const size_t overhead = iLen - iLenU8;
    if (overhead == 0) {
        this->setp(iBegin, putEnd);
    } else if (overhead < BUFFER_SIZE - 1) {
        /* Move incomplete UTF-8 characters remaining in buffer. */
        std::memmove(iBegin, iBegin + iLenU8, overhead);
        this->setp(iBegin + overhead, putEnd);
    } else {
        /* The leftover fills the whole put area, i.e. it will never become
         * convertible (e.g. data behind a NUL byte). Drop it and report the
         * failure rather than letting the put pointer run past the buffer.
         */
        this->setp(iBegin, putEnd);
        result = traits_type::eof();
    }

    return result;
}


/**
 * Flushes the staged data by running overflow() with the eof marker.
 *
 * @return  -1 if overflow() reported eof, 0 otherwise.
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::sync() {
    const int_type flushed = this->overflow(traits_type::eof());
    if (traits_type::eq_int_type(flushed, traits_type::eof())) {
        return -1;
    }
    return 0;
}

#endif /* windows, unicode */


int test_main(int argc, TEST_ARG_TYPE ** argv);


#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
/**
 * Windows/Unicode entry point: fetches the wide-character arguments,
 * switches stdout to UTF-16 text mode and routes std::cout through a
 * Utf8ToUtf16Buffer while test_main() runs.
 *
 * @return  The value returned by test_main().
 */
int main(/*int argc, char ** argv*/) {
    wchar_t ** wenpv = NULL;
    wchar_t ** wargv = NULL;
    int wargc = 0;
    int si = 0;
    /* this also creates the global variable __wargv */
    __wgetmainargs(&wargc, &wargv, &wenpv, _CRT_glob, &si);
    /* enable UTF-16 output to standard output console */
    _setmode(_fileno(stdout), _O_U16TEXT);
    std::locale::global(boost::locale::generator().generate("UTF-8"));
    Utf8ToUtf16Buffer u8cout(stdout);
    std::streambuf * out = std::cout.rdbuf();
    std::cout.rdbuf(&u8cout);
    /* process user defined main function */
    const int result = test_main(wargc, wargv);
    /* write out whatever is still staged in u8cout; it would be lost once
     * the original buffer is back in place
     */
    std::cout.flush();
    /* revert stream buffers to let cout clean up remaining memory correctly */
    std::cout.rdbuf(out);
    return result;
}
#else /* not windows or unicode */
/**
 * Default entry point: forwards the arguments to test_main() unchanged.
 *
 * @return  The value returned by test_main().
 */
int main(int argc, char ** argv) {
    return test_main(argc, argv);
}
#endif /* windows, unicode */

/**
 * Demo driver: for every prefix of a UTF-8 sample string it prints the
 * prefix length and the number of bytes countValidUtf8Bytes() accepts,
 * then the accepted part via std::cout, then a hex dump of the prefix.
 *
 * @return  EXIT_SUCCESS.
 */
int test_main(int /*argc*/, TEST_ARG_TYPE ** /*argv*/) {
    const std::string str("\x61\x62\x63\xC3\xA4\xC3\xB6\xC3\xBC\xE3\x81\x82\xE3\x81\x88\xE3\x81\x84\xE3\x82\xA2\xE3\x82\xA8\xE3\x82\xA4\xE4\xBA\x9C\xE6\xB1\x9F\xE6\x84\x8F");

    for (size_t i = 1; i <= str.size(); i++) {
        const std::string part(str.begin(), str.begin() + i);
        const size_t validByteCount = countValidUtf8Bytes(reinterpret_cast<const unsigned char *>(part.c_str()), part.size());
        /* %u expects unsigned int; size_t may be wider, so convert
         * explicitly (the values never exceed the sample string length).
         */
        wprintf(L"i = %u, v = %u\n", static_cast<unsigned int>(i), static_cast<unsigned int>(validByteCount));
        const std::string valid(str.begin(), str.begin() + validByteCount);
        /* std::endl flushes, which keeps this line in order with the
         * wprintf() output around it.
         */
        std::cout << valid << std::endl;
        for (size_t j = 0; j < part.size(); j++) {
            wprintf(L"%02X", static_cast<int>(part[j]) & 0xFF);
        }
        wprintf(L"\n");
    }

    return EXIT_SUCCESS;
}

我觉得这可能不是个好主意。但是,只要控制台使用正确的字体,我仍然认为它可以正常工作。

#include <iostream>
#include <windows.h>
//#include <io.h>
//#include <fcntl.h>

// Converts a NUL-terminated UTF-8 string to a UTF-16 std::wstring using
// MultiByteToWideChar.
//
// utf8:    NUL-terminated UTF-8 text; a null pointer yields an empty string.
// returns: the converted text without a terminating NUL, or an empty string
//          if the input is empty or the conversion fails.
std::wstring UTF8ToUTF16(const char* utf8)
{
    std::wstring utf16;
    if (utf8 == NULL)
    {
        return utf16;
    }
    // With an input length of -1 the reported size counts the terminating
    // NUL as well, so a size of 1 means "empty string".
    const int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (len > 1)
    {
        utf16.resize(static_cast<std::wstring::size_type>(len));
        const int written = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &utf16[0], len);
        // Cut off the terminator that was converted along with the text so
        // that size() is the number of real characters; a failed conversion
        // (written == 0) leaves the result empty.
        utf16.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);
    }
    return utf16;
}

std::ostream& operator << (std::ostream& os, const char* data)
{
    //_setmode(_fileno(stdout), _O_U16TEXT);
    SetConsoleCP(1200);
    std::wstring str = UTF8ToUTF16(data);
    DWORD slen = str.size();
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);

    MessageBoxW(NULL, str.c_str(), L"", 0);
    return os;
}

std::ostream& operator << (std::ostream& os, const std::string& data)
{
    //_setmode(_fileno(stdout), _O_U16TEXT);
    SetConsoleCP(1200);
    std::wstring str = UTF8ToUTF16(&data[0]);
    DWORD slen = str.size();
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), str.c_str(), slen, &slen, nullptr);
    return os;
}

std::wostream& operator <<(std::wostream& os, const wchar_t* data)
{
    DWORD slen = wcslen(data);
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data, slen, &slen, nullptr);
    return os;
}

std::wostream& operator <<(std::wostream& os, const std::wstring& data)
{
    WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), data.c_str(), data.size(), nullptr, nullptr);
    return os;
}

// Demo entry point: streams a Cyrillic string literal to std::cout, which
// resolves to the const char* overload declared in this file.
// NOTE(review): the bytes of the literal depend on the source/execution
// character set used by the compiler -- the overload expects UTF-8.
int main()
{
    const char* const sample = "Россия";
    std::cout << sample;
    return 0;
}

现在 std::cout 和 std::wcout 都使用WriteConsoleW函数。您必须根据自己的需要,为const char*、char*、std::string、char等类型分别重载它。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM