UTF8 string to int

Question

I have been struggling for some time with trying to extract an int from a UTF8 file:

#include <iostream>
#include <fstream>
#include <sstream>

using namespace std;

// Opens UTF8.txt, tries to read an integer from its first line, then prints
// that integer followed by the file's second line.
int main()
{
    std::ifstream input("UTF8.txt");
    if (!input.is_open())
        return 0; // nothing is printed when the file cannot be opened

    // First line: run it through a string stream to extract the number.
    std::string text;
    std::getline(input, text);
    std::istringstream parser(text);
    int number;
    parser >> number;
    if (parser.fail())
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(input, text);
    std::cout << number << std::endl << text << std::endl;
    input.close();
    return 0;
}

The file contains 2 lines: "42" and "è_é", and is saved in Notepad as UTF8. The above works when the file is ANSI, but fails when it is Unicode. I've tried a number of things, the most promising one being to set the locale, but I would like the program to be independent of the locale of the computer (i.e. read Chinese characters even if the PC is a US one). Honestly, I'm out of ideas now. I'd like to avoid using CStrings from Qt if possible.

UPDATE

The following displays "0", "Error parsing" because of one weird character at the very beginning of the file. An empty line, discarded when read, just before the number makes it work but I can't change the file in the final program. Accents are not displayed properly in the console, but when I write the output to a file all is well and that's all I need. So it's only that issue with the beginning of the file!

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Opens UTF8.srt with a codecvt_utf8 facet (consume_header mode) imbued on the
// stream, tries to read an integer from the first line, then prints that
// integer followed by the second line.
int main()
{
    std::ifstream input("UTF8.srt");

    // Build a locale that carries the UTF-8 facet and install it on the stream.
    // NOTE(review): the facet is instantiated for wchar_t while the stream is a
    // narrow std::ifstream; presumably it is never consulted, which would match
    // the reported "Error parsing" result - confirm.
    const std::locale utf8_locale(input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);

    if (!input.is_open())
        return 0; // nothing is printed when the file cannot be opened

    // First line: run it through a string stream to extract the number.
    std::string text;
    std::getline(input, text);
    std::istringstream parser{text};
    int number;
    parser >> number;
    if (parser.fail()) {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(input, text);
    std::cout << number << std::endl << text << std::endl;
    input.close();
    return 0;
}

SOLUTION

The following works, with the input file content as follows:

5
bla bla é_è

6
truc è_é

Code:

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF).
// `s` points at `size` readable bytes; when fewer than three bytes are
// available or they are not the BOM, `s` is returned unchanged.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Compare as unsigned bytes so the check does not depend on char signedness.
    const bool has_bom = size >= 3
        && static_cast<unsigned char>(s[0]) == 0xEF
        && static_cast<unsigned char>(s[1]) == 0xBB
        && static_cast<unsigned char>(s[2]) == 0xBF;
    return has_bom ? s + 3 : s;
}

// Extracts the integer at the start of `line`, ignoring a leading UTF-8 BOM.
// Prints "Error parsing" to std::cout when the extraction fails and returns
// whatever value the failed extraction left behind.
static int parse_number_line(const std::string& line)
{
    std::istringstream input(skip_utf8_bom(line.c_str(), line.size()));
    int value = -1;
    input >> value;
    if (!input) {
        std::cout << "Error parsing" << std::endl;
    }
    return value;
}

// Copies one record (a number line followed by a text line) from `in` to
// `out`. The number is also echoed to std::cout, labelled with `index`.
static void copy_record(std::istream& in, std::ostream& out, int index)
{
    std::string line;

    // Number line: parse it and write the parsed value, not the raw bytes,
    // so a BOM in front of the digits never reaches the copy.
    std::getline(in, line);
    const int number = parse_number_line(line);
    std::cout << "Number " << index << ": " << number << std::endl;
    out << number << std::endl;

    // Text line: copied as is.
    std::getline(in, line);
    out << line << std::endl;
}

// Copies UTF8.txt to UTF8_copy.txt: two records separated by one blank line,
// each record being a number line followed by a text line.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if (!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        return 0;
    }

    copy_record(file, fileO, 1);

    // Discard the blank separator line and reproduce it in the output file.
    std::string separator;
    std::getline(file, separator);
    fileO << std::endl;

    // The original labelled this second value "Number 1" as well.
    copy_record(file, fileO, 2);

    // Both streams are flushed and closed by their destructors.
    return 0;
}

Answer 1

Read the file with std::codecvt_mode

Example from the link above:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>

// Writes a small UTF-8 file that starts with a BOM, then reads it back through
// a wide stream using a codecvt_utf8 facet in consume_header mode and prints
// each character read as a hexadecimal number.
int main()
{
    // UTF-8 data with BOM
    {
        std::ofstream writer("text.txt");
        writer << u8"\ufeffz\u6c34\U0001d10b";
    } // closed here so the data is on disk before it is read back

    // read the UTF8 file, skipping the BOM
    std::wifstream reader("text.txt");
    const std::locale utf8_locale(reader.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    reader.imbue(utf8_locale);

    std::cout << std::hex << std::showbase;
    wchar_t ch;
    while (reader.get(ch)) {
        std::cout << ch << '\n';
    }
    return 0;
}

Note the std::consume_header setting.

Adapted to your question it might be:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Opens UTF8.txt, reads an integer from its first line (ignoring a leading
// UTF-8 byte order mark), then prints that integer followed by the second line.
int main()
{
    std::ifstream file("UTF8.txt");
    if (file.is_open()) {
        std::string line;
        std::getline(file, line);

        // std::codecvt_utf8 is only specified for the element types char16_t,
        // char32_t and wchar_t, so it cannot be instantiated for char to strip
        // the header from a narrow stream. The bytes of a narrow stream are
        // already UTF-8; only the three BOM bytes (EF BB BF) have to go.
        if (line.compare(0, 3, "\xEF\xBB\xBF") == 0)
            line.erase(0, 3);

        std::istringstream ss{line};
        int a = 0; // defined value even when the line is empty
        ss >> a;
        if (ss.fail()) {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file, line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}

Or with wchar_t:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Wide-character variant: opens UTF8.txt through std::wifstream with a
// codecvt_utf8 facet in consume_header mode, tries to read an integer from the
// first line, then prints that integer followed by the second line.
int main()
{
    std::wifstream input("UTF8.txt");

    // Build a locale that carries the UTF-8 facet and install it on the stream.
    const std::locale utf8_locale(input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);

    if (!input.is_open())
        return 0; // nothing is printed when the file cannot be opened

    // First line: run it through a wide string stream to extract the number.
    std::wstring text;
    std::getline(input, text);
    std::wistringstream parser{text};
    int number;
    parser >> number;
    if (parser.fail()) {
        std::wcout << L"Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(input, text);
    std::wcout << number << std::endl << text << std::endl;
    input.close();
    return 0;
}

Answer 2

Just skip the leading BOM (Byte Order Mark):

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF).
// `s` points at `size` readable bytes; when fewer than three bytes are
// available or they are not the BOM, `s` is returned unchanged.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    if (size < 3)
        return s; // too short to hold a BOM

    // Compare as unsigned bytes so the check does not depend on char signedness.
    const unsigned char b0 = static_cast<unsigned char>(s[0]);
    const unsigned char b1 = static_cast<unsigned char>(s[1]);
    const unsigned char b2 = static_cast<unsigned char>(s[2]);
    if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF)
        return s + 3;
    return s;
}


// Self-contained demo: an in-memory "file" that starts with a UTF-8 BOM is
// read line by line; the number on the first line is parsed after skipping
// the BOM, then printed together with the second line.
int main()
{
    std::istringstream source(u8"\xEF\xBB\xBF""42\n\u00E8_\u00E9\n");
    std::string text;

    // First line: step over the BOM before handing the text to the parser.
    std::getline(source, text);
    std::istringstream parser(skip_utf8_bom(text.c_str(), text.size()));
    int number = -1;
    parser >> number;
    if (parser.fail()) {
        std::cout << "Error parsing" << std::endl;
    }

    // Second line: echoed unchanged after the number.
    std::getline(source, text);
    std::cout << number << std::endl << text << std::endl;
    return 0;
}

UTF8 string to int

Question

UPDATE

SOLUTION

2 answers

solution1
2 2016-03-17 12:23:24

solution2
2 ACCEPTED

UTF8 string to int

Question

UPDATE

SOLUTION

2 answers

solution1 2 2016-03-17 12:23:24

solution2 2 ACCEPTED

solution1
2 2016-03-17 12:23:24

solution2
2 ACCEPTED