在 C++ 中，讀取文本文件里按行分隔的數百萬個整數，最有效的方法是什么

Question

我的文本文件中有大約2500萬個按行分隔的整數。 我的第一個任務是讀取這些整數並對它們進行排序。 我實際上已經實現了讀取整數並將它們放入數組中（因為我的排序函數將未排序的數組作為參數）。 但是，從文件中讀取整數是一個非常漫長且昂貴的過程。 我已經搜索了許多其他解決方案，希望找到更省時、更高效的做法，但我沒有找到適用於這種數據規模的方案。 因此，對於從巨大的（大約260MB）文本文件中讀取整數，您有什么建議？ 另外，我怎樣才能高效地獲得同一個文件的行數？

// Open the input file for reading.
ifstream myFile("input.txt");

int currentNumber;
// Number of integers expected in the file; the array below is sized from it.
int nItems = 25000000;
// Uninitialised heap buffer with room for nItems ints.
int *arr = (int*) malloc(nItems*sizeof(*arr));
int i = 0;
// Formatted extraction: the loop ends at the first failed read
// (end of file or a token that is not an integer).
// NOTE(review): i is never compared against nItems and the malloc result is
// not checked, so this relies on the file holding at most nItems values.
while (myFile >> currentNumber)
{
    arr[i++] = currentNumber;
}

這就是我從文本文件中獲取整數的方法。 它並不復雜。 我假設行數是固定的（實際上是固定的）

順便說一下，當然不是太慢。 它使用2.2GHz i7處理器在OS X中完成大約9秒的讀取。 但我覺得它會好得多。

Answer 1

最有可能的是，對此的任何優化都可能產生相當小的影響。 在我的機器上，讀取大文件的限制因素是磁盤傳輸速度。 是的，提高讀取速度可以稍微提高一點，但最有可能的是，你不會從中獲得很多。

我在之前的測試中發現[我會看看能否找到那個答案——我在我的“實驗代碼”目錄中找不到源代碼]，最快的方法是使用mmap加載文件。 但它只比使用ifstream快一點。

編輯：這是我的自制基准測試，用幾種不同的方式讀取文件：用getline邊讀文件邊取行，對比先讀取整個文件然后根據換行符分割。

照例要提醒：基准測試衡量的只是它實際測量的內容，環境或代碼寫法上的微小變化有時會產生很大的不同。

編輯：以下是“從文件中讀取數字並將其存儲在矢量中”的一些實現：

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>


using namespace std;

// Path of the input file that every func* benchmark in this file reads.
const char *file_name = "lots_of_numbers.txt";

// Baseline: formatted extraction with operator>> into a vector that grows on
// demand. Prints how many values were read.
void func1()
{
    ifstream input(file_name);
    vector<int> values;
    // Extraction stops at end of file or at the first non-integer token.
    for (int value; input >> value; )
    {
        values.push_back(value);
    }
    cout << "Number of values read " << values.size() << endl;
}


// Same formatted extraction as the growing-vector variant, but the vector's
// capacity is reserved before reading starts. Prints how many values were read.
void func2()
{
    vector<int> values;
    values.reserve(42336000);

    ifstream input(file_name);
    int value;
    for (;;)
    {
        // Stop at end of file or at the first non-integer token.
        if (!(input >> value))
            break;
        values.push_back(value);
    }
    cout << "Number of values read " << values.size() << endl;
}

// Formatted extraction with operator>> into a fixed-size heap array instead of
// a vector. Prints how many values were read.
// The array holds 42336000 ints and the loop does not check that bound, so the
// input file must not contain more values than that.
void func3()
{
    int *values = new int[42336000];

    ifstream input(file_name);
    int count = 0;
    for (int value; input >> value; ++count)
    {
        values[count] = value;
    }
    cout << "Number of values read " << count << endl;
    delete [] values;
}


// Reads integers from file_name with fscanf("%d") into a fixed-size heap array
// and prints how many were read.
// If the file cannot be opened, the reason is reported on stderr and the
// function returns without reading (previously a null FILE* was handed to
// fscanf and fclose).
// The array holds 42336000 ints and the loop does not check that bound, so the
// input file must not contain more values than that.
void func4()
{
    int *v = new int[42336000];
    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        delete [] v;
        return;
    }
    int num;
    int i = 0;
    // fscanf returns 1 while it converts a value; EOF or 0 ends the loop.
    while(fscanf(f, "%d", &num) == 1)
    {
        v[i++] = num;
    }
    cout << "Number of values read " << i << endl;
    fclose(f);
    delete [] v;
}

// Reads file_name through an ifstream in 8 KiB blocks and parses unsigned
// decimal integers by hand; values are separated by '\n' or ' '. Prints how
// many values were read. Any other byte prints "Error..." and exits with
// status 1.
//
// Each block is parsed only up to the number of bytes the read actually
// delivered (gcount()). The earlier version always parsed 8192 bytes and
// relied on pre-filling the buffer with byte 26 as an end marker, which
// parsed uninitialised memory when the file was shorter than one block and
// treated a genuine byte 26 in the data as end of file. A final number that
// is not followed by a separator is now stored as well.
//
// The array holds 42336000 ints and the bound is not checked, so the input
// file must not contain more values than that.
void func5()
{
    int *v = new int[42336000];
    int num = 0;

    ifstream fin(file_name);
    char buffer[8192];
    int i = 0;
    int hasnum = 0;   // non-zero while digits of an unfinished number are held in num
    for (;;)
    {
        fin.read(buffer, sizeof(buffer));
        // The last block is usually short; once the stream has failed (or was
        // never opened) gcount() is 0 and the loop ends.
        const std::streamsize bytes = fin.gcount();
        if (bytes <= 0)
            break;

        const char *const end = buffer + bytes;
        for (const char *p = buffer; p != end; ++p)
        {
            if (*p == '\n' || *p == ' ')
            {
                if (hasnum)
                    v[i++] = num;
                num = 0;
                hasnum = 0;
            }
            else if (*p >= '0' &&  *p <= '9')
            {
                hasnum = 1;
                num *= 10;
                num += *p-'0';
            }
            else
            {
                cout << "Error..." << endl;
                exit(1);
            }
        }
    }
    // The file may end without a trailing separator; keep the pending number.
    if (hasnum)
        v[i++] = num;
    cout << "Number of values read " << i << endl;
    delete [] v;
}

// Reads file_name with fread in 8 KiB blocks and parses unsigned decimal
// integers by hand; values are separated by '\n' or ' '. Prints how many
// values were read. Any other byte prints "Error..." and exits with status 1.
//
// Each block is parsed only up to the byte count fread returned. The earlier
// version ignored that count, always parsed 8192 bytes and relied on
// pre-filling the buffer with byte 26 as an end marker, which parsed
// uninitialised memory when the file was shorter than one block and treated a
// genuine byte 26 in the data as end of file. A failed fopen is now reported
// on stderr instead of passing a null FILE* to fread, and a final number that
// is not followed by a separator is stored as well.
//
// The array holds 42336000 ints and the bound is not checked, so the input
// file must not contain more values than that.
void func6()
{
    int *v = new int[42336000];
    int num = 0;

    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        delete [] v;
        return;
    }
    char buffer[8192];
    int i = 0;
    int hasnum = 0;   // non-zero while digits of an unfinished number are held in num
    size_t bytes;
    // fread returns 0 at end of file (or on error), which ends the loop.
    while((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0)
    {
        const char *const end = buffer + bytes;
        for (const char *p = buffer; p != end; ++p)
        {
            if (*p == '\n' || *p == ' ')
            {
                if (hasnum)
                    v[i++] = num;
                num = 0;
                hasnum = 0;
            }
            else if (*p >= '0' &&  *p <= '9')
            {
                hasnum = 1;
                num *= 10;
                num += *p-'0';
            }
            else
            {
                cout << "Error..." << endl;
                exit(1);
            }
        }
    }
    // The file may end without a trailing separator; keep the pending number.
    if (hasnum)
        v[i++] = num;
    fclose(f);
    cout << "Number of values read " << i << endl;
    delete [] v;
}


// Reads file_name one character at a time with fgetc and parses unsigned
// decimal integers by hand; values are separated by '\n' or ' '. Prints how
// many values were read. Any other character prints "Error..." and exits with
// status 1.
//
// A failed fopen is reported on stderr instead of passing a null FILE* to
// fgetc, and a final number that is not followed by a separator is stored
// rather than dropped.
//
// The array holds 42336000 ints and the bound is not checked, so the input
// file must not contain more values than that.
void func7()
{
    int *v = new int[42336000];
    int num = 0;

    FILE *f = fopen(file_name, "r");
    if (f == NULL)
    {
        perror(file_name);
        delete [] v;
        return;
    }
    int ch;
    int i = 0;
    int hasnum = 0;   // non-zero while digits of an unfinished number are held in num
    while((ch = fgetc(f)) != EOF)
    {
        if (ch == '\n' || ch == ' ')
        {
            if (hasnum)
                v[i++] = num;
            num = 0;
            hasnum = 0;
        }
        else if (ch >= '0' &&  ch <= '9')
        {
            hasnum = 1;
            num *= 10;
            num += ch-'0';
        }
        else
        {
            cout << "Error..." << endl;
            exit(1);
        }
    }
    // The file may end without a trailing separator; keep the pending number.
    if (hasnum)
        v[i++] = num;
    fclose(f);
    cout << "Number of values read " << i << endl;
    delete [] v;
}


// Maps file_name into memory with mmap and parses unsigned decimal integers
// by hand; values are separated by '\n' or ' '. Prints how many values were
// read. Any other byte prints "Error..." and exits with status 1.
//
// Failures of open, lseek and mmap are reported on stderr and end the run
// early; previously their results were used unchecked. An empty file is
// handled without calling mmap, which rejects a zero-length mapping. The
// remaining length is no longer narrowed from off_t to int, so files of
// 2 GiB or more are walked to the end, and a final number that is not
// followed by a separator is stored rather than dropped.
//
// The array holds 42336000 ints and the bound is not checked, so the input
// file must not contain more values than that.
void func8()
{
    int *v = new int[42336000];
    int num = 0;

    int f = open(file_name, O_RDONLY);
    if (f < 0)
    {
        perror(file_name);
        delete [] v;
        return;
    }

    // Seeking to the end yields the file size in bytes.
    off_t size = lseek(f, 0, SEEK_END);
    if (size < 0)
    {
        perror("lseek");
        close(f);
        delete [] v;
        return;
    }

    int i = 0;
    int hasnum = 0;   // non-zero while digits of an unfinished number are held in num
    if (size > 0)
    {
        const size_t length = static_cast<size_t>(size);
        void *mapping = mmap(NULL, length, PROT_READ, MAP_PRIVATE, f, 0);
        if (mapping == MAP_FAILED)
        {
            perror("mmap");
            close(f);
            delete [] v;
            return;
        }

        const char *const begin = static_cast<const char *>(mapping);
        const char *const end = begin + length;
        for (const char *p = begin; p != end; ++p)
        {
            if (*p == '\n' || *p == ' ')
            {
                if (hasnum)
                    v[i++] = num;
                num = 0;
                hasnum = 0;
            }
            else if (*p >= '0' &&  *p <= '9')
            {
                hasnum = 1;
                num *= 10;
                num += *p-'0';
            }
            else
            {
                cout << "Error..." << endl;
                exit(1);
            }
        }
        munmap(mapping, length);
    }
    close(f);
    // The file may end without a trailing separator; keep the pending number.
    if (hasnum)
        v[i++] = num;
    cout << "Number of values read " << i << endl;
    delete [] v;
}






// One benchmark entry: the function to run and the label printed next to its
// timing.
struct bm
{
    void (*f)();        // benchmark body; takes no arguments, returns nothing
    const char *name;   // label printed by main() for this entry
};

// Builds a bm initializer from a function identifier; #f turns the identifier
// into a string literal, so the label always matches the function name.
#define BM(f) { f, #f }

// Benchmark table. main() walks it front to back and then back to front.
bm b[] = 
{
    BM(func1),
    BM(func2),
    BM(func3),
    BM(func4),
    BM(func5),
    BM(func6),
    BM(func7),
    BM(func8),
};


// Converts a timeval (whole seconds plus microseconds) to milliseconds.
double time_to_double(timeval *t)
{
    // Fold the microsecond part into fractional seconds first, then scale.
    const double seconds = t->tv_sec + (t->tv_usec/1000000.0);
    return seconds * 1000.0;
}

// Returns the time from *t1 to *t2 in milliseconds (positive when t2 is the
// later timestamp).
double time_diff(timeval *t1, timeval *t2)
{
    const double end_ms = time_to_double(t2);
    const double start_ms = time_to_double(t1);
    return end_ms - start_ms;
}



int main()
{
    for(int i = 0; i < sizeof(b) / sizeof(b[0]); i++)
    {
    timeval t1, t2;
    gettimeofday(&t1, NULL);
    b[i].f();
    gettimeofday(&t2, NULL);
    cout << b[i].name << ": " << time_diff(&t1, &t2) << "ms" << endl;
    }
    for(int i = sizeof(b) / sizeof(b[0])-1; i >= 0; i--)
    {
    timeval t1, t2;
    gettimeofday(&t1, NULL);
    b[i].f();
    gettimeofday(&t2, NULL);
    cout << b[i].name << ": " << time_diff(&t1, &t2) << "ms" << endl;
    }
}

結果（連續兩次運行，向前和向后以避免文件緩存的好處）：

Number of values read 42336000
func1: 6068.53ms
Number of values read 42336000
func2: 6421.47ms
Number of values read 42336000
func3: 5756.63ms
Number of values read 42336000
func4: 6947.56ms
Number of values read 42336000
func5: 941.081ms
Number of values read 42336000
func6: 962.831ms
Number of values read 42336000
func7: 2572.4ms
Number of values read 42336000
func8: 816.59ms
Number of values read 42336000
func8: 815.528ms
Number of values read 42336000
func7: 2578.6ms
Number of values read 42336000
func6: 948.185ms
Number of values read 42336000
func5: 932.139ms
Number of values read 42336000
func4: 6988.8ms
Number of values read 42336000
func3: 5750.03ms
Number of values read 42336000
func2: 6380.36ms
Number of values read 42336000
func1: 6050.45ms

總之，正如有人在評論中指出的那樣，整數的實際解析在總耗時中占了相當大的比重，因此讀取文件本身並不像我最初說的那么關鍵。 即使是一種非常天真的讀取文件的方式（使用fgetc()），也勝過使用ifstream operator>>來讀取整數。

可以看出，使用mmap加載文件比通過fstream讀取文件要快一些，但只是略微如此。

Answer 2

您可以使用外部排序對文件中的值進行排序，而無需將它們全部加載到內存中。 排序速度將受到硬盤驅動器功能的限制，但您將能夠處理真正龐大的文件。 這是實施。

Answer 3

Qt會很簡單：

// Open the input file read-only.
QFile file("h:/1.txt");
file.open(QIODevice::ReadOnly);
// NOTE(review): QDataStream decodes Qt's binary serialization format rather
// than parsing decimal text — confirm the input really is binary, or whether
// QTextStream was intended for a text file with one number per line.
QDataStream in(&file);

QVector<int> ints;
ints.reserve(25000000); // reserve room for the expected number of values

while (!in.atEnd()) {
    int integer;
    qint8 line; 
    in >> integer >> line; // read an int into integer, a char into line
    ints.append(integer); // append the integer to the vector
}

最后，你會得到一個名為ints的QVector，可以輕松地對它進行排序。 如果文件格式正確，則行數與向量的大小相同。

在我的機器上（i7 3770k @ 4.2 GHz），讀取2500萬個整數並將它們放入一個向量中大約需要490毫秒。 這是從普通的機械硬盤讀取的，而不是SSD。

將整個文件緩沖到內存中並沒有多大幫助，時間下降到420毫秒。

Answer 4

嘗試讀取整數塊並解析這些塊而不是逐行讀取。

Answer 5

一種可能的解決方案是將大文件分成更小的塊。 分別對每個塊進行排序，然后逐個合並所有已排序的塊。

編輯：顯然這是一個成熟的方法。 請參閱http://en.wikipedia.org/wiki/External_sorting上的 “外部合並排序”

Answer 6

260MB並不是那么大。 您應該能夠將整個內容加載到內存中，然后通過它進行解析。 進入后，您可以使用嵌套循環讀取行結尾之間的整數，並使用常用函數進行轉換。 在開始之前，我會嘗試為你的整數數組預分配足夠的內存。

哦，您可能會發現粗略的舊C風格文件訪問功能是這類事情的更快選擇。

Answer 7

你沒有說你是如何讀取這些值的，所以很難說。 不過，實際上只有兩種解決方案：`someIStream >> anInt` 和 `fscanf(someFd, "%d", &anInt)`。 邏輯上，這些應該具有相似的性能，但實現方式各不相同；可能值得兩者都嘗試並測量。

要檢查的另一件事是你如何存儲它們。 如果你知道你有大約2500萬，在閱讀它們之前在std::vector上做3000萬的reserve可能會有所幫助。 構造具有3000萬個元素的vector也可能更便宜，然后在看到結束時修剪它，而不是使用push_back 。

最后，你可以考慮寫一個immapstreambuf，用它來mmap輸入，並直接從映射的內存中讀取。 或者甚至手動遍歷映射的內存，調用strtol（但這需要更多的工作）；所有基於流的解決方案最終可能都會調用strtol或類似的東西，但在調用之前還要先做大量額外的工作。

編輯：

FWIW，我在我的家用機器上做了一些非常快速的測試（一個相當新的LeNova，運行Linux），結果讓我感到驚訝：

作為參考，我使用std::cin >> tmp和v.push_back( tmp );寫了一個最簡單、最天真的實現，沒有嘗試任何優化。 在我的系統上，這用了略少於10秒。
簡單的優化，例如在向量上使用reserve ，或者最初創建大小為25000000的向量，並沒有太大變化 - 時間仍然超過9秒。
使用一個非常簡單的mmapstreambuf ，時間下降到大約3秒 - 最簡單的循環，沒有reserve等。
使用fscanf ，時間下降到不到3秒。 我懷疑FILE*的Linux實現也使用mmap （而std::filebuf沒有）。
最后，使用mmapbuffer，用兩個char*進行迭代，並使用strtol進行轉換，時間降至一秒以下。

這些測試做得很倉促（編寫和運行所有這些測試用了不到一個小時），遠遠談不上嚴謹（當然，也不能說明其他環境下的任何情況），但這些差異還是讓我感到驚訝。 我沒想到差別會這么大。

Answer 8

我會這樣做：

#include <fstream>
#include <iostream>
#include <string>

using namespace std;

// Reads myFile.txt line by line, converts each line to an int with stoi and
// prints it, then prints the number of lines read.
// Returns 0 on success and 1 when the file cannot be opened or an exception
// escapes the read loop. A line that stoi cannot convert is reported on
// stderr and reading continues with the next line.
int main() {

    fstream file;
    string line;
    int intValue;
    int lineCount = 0;
    int status = 0;
    try {
        file.open("myFile.txt", ios_base::in); // Open to read
        if (!file.is_open()) {
            // open() does not throw unless exceptions() was enabled on the
            // stream, so a failed open has to be detected from its state.
            cerr << "Failed to open myFile.txt" << endl;
            status = 1;
        }
        // On a stream that failed to open, getline fails at once and the
        // loop body never runs.
        while(getline(file, line)) {
            lineCount++;
            try {
                intValue = stoi(line);
                // Do something with your value
                cout << "Value for line " << lineCount << " : " << intValue << endl;

            } catch (const exception& e) {
                cerr << "Failed to convert line " << lineCount << " to an int : " << e.what() << endl;
            }
        }
    } catch (const exception& e) {
        cerr << e.what() << endl;
        status = 1;
        if (file.is_open()) {
            file.close();
        }
    }

    cout << "Line count : " << lineCount << endl;

    // NOTE(review): "PAUSE" is a Windows shell command; on other systems this
    // call only produces a shell error message.
    system("PAUSE");
    return status;
}

在c ++中讀取由文本文件中的行分隔的數百萬個整數的最有效方法是什么

問題描述

8 個解決方案

解決方案1
8 已采納 2013-02-27 15:41:34

解決方案2
3 2013-02-27 16:11:40

解決方案3
1 2013-02-27 15:58:31

解決方案4
0 2013-02-27 15:38:29

解決方案5
0 2013-02-27 15:42:19

解決方案6
0 2013-02-27 15:42:46

解決方案7
0 2013-02-27 15:49:40

編輯：

解決方案8
0 2013-02-27 15:53:10

在c ++中讀取由文本文件中的行分隔的數百萬個整數的最有效方法是什么

問題描述

8 個解決方案

解決方案1 8 已采納 2013-02-27 15:41:34

解決方案2 3 2013-02-27 16:11:40

解決方案3 1 2013-02-27 15:58:31

解決方案4 0 2013-02-27 15:38:29

解決方案5 0 2013-02-27 15:42:19

解決方案6 0 2013-02-27 15:42:46

解決方案7 0 2013-02-27 15:49:40

編輯：

解決方案8 0 2013-02-27 15:53:10

解決方案1
8 已采納 2013-02-27 15:41:34

解決方案2
3 2013-02-27 16:11:40

解決方案3
1 2013-02-27 15:58:31

解決方案4
0 2013-02-27 15:38:29

解決方案5
0 2013-02-27 15:42:19

解決方案6
0 2013-02-27 15:42:46

解決方案7
0 2013-02-27 15:49:40

解決方案8
0 2013-02-27 15:53:10