C++：就地排序索引文件（使用堆排序 Heapsort）

Question

更新的問題：

你好，我正在嘗試就地對索引文件進行排序。 該文件由 14 字節的數據塊組成，通常太大而無法整個加載到 RAM 中。 每個數據塊的前 8 個字節是排序所依據的鍵。 我實現了堆排序（Heapsort）算法，到目前為止，除了性能之外，效果都很好！

我想知道是否可以改進我的實現，以及如何利用一部分 RAM 來加快這個過程。 我曾考慮將堆的一部分保留在 RAM 中，但不確定具體該如何實現。

到目前為止，我的代碼：

sortidx.h

#ifndef SORTIDX_H
#define SORTIDX_H

// Includes
#include <atomic>
#include <fstream>
#include <iostream>
#include <limits>
#include <mutex>
#include <string>
#include <thread>

// Constants
/// Number of leading bytes of each record that hold the hash (the sort key).
constexpr size_t hashSize{ 8 };
/// Number of bytes that follow the hash and hold the position.
constexpr size_t offsetSize{ 6 };
/// Size in bytes of one complete index record, as read and written by the file helpers.
constexpr size_t writeSize{ hashSize + offsetSize };

// Typedefs & Structs
// Shorthand for a guard that locks a std::mutex for the lifetime of the guard
// object. std::mutex and std::lock_guard are declared in <mutex>.
using scoped_lock = std::lock_guard<std::mutex>;

/// One fixed-size record of the index file, laid out exactly as it is stored on disk.
struct IndexEntry {
    unsigned char hash[hashSize]; // First 64 bits of the hash (the bytes operator> compares)
    unsigned char position[offsetSize]; // Position of word in dictionary (48-bit little endian integer)
} __attribute__( (__packed__) );

// readData()/writeData() copy raw bytes straight into and out of this struct,
// so its size has to match the on-disk record size exactly.
static_assert( sizeof( IndexEntry ) == hashSize + offsetSize,
               "IndexEntry must be exactly one on-disk record in size" );

// Functions
/// Orders two entries by their `hash` bytes only; `position` is not compared.
/// Returns true when the first argument's hash is lexicographically greater
/// (byte by byte, starting at index 0) and false when it is smaller or equal.
/// Note: the first parameter, i.e. the left operand of `>`, is the one named `rhs`.
bool operator>( const IndexEntry &rhs, const IndexEntry &lhs );

/// Returns the index of the parent of node `i` in an array-backed binary heap.
///
/// The root (`i == 0`) has no parent and is mapped to itself. Without this
/// guard the unsigned `i - 1` wraps around and yields a huge index, which
/// happens in practice when the heap consists of a single element.
constexpr size_t getParent( size_t i ) {
    return i == 0 ? 0 : (i - 1) / 2;
}

/// Returns the index of the left child of node `i` in an array-backed binary heap.
constexpr size_t getLeft( size_t i ) {
    return ( i << 1 ) + 1;
}

/// Returns the index of the right child of node `i` in an array-backed binary heap.
constexpr size_t getRight( size_t i ) {
    return ( i + 1 ) << 1;
}

/// Sorts the index file at path `idxFile` in place with a file-backed heapsort:
/// a heapify pass followed by a sort pass, each run on a worker thread that
/// this function waits for.
void sortIDX( std::string idxFile );

/// Heapify pass: sifts down every node from index `heapifyLimit` down to 0.
void heapifyIDX( size_t heapifyLimit );
/// Sort pass: repeatedly moves the heap root behind the shrinking heap.
/// Note: despite the parameter name, sortIDX() passes the index of the last
/// record (record count - 1), and the loop treats it as such.
void sortIDXHeap( size_t numDataSets );

/// Reads the record with index `pos` (byte offset pos * writeSize) into `*entry`.
void readData( IndexEntry* entry, size_t pos );
/// Writes `*entry` to the record slot with index `pos` (byte offset pos * writeSize).
void writeData( IndexEntry* entry, size_t pos );
/// Returns true when record index `pos` is still inside the heap (pos <= limit).
bool isInHeap( size_t pos );
/// Sifts `top` down from index `posTop`, moving larger children up, and writes
/// `top` into the slot where it comes to rest.
void orderHeap( IndexEntry &top, size_t posTop );

#endif

sortidx.cpp

#include "sortidx.h"

using namespace std;

// Stream position reported by tellg() right after the file is opened at its end (set in sortIDX()).
streampos fileSize;
// fileSize / writeSize, i.e. the number of whole records in the file (set in sortIDX()).
size_t numDataSets;
// Highest record index still treated as part of the heap (see isInHeap());
// sortIDXHeap() lowers it as sorted records accumulate at the end of the file.
size_t limit;
// Loop counter of the worker thread; sortIDX() polls it while waiting for the worker.
atomic<size_t> pos;
// Stream used by readData()/writeData(); assigned in sortIDX().
fstream* file;

// Lexicographic "greater than" over the hash bytes; the position bytes play no
// part. The first differing byte decides; identical hashes compare as not greater.
bool operator>( const IndexEntry &rhs, const IndexEntry &lhs ) {
    size_t idx = 0;

    while ( idx < hashSize ) {
        const unsigned char first = rhs.hash[idx];
        const unsigned char second = lhs.hash[idx];

        if ( first != second )
            return first > second;

        ++idx;
    }

    return false;
}

// Sorts the index file `idxFile` in place with a file-backed heapsort.
//
// Phase 1 (heapifyIDX) turns the whole file into a max-heap, phase 2
// (sortIDXHeap) repeatedly moves the root behind the shrinking heap. Each
// phase runs on a worker thread while this thread polls `pos` for progress.
//
// If the file cannot be opened or its size cannot be determined, a message is
// written to std::cerr and the function returns without touching the file.
// Files with fewer than two records are already sorted and are left as is.
void sortIDX( string idxFile ) {
    // The stream lives on the stack so it is closed on every exit path; the
    // global pointer merely lends it to readData()/writeData().
    fstream stream( idxFile, ios::in | ios::out | ios::binary | ios::ate );

    if ( !stream.is_open() ) {
        cerr << "sortIDX: could not open '" << idxFile << "'\n";
        return;
    }

    fileSize = stream.tellg();
    const streamoff bytes = fileSize;

    if ( bytes < 0 ) {
        cerr << "sortIDX: could not determine the size of '" << idxFile << "'\n";
        return;
    }

    numDataSets = static_cast<size_t>( bytes ) / writeSize;

    // With 0 or 1 records `numDataSets - 1` / getParent() would wrap around
    // and send the workers over a practically endless index range.
    if ( numDataSets < 2 )
        return;

    file = &stream;
    limit = numDataSets - 1;
    const size_t localLimit = limit;
    const size_t heapifyLimit = getParent( limit );

    // Reset the progress counter before the worker starts so a value left
    // over from a previous call cannot end the wait loop prematurely.
    pos = 0;
    thread heapifier( heapifyIDX, heapifyLimit );

    while ( pos <= heapifyLimit ) {
        // Some progressbar stuff (uses pos)
    }

    heapifier.join();

    pos = 0;
    thread sorter( sortIDXHeap, localLimit );

    while ( pos < localLimit ) {
        // Some progressbar stuff (uses pos)
    }

    sorter.join();

    stream.close();
    file = nullptr; // do not leave a pointer to the local stream behind
}

void heapifyIDX( size_t heapifyLimit ) {
    IndexEntry top;
    size_t posTop;

    for ( pos = 0; pos <= heapifyLimit; pos++ ) {
        posTop = heapifyLimit - pos;

        readData( &top, posTop );

        orderHeap( top, posTop );
    }
}

void sortIDXHeap( size_t numDataSets ) {
    IndexEntry last;
    IndexEntry top;
    size_t posLast;
    size_t posTop;

    for ( pos = 0; pos < numDataSets; pos++ ) {
        posLast = numDataSets - pos;
        posTop = 0;
        limit = posLast - 1;

        readData( &last, posTop );
        readData( &top, posLast );
        writeData( &last, posLast );

        orderHeap( top, posTop );
    }
}

// Loads the record with index `pos` from the file into `*entry`.
void readData( IndexEntry* entry, size_t pos ) {
    fstream &stream = *file;

    stream.seekg( pos * writeSize );
    stream.read( reinterpret_cast<char*>( entry ), writeSize );
}

// Stores `*entry` in the file at the record slot with index `pos`.
void writeData( IndexEntry* entry, size_t pos ) {
    fstream &stream = *file;

    stream.seekp( pos * writeSize );
    stream.write( reinterpret_cast<char*>( entry ), writeSize );
}

// True while record index `pos` does not lie beyond the current heap end `limit`.
bool isInHeap( size_t pos ) {
    return !( limit < pos );
}

// Sifts `top` down through the heap, starting at index `posTop`.
//
// In each step the larger child is selected (the left one when both compare
// equal). If that child is greater than `top` it is moved up into the current
// slot and the walk continues from the child's index. Otherwise the walk
// stops. `top` is written exactly once, into the slot where it comes to rest.
//
// All scratch state is kept in ordinary locals, so the function carries no
// state from one call to the next.
void orderHeap( IndexEntry &top, size_t posTop ) {
    for ( ;; ) {
        const size_t posLeft = getLeft( posTop );

        if ( !isInHeap( posLeft ) )
            break; // no children: posTop is a leaf

        IndexEntry child;
        size_t posChild = posLeft;
        readData( &child, posLeft );

        const size_t posRight = getRight( posTop );

        if ( isInHeap( posRight ) ) {
            IndexEntry right;
            readData( &right, posRight );

            if ( right > child ) {
                child = right;
                posChild = posRight;
            }
        }

        if ( !( child > top ) )
            break; // heap property holds at posTop

        writeData( &child, posTop );
        posTop = posChild;
    }

    writeData( &top, posTop );
}

原始問題：

希望您能幫我解決一個困擾我的問題。
我正在實現一個簡單的查找表，以便快速搜索文件。 我當前的問題出在索引文件上。 目前，我遍歷數據文件，為每個數據集創建一個索引條目：先是要查找的 8 個字節的數據，隨后是 6 個字節的數據（指示該數據集在原始文件中的位置）。 因此，我的索引文件由 14 個字節的數據塊組成。 現在，我想對該文件進行排序，以便可以通過在索引文件中進行二分搜索輕松地找到我的數據。 這正是我到目前為止一直難以解決的部分。

我需要按其前8個字節就地對這14個字節的條目進行排序。 僅按前8個字節排序應該不是問題。 我對如何對文件本身進行排序感到困惑。
我曾考慮為文件實現一個“迭代器”類，以便將其傳遞給 std::sort，這應該可以很好地完成工作。 但是由於我不確定應該提供什么接口才能正常工作，而且那樣也無法得知當前的排序進度，因此我做了一些研究，想起了 Heapsort 算法。它聽起來非常合適，因為它的時間復雜度為 O(n*log(n))，可以就地進行，而且我可以很好地估計進度。

到現在為止還挺好。 對於此方法的實際實現，我仍然有些疑惑，因為我不確定在文件中交換幾個字節數據的最佳方法是什么。 我也很想聽聽您是否對文件排序有其他建議，因為索引文件的大小為數GB，而性能則非常重要！

Answer 1

為什么需要就地排序？ 您的硬碟可以毫無問題地再存儲一個 17 GB 的文件。

我會這樣排序。

1. 讀取一塊可以在 RAM 中處理的文件數據。 在該塊上使用您想要的任何排序算法（例如快速排序、堆排序等）。
2. 將排序后的數據塊寫入硬碟（每個數據塊存為一個單獨的文件）。
3. 回到步驟 1，直到到達數據末尾。
4. 合並各個已排序的塊文件，並將整體排序結果寫入最終的排序文件。
5. 刪除已排序的塊文件。

Answer 2

由於數組開頭的元素被訪問的次數最多，因此我決定將開頭的若干元素加載到 RAM 中，直到達到大小限制（通過參數傳入）為止。 修改后的代碼如下：

// ...
// Number of leading records currently mirrored in cacheArray; 0 means no cache is active.
size_t arraySize = 0;
// In-memory copy of records [0, arraySize); allocated in readIntoArray() and
// released in writeFromArray().
IndexEntry* cacheArray;

void readIntoArray( size_t numElements ) {
    if ( arraySize != 0 )
        writeFromArray();

    arraySize = numElements;
    cacheArray = new IndexEntry[arraySize];
    file->seekg( 0 );

    for ( size_t i = 0; i < arraySize; i++ ) {
        file->read( (char*)(cacheArray + i), writeSize );
    }
}

// Writes the cached records back to the start of the file and releases the cache.
//
// The pointer is reset after the buffer is freed, so calling this function
// again without a new readIntoArray() in between is a harmless no-op instead
// of a second delete[] on the same buffer.
void writeFromArray() {
    file->seekp( 0 );

    for ( size_t i = 0; i < arraySize; i++ ) {
        file->write( (char*)(cacheArray + i), writeSize );
    }

    arraySize = 0;
    delete[] cacheArray;
    cacheArray = nullptr;
}

// Cached variant of sortIDX(). This is an excerpt: the `// ...` markers stand
// for code that is not shown here.
// NOTE(review): cacheSize is divided by writeSize before use, so it is
// presumably given in bytes — confirm against the caller. `quiet` is not used
// in the visible part.
void sortIDX( string idxFile, size_t cacheSize, bool quiet ) {
    // ...

    // Turn the cache budget into a record count.
    cacheSize /= writeSize;
    // Mirror the front of the file in RAM, but never more records than the file holds.
    readIntoArray( min(cacheSize, numDataSets) );

    sorterThread = new thread( heapifyIDX, heapifyLimit );

    // ...

    sorterThread->join();
    delete sorterThread;

    // Flush the cached records back to the file and free the cache before closing.
    writeFromArray();

    file->close();
    delete file;
}

// Fetches the record with index `pos`: from the file when the index lies
// outside the cached range, otherwise from the in-memory cache.
void readData( IndexEntry* entry, size_t pos ) {
    if ( pos >= arraySize ) {
        file->seekg( pos * writeSize );
        file->read( reinterpret_cast<char*>( entry ), writeSize );
        return;
    }

    *entry = cacheArray[pos];
}

// Stores `*entry` at record index `pos`: in the file when the index lies
// outside the cached range, otherwise in the in-memory cache.
void writeData( IndexEntry* entry, size_t pos ) {
    if ( pos >= arraySize ) {
        file->seekp( pos * writeSize );
        file->write( reinterpret_cast<char*>( entry ), writeSize );
        return;
    }

    cacheArray[pos] = *entry;
}

C ++排序索引文件就地（帶有heapsort）

問題描述

更新的問題：

sortidx.h

sortidx.cpp

原始問題：

2 個解決方案

解決方案1
0 2016-09-19 08:41:17

解決方案2
0 已采納 2016-09-21 17:09:52

C ++排序索引文件就地（帶有heapsort）

問題描述

更新的問題：

sortidx.h

sortidx.cpp

原始問題：

2 個解決方案

解決方案1 0 2016-09-19 08:41:17

解決方案2 0 已采納 2016-09-21 17:09:52

解決方案1
0 2016-09-19 08:41:17

解決方案2
0 已采納 2016-09-21 17:09:52