在這種情況下，為什么STL priority_queue不比multiset快得多？

Question

我正在比較STL（g++）priority_queue的性能，發現push和pop沒有我想象的那么快。 請參閱以下代碼：

#include <set>
#include <queue>

using namespace std;

typedef multiset<int> IntSet;

void testMap()
{
    srand( 0 );

    IntSet iSet;

    for ( size_t i = 0; i < 1000; ++i )
    {
        iSet.insert(rand());
    }

    for ( size_t i = 0; i < 100000; ++i )
    {
        int v = *(iSet.begin());
        iSet.erase( iSet.begin() );
        v = rand();
        iSet.insert(v);
    }
}

typedef priority_queue<int> IntQueue;

void testPriorityQueue()
{
    srand(0);
    IntQueue q;

    for ( size_t i = 0; i < 1000; ++i )
    {
        q.push(rand());
    }

    for ( size_t i = 0; i < 100000; ++i )
    {
        int v = q.top();
        q.pop();
        v = rand();
        q.push(v);
    }
}

// Entry point: runs the multiset benchmark followed by the priority_queue
// benchmark. Command-line arguments are ignored.
int main(int,char**)
{
    testMap();
    testPriorityQueue();
    return 0;
}

我用-O3編譯了這段代碼，然后運行valgrind --tool=callgrind並用KCachegrind查看結果：testMap占用了總CPU的54％，testPriorityQueue占用了44％。

（不使用-O3時，testMap比testPriorityQueue快很多。）在testPriorityQueue中，似乎占用大部分時間的函數是：

void std::__adjust_heap<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >, long, int, std::less<int> >

該函數似乎是從pop（）調用中調用的。

這個函數究竟做了什么？ 有沒有辦法通過使用不同的容器或分配器來避免它？

Answer 1

優先級隊列實現為堆：每次刪除head元素時都必須“重新平衡”。 在鏈接的描述中，delete-min是一個O(log n)操作，這其實是因為min（或head）元素是展平的二叉樹的根。

該集合通常實現為紅黑樹，而min元素將是最左邊的節點（因此要么是葉子，要么最多只有一個右子節點）。 因此，刪除它時最多只需移動1個子節點，並且可以根據允許的不平衡程度，把重新平衡的開銷分攤到多次pop調用中。

請注意，如果堆具有任何優勢，那很可能在於引用局部性（因為它是連續存儲的，而不是基於節點的）。 這正是callgrind可能難以准確測量的那種優勢，所以我建議在接受這個結果之前，先運行一些測量實際經過時間（elapsed real time）的基准測試。

Answer 2

我已經實現了一個優先級隊列，當使用-O3編譯時，它似乎運行得更快。 也許只是因為編譯器在這里能夠比在STL的情況下內聯更多代碼？

#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <queue>
#include <set>
#include <vector>

using namespace std;

typedef multiset<int> IntSet;

#define TIMES 10000000

void testMap()
{
    srand( 0 );

    IntSet iSet;

    for ( size_t i = 0; i < 1000; ++i ) {
        iSet.insert(rand());
    }

    for ( size_t i = 0; i < TIMES; ++i ) {
        int v = *(iSet.begin());
        iSet.erase( iSet.begin() );
        v = rand();
        iSet.insert(v);
    }
}

typedef priority_queue<int> IntQueue;

void testPriorityQueue()
{
    srand(0);
    IntQueue q;

    for ( size_t i = 0; i < 1000; ++i ) {
        q.push( rand() );
    }

    for ( size_t i = 0; i < TIMES; ++i ) {
        int v = q.top();
        q.pop();
        v = rand();
        q.push(v);
    }
}


// Minimal binary min-heap stored in a vector.
//
// Layout: slot 0 of mVec is a dummy that is never used, so the element at
// index i has its parent at i / 2 and its children at 2 * i and 2 * i + 1.
// `size` always equals mVec.size(), i.e. it counts the dummy slot as well;
// an empty queue therefore has size == 1.
//
// Ordering: top() is the smallest element according to operator<, which is
// the only comparison required of T. (Note that this is the opposite order
// from std::priority_queue's default, which exposes the largest element.)
template <class T>
class fast_priority_queue
{
public:
    // Creates an empty queue containing only the dummy slot.
    fast_priority_queue()
        :size(1), mVec(1) {
    }

    // Inserts a copy of rT and restores the heap order by moving the new
    // element up towards the root while it is smaller than its parent.
    void push( const T& rT ) {
        mVec.push_back( rT );
        std::size_t s = size++;
        while ( s > 1 ) {
            const std::size_t parent = s / 2;
            if ( !( mVec[s] < mVec[parent] ) ) break;
            T tmp = mVec[parent];
            mVec[parent] = mVec[s];
            mVec[s] = tmp;
            s = parent;
        }
    }

    // Returns the smallest element. Precondition: the queue is not empty
    // (size > 1); calling this on an empty queue reads a non-existent slot.
    const T& top() const {
        return mVec[1];
    }

    // Removes the smallest element. Does nothing if the queue is empty.
    void pop() {
        if ( size <= 1 ) return;

        // Move the last element to the root, shrink, then sift it down.
        mVec[1] = mVec.back();
        mVec.pop_back();
        --size;

        std::size_t s = 1;
        for ( ;; ) {
            std::size_t child = 2 * s;
            if ( child >= size ) break;          // s has no children
            // Pick the smaller child; the right one exists only if its
            // index is still inside the vector.
            if ( child + 1 < size && mVec[child + 1] < mVec[child] ) {
                ++child;
            }
            // Compare against the element currently being sifted (at s),
            // not against the root.
            if ( !( mVec[child] < mVec[s] ) ) break;
            T tmp = mVec[child];
            mVec[child] = mVec[s];
            mVec[s] = tmp;
            s = child;
        }
    }

    std::size_t size;      // number of slots in mVec, including the dummy slot 0
    std::vector<T> mVec;   // heap storage; the elements live in [1, size)
};

typedef fast_priority_queue<int> MyQueue;

void testMyPriorityQueue()
{
    srand(0);
    MyQueue q;

    for ( size_t i = 0; i < 1000; ++i ) {
        q.push( rand() );
    }

    for ( size_t i = 0; i < TIMES; ++i ) {
        int v = q.top();
        q.pop();
        v = rand();
        q.push(v);
    }
}


// Entry point: runs the three benchmarks back to back and prints how many
// clock() ticks each one consumed. All timestamps are taken before anything
// is printed, so the output itself is not part of any measured interval.
// Command-line arguments are ignored.
int main(int,char**)
{
    const clock_t start = clock();
    testMyPriorityQueue();
    const clock_t afterFast = clock();
    testMap();
    const clock_t afterSet = clock();
    testPriorityQueue();
    const clock_t afterStd = clock();

    const clock_t fastTicks = afterFast - start;
    const clock_t setTicks = afterSet - afterFast;
    const clock_t stdTicks = afterStd - afterSet;

    cout << "fast_priority_queue: " << fastTicks << endl;
    cout << "std::multiset: " << setTicks << endl;
    cout << "std::priority_queue: " << stdTicks << endl;
    return 0;
}

在64位Linux上用g++ 4.1.2並加上-O3標志編譯時，我得到的結果是：

fast_priority_queue: 260000
std::multiset: 620000
std::priority_queue: 490000

在這種情況下，為什么STL priority_queue不比multiset快得多？

問題描述

2 個解決方案

解決方案1
9 2012-08-03 17:49:26

解決方案2
2 2012-08-03 20:00:37

在這種情況下，為什么STL priority_queue不比multiset快得多？

問題描述

2 個解決方案

解決方案1 9 2012-08-03 17:49:26

解決方案2 2 2012-08-03 20:00:37

解決方案1
9 2012-08-03 17:49:26

解決方案2
2 2012-08-03 20:00:37