簡體   English   中英

為什麼這個普通數組實現比 std::vector 實現慢?

[英]Why is this plain array implementation slower than the std::vector implementation performance?

為什麼這個普通數組實現比 std::vector 實現慢?

由於我在處理某些工作時出現了一些奇怪的結果,因此我決定編寫一個簡化的測試來比較std::vector與純數組效率。

我有一個結構,可以用以下兩種方式實現:

1. 使用普通數組(大小不同)

// Segment record for the plain array approach: the values live inline in a
// fixed-capacity buffer.
struct a_segment_t {
    uint16_t index;     // identifier of the segment
    uint16_t nvals;     // how many leading entries of vals are in use
    uint16_t vals[50];  // inline storage for the values
    double mean;        // slot for the mean computed over the used values
};

2. 使用 STL

 // Segment record for the STL approach: the values live in a std::vector.
 struct b_segment_t {
     uint16_t index;         // identifier of the segment
     uint16_t nvals;         // number of values the segment is meant to hold
     vector<uint16_t> vals;  // dynamically sized storage for the values
     uint32_t mean;          // slot for the mean computed over the values
 };

我對在內存中創建該對象的過程不感興趣(因此我不介意使用 push_back());一旦該對象在內存中,它就會被用於某個操作,而我要分析的正是這個操作的效率。vals 中填充了一些隨機數據。

操作將遍歷存儲在每個段中的值,在這種情況下,將進行簡單的均值計算。 測試如下:

using namespace std;
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>

// Number of segments main() builds for each of the two approaches.
#define NSEGMENTS 100
// Capacity of the fixed vals[] buffer; main() draws segment lengths from 1..MAX_NPXS.
#define MAX_NPXS 50
// Number of timed repetitions of operation() per approach.
#define N 10000

// plain array approach: each segment carries its values inline in a buffer of
// MAX_NPXS entries, of which the first nvals are in use
struct a_segment_t {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
};
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments);
uint16_t print(uint16_t nsegments, a_segment_t* p_segments);

// stl vector approach: each segment keeps its values in a std::vector
struct b_segment_t {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
};
uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments);
uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments);

// dt receives t2 - t1
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt);

// The operation (plain array approach): computes the mean of every segment.
//
// For each of the first `nsegments` entries of `p_segments`, the first `nvals`
// entries of `vals` are summed and `sum / nvals` is stored in `mean`. The
// division is an integer division, so the stored value is truncated even
// though `mean` is a double. A segment with `nvals == 0` gets a mean of 0
// rather than triggering an integer division by zero.
//
// nsegments  - number of segments to process
// p_segments - pointer to the first segment
// returns    - nsegments, unchanged
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments) {
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        if (p_segments[nsegment].nvals == 0) {
            // nothing to average; avoid dividing by zero
            p_segments[nsegment].mean = 0;
            continue;
        }
        uint64_t sum = 0;
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            sum = sum + p_segments[nsegment].vals[nval];
        }
        p_segments[nsegment].mean = sum/p_segments[nsegment].nvals;
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, a_segment_t* p_segments) {
    // print data (plain array approach)
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        cout << "index : " << setfill('0') << setw(3) << p_segments[nsegment].index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segments[nsegment].nvals;
        cout << "\tvals : [";
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            cout << p_segments[nsegment].vals[nval] << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

// The operation (stl vector approach): computes the mean of every segment.
//
// Every element of `*p_segments` is processed (the loop is bounded by the
// vector itself, not by `nsegments`). All entries of `vals` are summed and
// `sum / nvals` (integer division) is stored in `mean`. A segment with
// `nvals == 0` gets a mean of 0 rather than triggering a division by zero.
//
// nsegments  - segment count, only passed through to the return value
// p_segments - vector of segments to update in place
// returns    - nsegments, unchanged
uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment != p_segments->end(); ++p_segment) {
        if (p_segment->nvals == 0) {
            // nothing to average; avoid dividing by zero
            p_segment->mean = 0;
            continue;
        }
        uint32_t sum = 0;
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val != (p_segment->vals).end(); ++p_val) {
            sum = sum + (*p_val);
        }
        p_segment->mean = sum/(p_segment->nvals);
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    // print data (stl vector approach)
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
        cout << "index : " << setfill('0') << setw(3) << p_segment->index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segment->nvals;
        cout << "\tvals : [";
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
            cout << *p_val << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

// Stores t2 - t1 in *dt, keeping tv_nsec within [0, 1e9) by borrowing one
// second when the nanosecond difference is negative.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    const auto nsec_diff = t2->tv_nsec - t1->tv_nsec;
    const bool borrow = nsec_diff < 0;
    dt->tv_sec = t2->tv_sec - t1->tv_sec - (borrow ? 1 : 0);
    dt->tv_nsec = borrow ? nsec_diff + 1000000000 : nsec_diff;
}

int main(int argc, char const *argv[]) {

    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create an populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create an populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(nsegments, a_segments);
    // cout << "===================================" << endl;

    // print(nsegments, &b_segments);
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, a_segments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, &b_segments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t  plain array\t\t       stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
        << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
        << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
        << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
        << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================

}

一個在線版本。注意:所有測試均使用 -O3 進行編譯。

  • 有人可以指出為什么純數組實現比std::vector實現慢嗎?

  • 普通數組實現不應該更快嗎?

  • 我該如何提高純數組實現的速度?

如果您使用迭代器表達算法,則編譯器將在優化代碼方面做得更好。 原因之一是它可以對數組索引的大小和溢出特性進行假設(這將轉換為機器代碼中帶有偏移量的索引尋址)。

重構後,以迭代器(可以是指針)的形式來表達 operation()(以及 print()):

#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#include <numeric>

using namespace std;

// Number of segments main() builds for each of the two approaches.
#define NSEGMENTS 100
// Capacity of the fixed vals[] buffer; main() draws segment lengths from 1..MAX_NPXS.
#define MAX_NPXS 50
// Number of timed repetitions of operation() per approach.
#define N 10000

// plain array approach: values are stored inline in a buffer of MAX_NPXS
// entries, of which the first nvals are in use
struct a_segment_t {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
};

// stl vector approach: values are stored in a std::vector
struct b_segment_t {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
};

// dt receives t2 - t1
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt);

// The operation, written against iterators so that the same code serves both
// the plain array segments and the vector-backed segments.
//
// For every segment in [first, last) the first `nvals` entries of `vals` are
// summed in a 64-bit accumulator and `sum / nvals` (integer division) is
// stored in `mean`. A segment with `nvals == 0` gets a mean of 0 rather than
// triggering an integer division by zero.
//
// Iter    - iterator (or pointer) to a type with `nvals`, `vals` and `mean`
// returns - the number of segments in [first, last), narrowed to uint16_t
template<class Iter>
uint16_t operation(Iter first, Iter last) {
    const auto result = static_cast<uint16_t>(std::distance(first, last));
    for( ; first != last ; ++first ) {
        if (first->nvals == 0) {
            // nothing to average; avoid dividing by zero
            first->mean = 0;
            continue;
        }
        const auto vals_begin = std::begin(first->vals);
        // accumulate's default operation is operator+, so no functor is needed
        const uint64_t sum = std::accumulate(vals_begin, vals_begin + first->nvals, uint64_t(0));
        first->mean = sum / first->nvals;
    }
    return result;
}


// Dumps every segment in [first, last) to stdout, one line per segment:
// zero-padded index and value count followed by the first `nvals` values.
// Returns the number of segments in the range, narrowed to uint16_t.
template<class Iter>
uint16_t print(Iter first, Iter last) {
    const auto count = static_cast<std::uint16_t>(std::distance(first, last));
    for (Iter segment = first; segment != last; ++segment) {
        cout << "index : " << setfill('0') << setw(3) << segment->index
             << "\tnval : " << setfill('0') << setw(3) << segment->nvals
             << "\tvals : [";
        auto value = std::begin(segment->vals);
        const auto values_end = value + segment->nvals;
        for ( ; value != values_end; ++value) {
            cout << *value << ",";
        }
        // "\b" backs up over the trailing comma before the closing bracket
        cout << "\b]" << endl;
    }
    return count;
}

// Writes the difference t2 - t1 into *dt. The raw field-wise difference is
// computed first; a negative nanosecond part is then normalised by borrowing
// one second.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    dt->tv_sec = t2->tv_sec - t1->tv_sec;
    dt->tv_nsec = t2->tv_nsec - t1->tv_nsec;
    if (dt->tv_nsec < 0) {
        dt->tv_sec -= 1;
        dt->tv_nsec += 1000000000;
    }
}

int main(int argc, char const *argv[]) {

    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create an populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create an populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(a_segments, a_segments + nsegments);
    // cout << "===================================" << endl;

    // print(b_segments.begin(), b_segments.end());
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(a_segments, a_segments + nsegments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(b_segments.begin(), b_segments.begin() + nsegments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t  plain array\t\t       stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
        << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
        << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
        << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
        << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================

}

產生預期結果:

          plain array              stl vector
frame # elapsedtime deltatime   elapsedtime deltatime
0,  0.000000000,    0.000002000,    0.000000000,    0.000002000
1000,   0.001533000,    0.000001000,    0.001551000,    0.000002000
2000,   0.003061000,    0.000002000,    0.003096000,    0.000002000
3000,   0.004589000,    0.000001000,    0.004771000,    0.000002000
4000,   0.006255000,    0.000001000,    0.006433000,    0.000002000
5000,   0.007785000,    0.000002000,    0.007975000,    0.000001000
6000,   0.009326000,    0.000002000,    0.009494000,    0.000001000
7000,   0.010893000,    0.000002000,    0.011012000,    0.000001000
8000,   0.012435000,    0.000002000,    0.012650000,    0.000002000
9000,   0.014024000,    0.000002000,    0.014273000,    0.000001000

這兩個版本實際上並不等效。

首先,“數組版本”的 mean 是 double,而“STL 版本”的 mean 是 uint32_t。為了使這兩個函數哪怕只是大致等效,mean 的計算方式也必須相同。

其次,您的“數組版本”使用數組下標,而“STL 版本”則遞增並解引用迭代器。由於編譯器/優化器在數組版本中需要考慮更多問題(例如指針別名),因此可能無法充分優化其性能。

嘗試將數組版本轉換為類似的格式;

uint16_t operation(uint16_t nsegments, a_segment_t* p_segments)
{
    uint64_t sum;
    for(a_segment *pseg = p_segments, *eseg = p_segments + nsegments; pseg < eseg; ++pseg)
    {
        sum = 0;
        for(uint16_t *val = pseg->vals, *eval = pseg->vals + pseg->nvals; val < eval; ++val)
        {
            sum = sum + (*val);
        }
        p_seg->mean = sum/(pseg->nvals);
    }
    return nsegments;
}

這樣(除非我在轉換成這種形式時犯了錯誤——我尚未測試)會產生相同的結果,但至少能讓編譯器有機會對“數組版本”應用與“STL 版本”相同類型的性能優化。

這類情況正是 C++ 標準算法使用迭代器、而不是對 vector 這樣的容器進行數組索引的原因之一:編譯器更有可能優化其性能。注意,指針也是迭代器的一種。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM