繁体   English   中英

为什么这个普通数组实现比std :: vector实现性能慢?

[英]Why is this plain array implementation slower than the std::vector implementation performance?

为什么这个普通数组实现比std :: vector实现性能慢?

由于我在处理某些工作时出现了一些奇怪的结果,因此我决定编写一个简化的测试来比较std::vector与纯数组效率。

我有两种方法都可以实现的结构,

1个使用普通数组(大小不同)

typedef struct {
    uint16_t index;
     uint16_t nvals;
     uint16_t vals[50];
     double mean;
} a_segment_t;

2使用STL

 typedef struct {
      uint16_t index;
      uint16_t nvals;
      vector<uint16_t> vals;
      uint32_t mean;
} b_segment_t;

在内存中创建该对象不是我感兴趣的(因此我不介意push_back() ),一旦该对象在内存中,它将用于操作 ,而效率就是我要分析的。 vals填充有一些随机数据。

操作将遍历存储在每个段中的值,在这种情况下,将进行简单的均值计算。 测试如下:

using namespace std;
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>

#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000

// plain array approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
} a_segment_t;
uint16_t operation(uint16_t, a_segment_t*);
uint16_t print(uint16_t nsegments, a_segment_t* p_segments);

// stl vector approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
} b_segment_t;
uint16_t operation(uint16_t, vector<b_segment_t>*);
uint16_t print(uint16_t nsegments, vector<b_segment_t>*);

void delta_time(struct timespec*, struct timespec*, struct timespec*);

uint16_t operation(uint16_t nsegments, a_segment_t* p_segments) {
    // the operation (plain array approach)
    uint64_t sum;
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        sum = 0;
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            sum = sum + p_segments[nsegment].vals[nval];
        }
        p_segments[nsegment].mean = sum/p_segments[nsegment].nvals;
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, a_segment_t* p_segments) {
    // print data (plain array approach)
    for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
        cout << "index : " << setfill('0') << setw(3) << p_segments[nsegment].index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segments[nsegment].nvals;
        cout << "\tvals : [";
        for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
            cout << p_segments[nsegment].vals[nval] << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    // the operation (stl vector approach)
    uint32_t sum;
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
        sum = 0;
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
            sum = sum + (*p_val);
        }
        p_segment->mean = sum/(p_segment->nvals);
    }
    return nsegments;
}

uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments) {
    // print data (stl vector approach)
    for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
        cout << "index : " << setfill('0') << setw(3) << p_segment->index;
        cout << "\tnval : " << setfill('0') << setw(3) << p_segment->nvals;
        cout << "\tvals : [";
        for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
            cout << *p_val << ",";
        }
        cout << "\b]" << endl;
    }
    return nsegments;
}

void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    if ((t2->tv_nsec - t1->tv_nsec) < 0) {
        dt->tv_sec = t2->tv_sec - t1->tv_sec - 1;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec + 1000000000;
    } else {
        dt->tv_sec = t2->tv_sec - t1->tv_sec;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec;
    }
    return;
}

int main(int argc, char const *argv[]) {

    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create an populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create an populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(nsegments, a_segments);
    // cout << "===================================" << endl;

    // print(nsegments, &b_segments);
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, a_segments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(nsegments, &b_segments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t  plain array\t\t       stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
        << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
        << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
        << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
        << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================

}

一个在线版本 注意:所有测试均使用-O3进行编译

  • 有人可以指出为什么纯数组实现比std::vector实现慢吗?

  • 普通数组实现不应该更快吗?

  • 我该如何提高纯数组实现的速度?

如果您使用迭代器表达算法,则编译器将在优化代码方面做得更好。 原因之一是它可以对数组索引的大小和溢出特性进行假设(这将转换为机器代码中带有偏移量的索引寻址)。

重构以迭代器(可以是指针)的形式表示operation()print() ):

#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#include <numeric>

using namespace std;

#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000

// plain array approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    uint16_t vals[MAX_NPXS];
    double mean;
} a_segment_t;

// stl vector approach
typedef struct {
    uint16_t index;
    uint16_t nvals;
    vector<uint16_t> vals;
    uint32_t mean;
} b_segment_t;

void delta_time(struct timespec*, struct timespec*, struct timespec*);

template<class Iter>
uint16_t operation(Iter first, Iter last) {
    auto result = std::uint16_t(std::distance(first, last));
    // the operation (plain array approach)
    for( ; first != last ; ++first ) {
        auto sum = std::accumulate(std::begin(first->vals), std::begin(first->vals) + first->nvals, uint64_t(0), std::plus<>());
        first->mean = sum / first->nvals;
    }
    return result;
}


template<class Iter>
uint16_t print(Iter first, Iter last) {
    auto result = std::uint16_t(std::distance(first, last));
    // print data (plain array approach)
    for( ; first != last ; ++first ) {
        cout << "index : " << setfill('0') << setw(3) << first->index;
        cout << "\tnval : " << setfill('0') << setw(3) << first->nvals;
        cout << "\tvals : [";
        for_each(std::begin(first->vals), std::begin(first->vals) + first->nvals, [](const auto& val)
        {
            cout << val << ",";
        });
        cout << "\b]" << endl;
    }
    return result;
}

void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    if ((t2->tv_nsec - t1->tv_nsec) < 0) {
        dt->tv_sec = t2->tv_sec - t1->tv_sec - 1;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec + 1000000000;
    } else {
        dt->tv_sec = t2->tv_sec - t1->tv_sec;
        dt->tv_nsec = t2->tv_nsec - t1->tv_nsec;
    }
    return;
}

int main(int argc, char const *argv[]) {

    uint16_t nsegments = NSEGMENTS;
    uint16_t nsegment = 0;
    uint16_t i = 0;

    //create an populate the segments with dummy data (plain array approach)
    a_segment_t* a_segments = new a_segment_t[nsegments];
    for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
        a_segments[nsegment].index = nsegment;
        srand(nsegment);
        a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
            a_segments[nsegment].vals[nval] = nval;
        }
    }

    //create an populate the segments with dummy data (stl vector approach)
    nsegment = 0;
    vector<b_segment_t> b_segments(nsegments);
    for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
        p_segment->index = nsegment;
        srand(nsegment);
        p_segment->nvals = rand() % MAX_NPXS + 1;
        for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
            p_segment->vals.push_back(nval);
        }
        nsegment++;
    }

    // print(a_segments, a_segments + nsegments);
    // cout << "===================================" << endl;

    // print(b_segments.begin(), b_segments.end());
    // cout << "===================================" << endl;

    // ======================= plain array timing measure ========================
    struct timespec a_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(a_segments, a_segments + nsegments);
        clock_gettime(CLOCK_REALTIME, &(a_times[i]));
    }
    // ===========================================================================

    // ========================= vector timing measure ===========================
    struct timespec b_times[N];
    for(i = 0; i < N; i++) {
        nsegments = operation(b_segments.begin(), b_segments.begin() + nsegments);
        clock_gettime(CLOCK_REALTIME, &(b_times[i]));
    }
    // ===========================================================================

    // =========================== timing console log ============================
    struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
    cout << "\t\t  plain array\t\t       stl vector" << endl;
    cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
    for(i = 0; i < N-1; i=i+1000) {
        delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
        delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
        delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
        delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
        cout << i << ",\t"
        << a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
        << a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
        << b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
        << b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
    }
    // ===========================================================================

}

产生预期结果:

          plain array              stl vector
frame # elapsedtime deltatime   elapsedtime deltatime
0,  0.000000000,    0.000002000,    0.000000000,    0.000002000
1000,   0.001533000,    0.000001000,    0.001551000,    0.000002000
2000,   0.003061000,    0.000002000,    0.003096000,    0.000002000
3000,   0.004589000,    0.000001000,    0.004771000,    0.000002000
4000,   0.006255000,    0.000001000,    0.006433000,    0.000002000
5000,   0.007785000,    0.000002000,    0.007975000,    0.000001000
6000,   0.009326000,    0.000002000,    0.009494000,    0.000001000
7000,   0.010893000,    0.000002000,    0.011012000,    0.000001000
8000,   0.012435000,    0.000002000,    0.012650000,    0.000002000
9000,   0.014024000,    0.000002000,    0.014273000,    0.000001000

这两个版本实际上并不等效。

首先,“数组版本”的meandouble ,“ STL版本”的meanuint32_t 为了使两个函数在远程上等效, mean的计算必须相同。

其次,您的“数组版本”使用数组下标,而STL版本会递增和取消引用迭代器。 由于编译器/优化器将需要考虑数组版本中的更多问题(例如指针别名),因此可能无法充分优化性能。

尝试将数组版本转换为类似的格式;

uint16_t operation(uint16_t nsegments, a_segment_t* p_segments)
{
    uint64_t sum;
    for(a_segment *pseg = p_segments, *eseg = p_segments + nsegments; pseg < eseg; ++pseg)
    {
        sum = 0;
        for(uint16_t *val = pseg->vals, *eval = pseg->vals + pseg->nvals; val < eval; ++val)
        {
            sum = sum + (*val);
        }
        p_seg->mean = sum/(pseg->nvals);
    }
    return nsegments;
}

这将(除非我在转换为这种形式时犯了错误-我尚未测试)会产生相同的结果,但至少将为编译器提供一个将相同类型的性能优化应用于“数组”的机会版本”作为“ STL版本”。

这种事情是C ++标准算法使用迭代器而不是像vector这样的容器进行数组索引的原因之一。 编译器更有可能优化性能。 注意,指针是迭代器的一种。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM