繁体   English   中英

为什么向量上的“调整大小删除”比“擦除删除”更快?

[英]Why is 'resize-remove' faster than 'erase-remove' on vectors?

从容器中删除多个元素时,C++ 中有一个“擦除-删除”(erase-remove)惯用法,也有关于替代做法“调整大小-删除”(resize-remove)的讨论(例如此处)。人们说“erase-remove”比“resize-remove”好,但根据我的测试,后者在 vector 上(稍微)更快。那么,对于 vector,我应该使用“resize-remove”吗?

这是我的基准测试代码:

#include <benchmark/benchmark.h>

#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

using namespace std;

// Number of elements in every generated input vector.
constexpr size_t N_ELEMS = 1000000;
// One tenth of N_ELEMS (100000); in the code shown here it is only used to derive THRESH.
constexpr int MAX_VAL = N_ELEMS / 10;
// Removal cutoff used by the benchmark predicates: elements below 60000 are removed.
constexpr int THRESH = MAX_VAL / 5 * 3;

static vector<int> generate_input() {
  vector<int> nums(N_ELEMS);

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> dist(0, N_ELEMS);

  std::generate(nums.begin(), nums.end(), std::bind(dist, std::ref(gen)));

  return std::move(nums);
}

// Benchmark: drop every element below THRESH using the erase-remove idiom.
// Building the input happens with the timer paused, so only the removal is timed.
static void bm_erase_remove(benchmark::State &state) {
  const auto below_thresh = [](int value) { return value < THRESH; };

  for (auto _ : state) {
    state.PauseTiming();
    auto data = generate_input();
    state.ResumeTiming();

    // remove_if compacts the kept elements to the front; erase trims the tail.
    const auto new_end = std::remove_if(data.begin(), data.end(), below_thresh);
    data.erase(new_end, data.end());

    benchmark::DoNotOptimize(data);
  }
}
BENCHMARK(bm_erase_remove);

// Benchmark: drop every element below THRESH using remove_if followed by resize.
// Building the input happens with the timer paused, so only the removal is timed.
static void bm_resize_remove(benchmark::State &state) {
  const auto below_thresh = [](int value) { return value < THRESH; };

  for (auto _ : state) {
    state.PauseTiming();
    auto data = generate_input();
    state.ResumeTiming();

    // remove_if compacts the kept elements to the front; resize shrinks the
    // vector to exactly that many elements.
    const auto new_end = std::remove_if(data.begin(), data.end(), below_thresh);
    data.resize(std::distance(data.begin(), new_end));

    benchmark::DoNotOptimize(data);
  }
}
BENCHMARK(bm_resize_remove);

BENCHMARK_MAIN();

Output:

$ g++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out 
2023-05-24T20:07:22+08:00
Running ./a.out
Run on (16 X 3193.91 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 0.16, 0.14, 0.16
-----------------------------------------------------------
Benchmark                 Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove      822789 ns       759162 ns          838
bm_resize_remove     818217 ns       754749 ns          935

使用clang++时差异更大:

$ clang++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
Load Average: 0.25, 0.18, 0.17
-----------------------------------------------------------
Benchmark                 Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove     1165085 ns      1074667 ns          611
bm_resize_remove     958856 ns       884584 ns          782

额外的信息:

  • g++的版本是13.1.1 , clang++的版本是15.0.7
  • 我在 WSL 上使用 Arch Linux,kernel 版本是5.15.90.1-microsoft-standard-WSL2
  • CPU Model 是AMD Ryzen 7 6800H with Radeon Graphics

更新:有趣的是,当我单独运行基准测试时(使用benchmark_filter选项),结果是相同的。 这是因为缓存吗? 如果是这样,缓存机制如何工作?

更新(2023 年 5 月 25 日):如果交换两个BENCHMARK语句,则会显示完全相反的结果。

它们的性能大致相同,因为 std::remove_if 是唯一修改数组的函数,差异只能来自其余的函数:std::vector::resize 会重新分配以适应新大小(std::distance 只是返回大小,可以忽略不计),而 std::vector::erase 只是调整容器本身,因此稍快一些。

正如 @Peter Cordes 所指出的“它实际上保证不会重新分配”,如果新的大小更小,std::vector::resize 不会重新分配向量的存储,所以差异应该来自 erase 会执行而 resize 不会执行的额外移动。

为了能够测量差异generate_input()需要为所有测试返回相同的向量,在您的实现中,每个调用都会返回一个新的随机向量,这使得无法区分运行变化与向量变化。

就此而言,@463035818_is_not_a_number 提出了一个有趣的观点,但出于与之前相同的原因,这些函数之间没有区别。 这并不意味着它是相同的情况,对于一个结构来说,来自坏分支的惩罚要大得多,这使得std::remove_if成为优化的主要目标。

在这两张图中(在 Windows 上使用 1600X(6x3.6GHz)和 1067MHz RAM 运行 50 次),绿色是最佳运行与平均值之间的范围,红色是平均值与最差结果之间的范围,用来显示各次运行之间的差异。 基准测试结果

有一件事我确实不明白:为什么 simple_remove<u64> 的表现如此之差。uProf 确实显示其中的 mov 指令表现非常糟糕,但这与其他函数并没有什么不同。

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <random>
#include <string>
#include <tuple>
#include <typeinfo>
#include <utility>
#include <vector>

#include <benchmark/benchmark.h>

#pragma warning(disable:4267)  // Conversion errors
#pragma warning(disable:4244)  // Conversion errors

// Number of elements in each generated input vector.
constexpr int N_ELEMS   = 500;
// Fixed seed handed to std::mt19937 in Foo<T>::generate_input.
constexpr int RAND_SEED = 8888;

/// Benchmark element type: one heap-allocated T plus an inline block of padding.
///
/// The value lives behind `ptr`; `padd` holds sizeof(T) int64_t slots.
/// Copy construction allocates a new T, copy assignment overwrites the
/// existing one. No move operations are declared, so std::move on a Filler
/// falls back to copying.
///
/// Every comparison looks only at the pointed-to value, never at the padding.
template <typename T>
struct Filler {

    T * ptr = nullptr;          ///< Owned; allocated by every constructor, freed by the destructor.

    int64_t padd [sizeof(T)];   ///< Padding payload, copied along with the value.

    /// Creates a Filler holding 0 with zeroed padding.
    Filler() : ptr(new T(0)) {
        std::memset(padd, 0, sizeof(padd));
    }

    /// Creates a Filler holding `num`; every padding slot is set to `num` too.
    /// Intentionally not explicit: callers rely on converting from T.
    Filler(const T num) : ptr(new T(num)) {
        for (int64_t& e : padd){
            e = static_cast<int64_t>(num);
        }
    }

    /// Deep copy: allocates a new T and copies the padding.
    Filler(const Filler& other) : ptr(new T(*other.ptr)) {
        std::memcpy(padd, other.padd, sizeof(padd));
    }

    ~Filler() {
        delete ptr;
    }

    /// Copies value and padding into the existing allocation.
    Filler& operator=(Filler const& other){
        // memcpy requires non-overlapping ranges, so skip the copy on self-assignment.
        if (this != &other) {
            std::memcpy(padd, other.padd, sizeof(padd));
            *ptr = *other.ptr;
        }
        return *this;
    }

    // Comparisons against another Filler. They are const so that const objects
    // and const references can be compared as well.
    bool operator <  (const Filler& other) const { return *ptr <  *other.ptr; }
    bool operator <= (const Filler& other) const { return *ptr <= *other.ptr; }
    bool operator >  (const Filler& other) const { return *ptr >  *other.ptr; }
    bool operator >= (const Filler& other) const { return *ptr >= *other.ptr; }
    bool operator == (const Filler& other) const { return *ptr == *other.ptr; }
    bool operator != (const Filler& other) const { return *ptr != *other.ptr; }

    // Comparisons against a plain T.
    bool operator <  (const T other) const { return *ptr <  other; }
    bool operator <= (const T other) const { return *ptr <= other; }
    bool operator >  (const T other) const { return *ptr >  other; }
    bool operator >= (const T other) const { return *ptr >= other; }
    bool operator == (const T other) const { return *ptr == other; }
    bool operator != (const T other) const { return *ptr != other; }

};


// Threshold value; Foo<T>::generate_input assigns max / 2 to it whenever it is called with a non-zero max.
static size_t THRESH;   

/// Benchmark fixture for one element type T: a cached pseudo-random input plus
/// several ways of removing every element that is below a threshold.
///
/// Each benchmark copies the cached input while timing is paused and then
/// times only the removal itself.
template <typename T>
struct Foo {

    /// Removal threshold for this element type (max / 2 of the last non-zero
    /// max passed to generate_input).
    ///
    /// Kept per instantiation on purpose: the file-level THRESH is one variable
    /// that every generate_input(max) call overwrites, so once several element
    /// types have been configured it only holds the value of the last one.
    static size_t& threshold() {
        static size_t value = 0;
        return value;
    }

    /// Returns a copy of the cached input vector for T.
    ///
    /// @param max  When non-zero: becomes the largest value drawn, sets the
    ///             threshold to max / 2 and regenerates the cache. When zero the
    ///             cache is only built if it is still empty.
    /// @return     A copy of the N_ELEMS cached elements.
    static std::vector<T> generate_input(size_t max = 0) {
        static size_t dist_max = 0;
        static std::vector<T> nums;

        if (nums.empty() || max){

            if (max) {
                threshold() = max / 2;
                THRESH = max / 2;   // still updated for any code that reads the global
                dist_max = max;
            }

            // Fixed seed: every regeneration produces the same sequence.
            std::mt19937 gen(RAND_SEED);
            std::uniform_int_distribution<uint64_t> dist(0, dist_max);

            nums = std::vector<T>(N_ELEMS);
            for (auto& n : nums){
                n = T(dist(gen));
            }
        }
        return nums;
    }

    /// Times std::remove_if alone; the vector keeps its original size.
    static void just_remove(benchmark::State &state) {
        const size_t limit = threshold();

        for (auto _ : state) {

            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();

            // The predicate takes its argument by value, so class-type
            // elements are copied on every call.
            std::ignore = std::remove_if(
                nums.begin(), nums.end(),
                [limit](T x) { return x < limit; }
            );

            benchmark::DoNotOptimize(nums);
        }
    }

    /// Times the erase-remove idiom.
    static void erase_remove(benchmark::State &state) {
        const size_t limit = threshold();

        for (auto _ : state) {

            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();

            nums.erase(
                std::remove_if(
                    nums.begin(), nums.end(),
                    [limit](T x) { return x < limit; }
                ),
                nums.end()
            );

            benchmark::DoNotOptimize(nums);
        }
    }

    /// Times remove_if followed by resize to the number of kept elements.
    static void resize_remove(benchmark::State &state) {
        const size_t limit = threshold();

        for (auto _ : state) {

            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();

            nums.resize(
                std::distance(
                    nums.begin(),
                    std::remove_if(
                        nums.begin(), nums.end(),
                        [limit](T x) { return x < limit; }
                    )
                )
            );

            benchmark::DoNotOptimize(nums);
        }
    }

    /// Times a hand-written compaction loop followed by resize.
    static void simple_remove(benchmark::State &state) {
        const size_t limit = threshold();

        for (auto _ : state) {

            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();

            // data()/size() instead of front()/back(): also valid for an empty vector.
            T * const first = nums.data();
            const T * const last = first + nums.size();
            const T thresh = T(limit);

            T * n = first;   // write cursor: next slot for a kept element
            for (T * m = first; m != last; ++m) {
                if (*m >= thresh){
                    *(n++) = std::move(*m);
                }
            }

            nums.resize(n - first);

            benchmark::DoNotOptimize(nums);
        }
    }

    /// Times the hand-written compaction loop unrolled four times.
    static void simple_remove_unroll(benchmark::State &state) {
        const size_t limit = threshold();

        for (auto _ : state) {

            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();

            T * const first = nums.data();
            const T * const last = first + nums.size();
            const T thresh = T(limit);

            T * n = first;   // write cursor
            T * m = first;   // read cursor

            // Peel off size % 4 leading elements so that the remaining count
            // is a multiple of four.
            for (size_t rest = nums.size() % 4; rest != 0; --rest) {
                if (*m >= thresh){
                    *(n++) = std::move(*m);
                }
                ++m;
            }

            // A pre-checked loop: with fewer than four elements nothing is
            // left here and the body must not run at all.
            while (m != last) {

                if (*(m + 0) >= thresh){ *(n++) = std::move(*(m + 0)); }
                if (*(m + 1) >= thresh){ *(n++) = std::move(*(m + 1)); }
                if (*(m + 2) >= thresh){ *(n++) = std::move(*(m + 2)); }
                if (*(m + 3) >= thresh){ *(n++) = std::move(*(m + 3)); }

                m += 4;
            }

            nums.resize(n - first);

            benchmark::DoNotOptimize(nums);
        }
    }

};

template<typename T>
void benchmark_batch(size_t max_num) {

    std::string type = typeid(T).name();
    Foo<T>::generate_input(max_num);

    benchmark::RegisterBenchmark(
        std::string("just_remove/") + type, 
        Foo<T>::just_remove
    );
    benchmark::RegisterBenchmark(
        std::string("erase_remove/") + type, 
        Foo<T>::erase_remove
    );
    benchmark::RegisterBenchmark(
        std::string("resize_remove/") + type, 
        Foo<T>::resize_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove/") + type, 
        Foo<T>::simple_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove_unroll/") + type, 
        Foo<T>::simple_remove_unroll
    );
}

/// Registers the benchmark batches for three unsigned integer widths and their
/// Filler-wrapped counterparts, then hands control to Google Benchmark.
///
/// @return 0 on success, 1 if the command line contains unrecognised arguments.
int main(int argc, char** argv) {

    // The maxima passed are the signed limits of the matching width.
    benchmark_batch<uint8_t>(INT8_MAX);
    benchmark_batch<uint32_t>(INT32_MAX);
    benchmark_batch<uint64_t>(INT64_MAX);

    benchmark_batch<Filler<uint8_t>>(INT8_MAX);
    benchmark_batch<Filler<uint32_t>>(INT32_MAX);
    benchmark_batch<Filler<uint64_t>>(INT64_MAX);

    benchmark::Initialize(&argc, argv);
    // Initialize() consumes the flags it understands; whatever is left is a
    // mistyped or unsupported option, so report it instead of ignoring it.
    if (benchmark::ReportUnrecognizedArguments(argc, argv)) {
        return 1;
    }
    benchmark::RunSpecifiedBenchmarks();
    benchmark::Shutdown();

    return 0;
}

以及重新创建 plot 的代码。

# Run the benchmark binary 50 times; each run's CSV report goes to ./data/run_<index>.csv.
foreach ($run in 0..49) {
    .\Release\test_cxx.exe --benchmark_min_warmup_time=0.1 --benchmark_format=csv > "./data/run_$run.csv"
}
# Read every per-run report and append its rows to .\benchmark.csv.
Get-ChildItem ./data |
    Select-Object -ExpandProperty FullName |
    Import-Csv |
    Export-Csv .\benchmark.csv -NoTypeInformation -Append
import matplotlib.ticker as tck
import matplotlib.pyplot as plt
import csv

def read_data(path='./build/benchmark.csv'):
    """Aggregate the merged benchmark CSV into per-benchmark statistics.

    Args:
        path: CSV file to read. It must provide the columns ``name``,
            ``iterations``, ``real_time`` and ``cpu_time``.

    Returns:
        A dict mapping each benchmark name to
        ``{'min': {...}, 'max': {...}, 'avg': {...}}``, where every inner dict
        holds one float per metric. A file without data rows yields ``{}``.
    """
    metrics = ('iterations', 'real_time', 'cpu_time')
    data = {}
    # Rows seen per benchmark name. The average is taken per name, so it stays
    # correct even when some benchmarks have more rows than others.
    counts = {}

    with open(path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            name = row['name']
            sample = {metric: float(row[metric]) for metric in metrics}

            if name not in data:
                data[name] = {
                    'min': dict(sample),
                    'max': dict(sample),
                    'avg': dict(sample),  # running sum until the final division
                }
                counts[name] = 1
                continue

            counts[name] += 1
            entry = data[name]
            for metric, value in sample.items():
                entry['avg'][metric] += value
                entry['min'][metric] = min(entry['min'][metric], value)
                entry['max'][metric] = max(entry['max'][metric], value)

    for name, entry in data.items():
        for metric in entry['avg']:
            entry['avg'][metric] /= counts[name]
    return data

def plot_data(data, key):
    """Split the aggregated statistics into plain and 'struct' series.

    Args:
        data: Mapping of benchmark name to ``{'min', 'max', 'avg'}`` dicts.
        key: Metric to extract from each statistic (e.g. ``'iterations'``).

    Returns:
        ``(labels, values, labels_struct, values_struct)``. Names containing
        ``'struct'`` go to the ``*_struct`` pair with ``'struct '`` stripped
        from the label; all others go to the plain pair. ``'/'`` in a name
        becomes a line break in its label.
    """
    plain = ([], {'max': [], 'avg': [], 'min': []})
    wrapped = ([], {'max': [], 'avg': [], 'min': []})

    for name, stats in data.items():
        label = name.replace('/', '\n')
        if 'struct' in name:
            names, series = wrapped
            label = label.replace('struct ', '')
        else:
            names, series = plain

        names.append(label)
        for stat in ('min', 'max', 'avg'):
            series[stat].append(stats[stat][key])

    return plain[0], plain[1], wrapped[0], wrapped[1]

# Width of every bar.
thickness = 0.8
# Metric that is plotted (a column of the benchmark CSV).
benckmark_value = 'iterations'
# Bar colours, applied in the order plot_data returns its series: max, avg, min.
colors = ['#1dad2b', '#af1c23', '#e0e0e0']

if __name__ == '__main__':

    data = read_data()
    labels, values, labels_struct, values_struct = plot_data(data, benckmark_value)

    fig = plt.figure(layout="constrained")
    spec = fig.add_gridspec(ncols=2, nrows=1)
    int_formater = lambda x, p: format(int(x), ',')

    panels = [
        (spec[0, 0], 'std::vector<T>', labels, values),
        (spec[0, 1], 'std::vector<Filler<T>>', labels_struct, values_struct),
    ]

    for cell, title, names, series in panels:
        ax = fig.add_subplot(cell)
        ax.set_ylabel(benckmark_value)
        ax.set_title(title)
        ax.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))

        # The series are drawn on top of each other (max, then avg, then min),
        # so the visible bands are the ranges between them.
        for color, heights in zip(colors, series.values()):
            ax.bar(names, heights, thickness, color=color)

        # Rotate the labels only after the bars have created the category
        # ticks; labelling before any ticks exist attaches the text to the
        # default ticks instead.
        ax.tick_params(axis='x', labelrotation=90)

    plt.show()
暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2023 STACKOOM.COM