Why is 'resize-remove' faster than 'erase-remove' on vectors?
When it comes to removing multiple elements from a container, C++ has the "erase-remove" idiom, and there is some discussion of an alternative "resize-remove" approach, e.g. here. People say "erase-remove" is better than "resize-remove", but according to my tests the latter is (slightly) faster on vectors. So, should I use "resize-remove" when working with vectors?
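For reference, a minimal sketch of the two idioms in question (the function drop_below is purely illustrative), plus the C++20 std::erase_if shorthand, which performs erase-remove internally:

#include <algorithm>
#include <vector>

// Only one of the three variants would be used at a time.
void drop_below(std::vector<int> &v, int thresh) {
    auto pred = [thresh](int x) { return x < thresh; };

    // "erase-remove":
    v.erase(std::remove_if(v.begin(), v.end(), pred), v.end());

    // "resize-remove" (same result):
    // v.resize(std::distance(v.begin(), std::remove_if(v.begin(), v.end(), pred)));

    // C++20 shorthand that wraps erase-remove:
    // std::erase_if(v, pred);
}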
Here is my benchmark code:
#include <benchmark/benchmark.h>
#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
constexpr size_t N_ELEMS = 1000000;
constexpr int MAX_VAL = N_ELEMS / 10;
constexpr int THRESH = MAX_VAL / 5 * 3;
static vector<int> generate_input() {
    vector<int> nums(N_ELEMS);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dist(0, N_ELEMS);
    std::generate(nums.begin(), nums.end(), std::bind(dist, std::ref(gen)));
    return std::move(nums);
}

static void bm_erase_remove(benchmark::State &state) {
    for (auto _ : state) {
        state.PauseTiming();
        auto nums = generate_input();
        state.ResumeTiming();
        nums.erase(std::remove_if(nums.begin(), nums.end(),
                                  [](int x) { return x < THRESH; }),
                   nums.end());
        benchmark::DoNotOptimize(nums);
    }
}
BENCHMARK(bm_erase_remove);

static void bm_resize_remove(benchmark::State &state) {
    for (auto _ : state) {
        state.PauseTiming();
        auto nums = generate_input();
        state.ResumeTiming();
        nums.resize(std::distance(
            nums.begin(), std::remove_if(nums.begin(), nums.end(),
                                         [](int x) { return x < THRESH; })));
        benchmark::DoNotOptimize(nums);
    }
}
BENCHMARK(bm_resize_remove);

BENCHMARK_MAIN();
Output:
$ g++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
2023-05-24T20:07:22+08:00
Running ./a.out
Run on (16 X 3193.91 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 512 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 0.16, 0.14, 0.16
-----------------------------------------------------------
Benchmark             Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove     822789 ns       759162 ns          838
bm_resize_remove    818217 ns       754749 ns          935
The difference is bigger with clang++:
$ clang++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
Load Average: 0.25, 0.18, 0.17
-----------------------------------------------------------
Benchmark             Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove    1165085 ns      1074667 ns          611
bm_resize_remove    958856 ns       884584 ns          782
Additional information: g++ is version 13.1.1 and clang++ is version 15.0.7, running on kernel 5.15.90.1-microsoft-standard-WSL2 with an AMD Ryzen 7 6800H with Radeon Graphics.
Update: Interestingly, when I run the benchmarks separately (using the benchmark_filter option), the results are the same. Is this caused by caching? If so, how does that caching mechanism work?
Update (May 25, 2023): if the two BENCHMARK statements are swapped, the results are exactly reversed.
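Both updates point at run-ordering and warm-up effects (CPU frequency ramp-up, cache and allocator state carried over from the previous benchmark). One way to make such effects visible with Google Benchmark is to repeat each benchmark and report aggregates, e.g. by replacing the two BENCHMARK(...) lines above with the sketch below (newer library versions also accept a --benchmark_enable_random_interleaving=true command-line flag):

// Run each benchmark 10 times and report only the aggregates (mean/median/stddev),
// which makes run-order and warm-up effects much easier to spot.
BENCHMARK(bm_erase_remove)->Repetitions(10)->ReportAggregatesOnly(true);
BENCHMARK(bm_resize_remove)->Repetitions(10)->ReportAggregatesOnly(true);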
They perform roughly the same, and that is because std::remove_if is the only function that actually modifies the array; any difference then comes from the other calls: std::vector::resize (supposedly) reallocating to fit the new size (std::distance just returns the new length, so it is negligible), while std::vector::erase merely adjusts the container, making it slightly faster.
As @Peter Cordes pointed out, "it's actually guaranteed not to reallocate": std::vector::resize does not reallocate the vector when the new size is smaller, so any difference would have to come from extra element moves, which neither erase nor resize performs.
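A minimal standalone sketch (not from the original answer) that demonstrates this guarantee: shrinking through either erase or resize leaves capacity() untouched.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    std::vector<int> a(1000, 1), b(1000, 1);
    const auto cap_a = a.capacity(), cap_b = b.capacity();
    auto pred = [](int) { return true; };  // worst case: remove every element

    a.erase(std::remove_if(a.begin(), a.end(), pred), a.end());
    b.resize(std::distance(b.begin(), std::remove_if(b.begin(), b.end(), pred)));

    // Both paths only destroy trailing elements; the underlying buffer is kept.
    assert(a.empty() && b.empty());
    assert(a.capacity() == cap_a && b.capacity() == cap_b);
}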
To be able to measure a difference, generate_input() needs to return the same vector for every test. In your implementation each call returns a fresh random vector, which makes it impossible to tell run-to-run variation apart from input-to-input variation.
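A minimal sketch of such a generate_input() (fixed seed plus a cached vector, so every iteration starts from identical data; the constants mirror the question's code, and the benchmark code below applies the same idea inside Foo<T>::generate_input):

#include <algorithm>
#include <random>
#include <vector>

constexpr std::size_t N_ELEMS = 1000000;
constexpr unsigned RAND_SEED = 8888;

// Generated once with a fixed seed, then copied on every call,
// so all benchmark iterations see identical input data.
static std::vector<int> generate_input() {
    static const std::vector<int> cached = [] {
        std::vector<int> nums(N_ELEMS);
        std::mt19937 gen(RAND_SEED);
        std::uniform_int_distribution<> dist(0, N_ELEMS);
        std::generate(nums.begin(), nums.end(), [&] { return dist(gen); });
        return nums;
    }();
    return cached;
}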
On that note, @463035818_is_not_a_number raised an interesting point, but for the same reason as before there is no difference between those functions. That does not mean the same holds for a struct, where the penalty for a mispredicted branch is much larger, which makes std::remove_if the main target for optimization.
In the two figures below (50 runs on Windows, on a 1600X (6x3.6GHz) with 1067MHz RAM), green is the range between the best run and the average, and red is the range from the average to the worst result; this shows how much the runs vary.
The one thing I really don't know is why simple_remove<u64> performs so badly; uProf does show me that the movs perform terribly, but no differently than in the other functions.
#include <cstdint>
#include <cstring>   // memset / memcpy
#include <random>
#include <string>
#include <typeinfo>  // typeid
#include <vector>
#include <tuple>
#include <benchmark/benchmark.h>

#pragma warning(disable:4267) // Conversion errors
#pragma warning(disable:4244) // Conversion errors

constexpr int N_ELEMS = 500;
constexpr int RAND_SEED = 8888;
template <typename T>
struct Filler {
    T * ptr = nullptr;
    int64_t padd [sizeof(T)];

    Filler() {
        ptr = new T(0);
        memset(padd, 0, sizeof(T) * 8);
    }
    Filler(const T num){
        ptr = new T(num);
        for (int64_t& e : padd){
            e = num;
        }
    }
    Filler(const Filler& other){
        ptr = new T(*other.ptr);
        memcpy(padd, other.padd, sizeof(T) * 8);
    }
    ~Filler() {
        delete ptr;
    }
    Filler& operator=(Filler const& other){
        memcpy(padd, other.padd, sizeof(T) * 8);
        *ptr = *other.ptr;
        return *this;
    }

    inline bool operator < (const Filler& other) { return *ptr < *other.ptr; }
    inline bool operator <= (const Filler& other) { return *ptr <= *other.ptr; }
    inline bool operator > (const Filler& other) { return *ptr > *other.ptr; }
    inline bool operator >= (const Filler& other) { return *ptr >= *other.ptr; }
    inline bool operator == (const Filler& other) { return *ptr == *other.ptr; }
    inline bool operator != (const Filler& other) { return *ptr != *other.ptr; }

    inline bool operator < (const T other) { return *ptr < other; }
    inline bool operator <= (const T other) { return *ptr <= other; }
    inline bool operator > (const T other) { return *ptr > other; }
    inline bool operator >= (const T other) { return *ptr >= other; }
    inline bool operator == (const T other) { return *ptr == other; }
    inline bool operator != (const T other) { return *ptr != other; }
};
static size_t THRESH;
template <typename T>
struct Foo {
    static std::vector<T> generate_input(size_t max = 0) {
        static size_t dist_max = 0;
        static std::vector<T> nums;
        if (nums.empty() || max){
            if (max) {
                THRESH = max / 2;
                dist_max = max;
            }
            std::mt19937 gen(RAND_SEED);
            std::uniform_int_distribution<uint64_t> dist(0, dist_max);
            for (auto& n : nums = std::vector<T>(N_ELEMS)){
                n = T(dist(gen));
            }
        }
        return nums;
    }

    static void just_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            std::ignore = std::remove_if(
                nums.begin(), nums.end(),
                [](T x) { return x < THRESH; }
            );
            benchmark::DoNotOptimize(nums);
        }
    }

    static void erase_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            nums.erase(
                std::remove_if(
                    nums.begin(), nums.end(),
                    [](T x) { return x < THRESH; }
                ),
                nums.end()
            );
            benchmark::DoNotOptimize(nums);
        }
    }

    static void resize_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            nums.resize(
                std::distance(
                    nums.begin(),
                    std::remove_if(
                        nums.begin(), nums.end(),
                        [](T x) { return x < THRESH; }
                    )
                )
            );
            benchmark::DoNotOptimize(nums);
        }
    }

    static void simple_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            T * n = &nums.front();
            T * m = &nums.front();
            const T thresh = T(THRESH);
            const T * back = &nums.back();
            do {
                if (*m >= thresh){
                    *(n++) = std::move(*m);
                }
            } while (m++ < back);
            nums.resize(n - &nums.front());
            benchmark::DoNotOptimize(nums);
        }
    }

    static void simple_remove_unroll(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            T * n = &nums.front();
            T * m = &nums.front();
            const T thresh = T(THRESH);
            const T * back = &nums.back();
            switch (nums.size() % 4) {
            case 3:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
            case 2:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
            case 1:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
            }
            do {
                if (*(m + 0) >= thresh){ *(n++) = std::move(*(m + 0)); }
                if (*(m + 1) >= thresh){ *(n++) = std::move(*(m + 1)); }
                if (*(m + 2) >= thresh){ *(n++) = std::move(*(m + 2)); }
                if (*(m + 3) >= thresh){ *(n++) = std::move(*(m + 3)); }
                m += 4;
            } while (m < back);
            nums.resize(n - &nums.front());
            benchmark::DoNotOptimize(nums);
        }
    }
};
template<typename T>
void benchmark_batch(size_t max_num) {
    std::string type = typeid(T).name();
    Foo<T>::generate_input(max_num);
    benchmark::RegisterBenchmark(
        std::string("just_remove/") + type,
        Foo<T>::just_remove
    );
    benchmark::RegisterBenchmark(
        std::string("erase_remove/") + type,
        Foo<T>::erase_remove
    );
    benchmark::RegisterBenchmark(
        std::string("resize_remove/") + type,
        Foo<T>::resize_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove/") + type,
        Foo<T>::simple_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove_unroll/") + type,
        Foo<T>::simple_remove_unroll
    );
}

int main(int argc, char** argv) {
    benchmark_batch<uint8_t>(INT8_MAX);
    benchmark_batch<uint32_t>(INT32_MAX);
    benchmark_batch<uint64_t>(INT64_MAX);
    benchmark_batch<Filler<uint8_t>>(INT8_MAX);
    benchmark_batch<Filler<uint32_t>>(INT32_MAX);
    benchmark_batch<Filler<uint64_t>>(INT64_MAX);

    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    benchmark::Shutdown();
    return 0;
}
And the code to recreate the plots:
0..49 | % { .\Release\test_cxx.exe --benchmark_min_warmup_time=0.1 --benchmark_format=csv > "./data/run_$_.csv" }
Get-ChildItem ./data | Select-Object -ExpandProperty FullName | Import-Csv | Export-Csv .\benchmark.csv -NoTypeInformation -Append
import matplotlib.ticker as tck
import matplotlib.pyplot as plt
import csv


def read_data():
    data = {}
    test_len = 0
    with open('./build/benchmark.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            test_len += 1
            name = row['name']
            if not name in data:
                data[name] = {
                    'min': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                    'max': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                    'avg': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                }
            else:
                for k in ['iterations', 'real_time', 'cpu_time']:
                    data[name]['avg'][k] += float(row[k])
                    if float(row[k]) < float(data[name]['min'][k]):
                        data[name]['min'][k] = float(row[k])
                    if float(row[k]) > float(data[name]['max'][k]):
                        data[name]['max'][k] = float(row[k])
    test_len /= len(data.keys())
    for k in data:
        for kk in data[k]['avg']:
            data[k]['avg'][kk] /= test_len
    return data


def plot_data(data, key):
    labels = []
    values = {
        'max': [], 'avg': [], 'min': [],
    }
    labels_struct = []
    values_struct = {
        'max': [], 'avg': [], 'min': [],
    }
    for k in list(data.keys()):
        if 'struct' in k:
            labels_struct.append(k.replace('/', '\n').replace('struct ', ''))
            values_struct['min'].append(data[k]['min'][key])
            values_struct['max'].append(data[k]['max'][key])
            values_struct['avg'].append(data[k]['avg'][key])
        else:
            labels.append(k.replace('/', '\n'))
            values['min'].append(data[k]['min'][key])
            values['max'].append(data[k]['max'][key])
            values['avg'].append(data[k]['avg'][key])
    return labels, values, labels_struct, values_struct


thickness = 0.8
benckmark_value = 'iterations'
colors = ['#1dad2b', '#af1c23', '#e0e0e0']

if __name__ == '__main__':
    data = read_data()
    labels, values, labels_struct, values_struct = plot_data(data, benckmark_value)
    fig = plt.figure(layout="constrained")
    spec = fig.add_gridspec(ncols=2, nrows=1)
    int_formater = lambda x, p: format(int(x), ',')

    ax0 = fig.add_subplot(spec[0, 0])
    ax0.set_ylabel(benckmark_value)
    ax0.set_title('std::vector<T>')
    ax0.set_xticklabels(labels, rotation=90)
    ax0.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))

    ax1 = fig.add_subplot(spec[0, 1])
    ax1.set_ylabel(benckmark_value)
    ax1.set_title('std::vector<Filler<T>>')
    ax1.set_xticklabels(labels_struct, rotation=90)
    ax1.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))

    for i, (k, v) in enumerate(values.items()):
        ax0.bar(labels, v, thickness, color=colors[i])
    for i, (k, v) in enumerate(values_struct.items()):
        ax1.bar(labels_struct, v, thickness, color=colors[i])

    plt.show()