![](/img/trans.png)
[英]Fastest way to sum dot product of vector of unsigned 64 bit integers using 192/256 bit integer?
[英]Vector of 64-bit double faster to dot-product than a vector of 32-bit unsigned int?
我有兩種代碼迭代大小為500的矢量設計。其中一個設計包含64位雙精度數組,第二個設計使用包含32位整數的數組。 我期待32位設計更快,因為更多有用的數據可以打包在緩存中。
編譯器MSVC,CPU Ivy Bridge,編譯64位模式。
這是代碼1,使用32位整數(在2600個 CPU周期中運行):
#include <vector>
#include <iostream>
int main(){
std::vector<unsigned int> x1;
std::vector<unsigned int> x2;
std::vector<unsigned int> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtsc();
for(int i=0; i < 500; i++){
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp();
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
產生這個asm(-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r10,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov r8,rbx
mov r9d,1F4h
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
mov edx,dword ptr [r8+r15]
m = m + (a * g);
mov ecx,edx
imul ecx,dword ptr [r8+r14]
xorps xmm0,xmm0
cvtsi2sd xmm0,rcx
addsd xmm7,xmm0
n = n + (b * g);
imul edx,dword ptr [r8]
mov eax,edx
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm8,xmm0
for(int i=0; i < 500; i++){
add r8,4
dec r9
jne main+0E5h (013F681261h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
這是代碼2,使用64位雙精度(代碼在2000個 CPU周期中運行):
#include <vector>
#include <iostream>
int main(){
std::vector<double> x1;
std::vector<double> x2;
std::vector<unsigned long long> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtscp(&p);
for(int i=0; i < 500; i++){
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp(&q);
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
這里是asm(-Os)產生的:
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r9,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov rdx,rbx
mov r8d,1F4h
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
mov rcx,qword ptr [rdx+r15]
xorps xmm1,xmm1
m = m + (a * g);
cvtsi2sd xmm1,rcx
test rcx,rcx
jns main+120h (013F32129Ch)
addsd xmm1,xmm9
movaps xmm0,xmm1
mulsd xmm0,mmword ptr [rdx+r14]
addsd xmm6,xmm0
n = n + (b * g);
mulsd xmm1,mmword ptr [rdx]
addsd xmm7,xmm1
for(int i=0; i < 500; i++){
add rdx,8
dec r8
jne main+10Ah (013F321286h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
區別在於第一個代碼中整數到雙精度的轉換(向量包含unsigned int
,產品是整數運算,但積累使用double
,在匯編程序中,這會將cvtsi2sd
指令添加到代碼中)。
在第二個代碼中,您在任何地方都使用雙打,因此您沒有轉換,代碼運行得更快。
這種差異將是更加明顯具有定點和浮點處理單元之間的嚴格區分在CPU上(在POWER平台是一個這樣的例子)。 在這方面,X86平台非常寬容。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.