[英]Under what conditions does a C++ compiler use floating-point pipelines to do integer division with run-time-known values for higher performance?
例如,
https://godbolt.org/z/W5GbYxo7o
#include<cstdint>
// Element-wise signed 32-bit division over 1024 entries.
// Compilers emit a scalar idiv per element here: x86 has no integer-division
// SIMD instruction, and full int/int semantics must be honored.
void divTest1(int * const __restrict__ val1, int * const __restrict__ val2, int * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const int dividend = val1[idx];
        const int divisor  = val2[idx];
        val3[idx] = dividend / divisor; // scalar idiv
    }
}
// Element-wise unsigned 32-bit division over 1024 entries.
// Same story as the signed case: a scalar div per element, no SIMD form.
void divTest2(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const uint32_t dividend = val1[idx];
        const uint32_t divisor  = val2[idx];
        val3[idx] = dividend / divisor; // scalar div
    }
}
// Division by a compile-time-known constant (10): compilers replace the
// div with a multiply-by-reciprocal plus shifts, which does vectorize.
void divTest3(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val3)
{
    constexpr uint32_t kDivisor = (uint32_t)10;
    for (int idx = 0; idx < 1024; ++idx)
    {
        val3[idx] = val1[idx] / kDivisor; // shifting + permutation + multiplication
    }
}
// Explicitly routes the division through float, so the compiler can use
// divps SIMD. The float round-trip is only exact while the values fit in
// a float's 24-bit significand; the truncating store back to int is the
// usual float->int conversion.
void divTest4ExplicitFloatOptimization(int * const __restrict__ val1, int * const __restrict__ val2, int * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const float num = (float)val1[idx];
        const float den = (float)val2[idx];
        val3[idx] = num / den; // divps
    }
}
// Float division by a constant; the compiler may keep divps or turn it
// into a multiply by 0.1f depending on FP flags.
void divTest5ExplicitFloatOptimization(int * const __restrict__ val1, int * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const float num = (float)val1[idx];
        val3[idx] = num / 10.0f; // divps (or *0.1f)
    }
}
// Lowering the element width to 16 bits: constant division still becomes
// shifting + multiplication, narrower lanes mean more elements per vector.
void divTest6(uint16_t * const __restrict__ val1, uint16_t * const __restrict__ val3)
{
    constexpr uint16_t kDivisor = (uint16_t)10;
    for (int idx = 0; idx < 1024; ++idx)
    {
        val3[idx] = val1[idx] / kDivisor; // shifting + multiplication
    }
}
// Lowering the element width to 8 bits: still multiply+shift codegen,
// no floating-point pipeline is used even at this tiny precision.
void divTest7(uint8_t * const __restrict__ val1, uint8_t * const __restrict__ val3)
{
    constexpr uint8_t kDivisor = (uint8_t)10;
    for (int idx = 0; idx < 1024; ++idx)
    {
        val3[idx] = val1[idx] / kDivisor; // still no float pipeline
    }
}
在上面的所有函數中,只有兩個具有顯式浮點轉換函數的函數會強制編譯器執行浮點除法 SIMD 指令,看起來它可能不會對所有輸入值執行與 idiv 相同的工作。
有沒有辦法告訴編譯器它可以假設要計算的整數在使用浮點數學的可接受范圍內? 就像是:
int val = std::assume_30bits_precision(ptr[i]);
int var = std::assume_30bits_precision(ptr2[i]);
ptrInt[i] = val / var;
或這個:
int val = std::assume_positive_24bits(ptr[i]);
int var = std::assume_positive_24bits(ptr2[i]);
ptrInt[i] = val / var;
如果沒有這種方法,我能否用 1-2 次 Newton-Raphson 迭代(或類似的浮點方法)來完成同樣的計算,並且比 idiv 更快?(被除數和除數都是運行時才知道的整數值,每次只做一個除法操作。)
float-to-int 和 int-to-float 轉換/轉換延遲是否可以隱藏在所述方法(即 Newton Raphson)的指令延遲之后?
似乎必須像下面的示例一樣創建自己的自定義模擬器。 (從 integer 到浮點數的直接按位轉換(在intMagic
函數中)為 1 到 1023 或最多尾數位的低范圍整數給出了部分正確的答案,但仍然避免了 1 次轉換/轉換延遲)(這只是一個示例,不適用於精確工作):
https://godbolt.org/z/x4c5dGndz
#include<cstdint>
#include<cmath>
#include<cstring>
#include <stdint.h> // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
// Thin wrapper over the x86 time-stamp counter; exists so call sites
// don't sprinkle __rdtsc() directly and so serialization fences can be
// enabled in one place if precise measurement boundaries are needed.
inline
uint64_t readTSC() {
    // _mm_lfence(); // optionally wait for earlier insns to retire before reading the clock
    // _mm_lfence(); // optionally block later instructions until rdtsc retires
    return __rdtsc();
}
// Baseline for the benchmark: plain unsigned 32-bit division, one scalar
// div per element.
void intTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        val3[idx] = val1[idx] / val2[idx]; // scalar idiv
    }
}
// Emulates unsigned 32-bit division through double-precision FP.
// A 32-bit integer fits exactly in a double's 53-bit significand, so the
// inputs convert without rounding; the 0.01 nudge below is the author's
// guard for quotients that might land just under an integer.
// NOTE(review): the original comment says this is only checked for
// positive values — confirm corner cases before relying on it.
void intEmulationTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const double num = val1[idx];
        const double den = val2[idx];
        double quot = num / den;
        // Fractional part via truncating cast; only nudge when it is
        // safely below 1 so the integer part can never be pushed up.
        const double frac = quot - (uint32_t)quot;
        quot += (frac < 0.99) ? 0.01 : 0.0;
        val3[idx] = quot; // truncating double -> uint32 store
    }
}
// Bit-copies each integer directly into a float's storage: for values
// below 2^23 the bits land in the mantissa field with a zero exponent,
// producing a subnormal whose value is val * 2^-149. The ratio of two
// such subnormals equals the ratio of the integers, so one vdivps plus
// one float->int conversion replaces the integer divide.
// Constraints carried over from the original author:
//  - inputs must fit in 23 bits;
//  - do NOT build with -ffast-math (FTZ/DAZ flushes these subnormals to 0);
//  - divisor must be non-zero;
//  - truncating FP->int conversion is assumed.
// warning: 10x speedup reported on Zen2 (subnormal handling is fast there);
// very slow on CPUs that take assists for subnormal arithmetic.
void intMagicTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        float num; // subnormal view of val1[idx]
        float den; // subnormal view of val2[idx]
        // Copy all 4 bytes; assumes sizeof(float) == sizeof(uint32_t).
        std::memcpy(&num, &val1[idx], sizeof(float));
        std::memcpy(&den, &val2[idx], sizeof(float));
        // The quotient is a normal float, so the store below still pays
        // one float->int conversion; a bitwise "de-normalization" of the
        // result would remove even that, but is not attempted here.
        val3[idx] = num / den; // vdivps with only 1 conversion
    }
}
// Same subnormal trick at double precision: the 64-bit integer bits are
// copied straight into a double's storage; values below 2^52 land in the
// mantissa field as a subnormal worth val * 2^-1074, and the ratio of two
// such subnormals equals the integer ratio.
// Constraints carried over from the original author:
//  - do NOT build with -ffast-math (flushes these subnormals to zero);
//  - divisor must be non-zero;
//  - truncating FP->int conversion is assumed.
// warning: 10x speedup reported on Zen2; not performance-portable.
void intMagicTestDouble(uint64_t * const __restrict__ val1, uint64_t * const __restrict__ val2, uint64_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        double num; // subnormal view of val1[idx]
        double den; // subnormal view of val2[idx]
        // Copy all 8 bytes; assumes sizeof(double) == sizeof(uint64_t).
        std::memcpy(&num, &val1[idx], sizeof(double));
        std::memcpy(&den, &val2[idx], sizeof(double));
        // The quotient is a normal double, so the truncating store still
        // costs one double->int conversion.
        val3[idx] = num / den; // one division + 1 conversion
    }
}
// Variant of the double-subnormal trick for 32-bit arrays: each uint32 is
// first widened into a 64-bit temporary so its bits can be copied into a
// double's storage, yielding a subnormal worth val * 2^-1074; the ratio of
// two such subnormals equals the integer ratio.
// Constraints carried over from the original author:
//  - do NOT build with -ffast-math (flushes these subnormals to zero);
//  - divisor must be non-zero;
//  - truncating FP->int conversion is assumed.
// warning: 10x speedup reported on Zen2; not performance-portable.
void intMagicTestDoubleTmp(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for (int idx = 0; idx < 1024; ++idx)
    {
        const uint64_t wide1 = val1[idx];
        const uint64_t wide2 = val2[idx];
        double num; // subnormal view of wide1
        double den; // subnormal view of wide2
        // Copy all 8 bytes; assumes sizeof(double) == sizeof(uint64_t).
        std::memcpy(&num, &wide1, sizeof(double));
        std::memcpy(&den, &wide2, sizeof(double));
        // The quotient is a normal double, so the truncating store still
        // costs one double->int conversion.
        val3[idx] = num / den; // one division + 1 conversion
    }
}
#include <iostream>
// Benchmark driver: times each division strategy with rdtsc and echoes
// the first ten results so the compiler cannot dead-code the kernels.
int main()
{
    // 32-bit input/output buffers shared by most kernels.
    uint32_t a[1024], b[1024], c[1024];
    for (int i = 0; i < 1024; ++i)
    {
        a[i] = 1 + i * i; b[i] = 1 + i;
    }
    // Separate 64-bit buffers for the double-storage "magic" kernel.
    uint64_t a64[1024], b64[1024], c64[1024];
    for (int i = 0; i < 1024; ++i)
    {
        a64[i] = 1 + i * i; b64[i] = 1 + i;
    }
    // Prints the first ten 32-bit results, one per line.
    // NOTE(review): the original also calls this after intMagicTestDouble,
    // which writes c64 rather than c, so those ten lines show stale data;
    // behavior is kept identical here.
    const auto show = [&c]() {
        for (int i = 0; i < 10; ++i)
            std::cout << c[i] << " " << std::endl;
    };

    std::cout << "emulation:" << std::endl;
    const auto t1 = readTSC();
    intEmulationTest(a, b, c);
    const auto t2 = readTSC();
    show();

    std::cout << "magic:" << std::endl;
    const auto t3 = readTSC();
    intMagicTest(a, b, c);
    const auto t4 = readTSC();
    show();

    std::cout << "int:" << std::endl;
    const auto t5 = readTSC();
    intTest(a, b, c);
    const auto t6 = readTSC();
    show();

    std::cout << "magic double:" << std::endl;
    const auto t7 = readTSC();
    intMagicTestDouble(a64, b64, c64);
    const auto t8 = readTSC();
    show();

    std::cout << "magic double tmp:" << std::endl;
    const auto t9 = readTSC();
    intMagicTestDoubleTmp(a, b, c);
    const auto t10 = readTSC();
    show();

    std::cout << "emulation: " << t2 - t1 << " cycles" << std::endl;
    std::cout << "magic: " << t4 - t3 << " cycles" << std::endl;
    std::cout << "int: " << t6 - t5 << " cycles" << std::endl;
    std::cout << "magic double: " << t8 - t7 << " cycles" << std::endl;
    std::cout << "magic double tmp: " << t10 - t9 << " cycles" << std::endl;
    return 0;
}
godbolt.org 上的 output:
emulation: 7784 cycles <-- should be ok for positive values only, needs more corner-case checking maybe
magic: 1708 cycles <-- not performance-portable (denormals), only 23 bits
int: 16576 cycles
magic double: 11844 cycles <-- not performance-portable
magic double tmp: 5432 cycles <-- not performance-portable
為了彌補轉換開銷和位級技巧的成本,SIMD 硬件需要更寬(如 8192 位或 16384 位);也許只有那樣,編譯器才會自動選用 FP SIMD(即使需要約 100 條指令來檢查每個極端情況,在 256 寬的管道上仍可獲得約 2.5 倍加速)。
GPU 硬件每個 warp/wavefront 有 32 條流水線,每個內核最多有 192 條流水線。 也許它在那里可用,但看起來即使使用 AVX512(至少對於需要完整 32 位精度的通用用途),x86 CPU 的增益也不大。 對於精度非常低的 integer 數學,也可以在任何地方簡單地使用浮點數(假設極端情況不是問題)。
CPU Type: AMD EPYC 7R32 (GCC v11)
emulation: 8260 cycles
magic: 1904 cycles
int: 15708 cycles (this was compiled with uint64_t)
magic double: 12544 cycles
magic double tmp: 6188 cycles
CPU Type: AMD FX(tm)-8150 Eight-Core Processor (GCC v10)
emulation: 20687 cycles
magic: 67583 cycles
int: 32914 cycles
int: 31135 cycles (this was compiled with uint64_t)
magic double: 615307 cycles
magic double tmp: 141889 cycles
CPU Type: Intel(R) Xeon(R) E-2286G CPU @ 4.00GHz
emulation: 9964 cycles
magic: 138052 cycles
int: 6477 cycles
int: 19016 cycles (this was compiled with uint64_t)
magic double: 141443 cycles
magic double tmp: 137180 cycles
CPU Type: Intel(R) Xeon(R) CPU E3-1270 V2 @ 3.50GHz
emulation: 18282 cycles
magic: 210486 cycles
int: 14436 cycles
int: 33604 cycles (this was compiled with uint64_t)
magic double: 225920 cycles
magic double tmp: 217520 cycles
CPU Type: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
emulation: 39483 cycles
magic: 153666 cycles
int: 33746 cycles (this was compiled with uint64_t)
magic double: 158076 cycles
magic double tmp: 159813 cycles
CPU Type: AMD Opteron(tm) Processor 4332 HE
emulation: 18633 cycles
magic: 114682 cycles
int: 16280 cycles
int: 31070 cycles (this was compiled with uint64_t)
magic double: 504295 cycles
magic double tmp: 104919 cycles
CPU Type: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
emulation: 3448 cycles <--- avx512:
magic: 13296 cycles
int: 7676 cycles
int: 84110 cycles (this was compiled with uint64_t)
magic double: 178162 cycles
magic double tmp: 27662 cycles
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.