简体   繁体   English

64位双精度浮点数向量的点积比32位无符号整数向量的点积更快?

[英]Vector of 64-bit double faster to dot-product than a vector of 32-bit unsigned int?

I have two designs of code iterating over vectors of size 500. One of the designs contains arrays of 64-bit doubles and the second design uses arrays containing 32-bit integers. 我有两种代码设计,都对大小为500的向量进行迭代。其中一个设计使用64位双精度浮点数数组,第二个设计使用32位整数数组。 I was expecting the 32-bit design to be quicker because more useful data can be packed in the cache. 我原以为32位设计会更快,因为缓存中可以容纳更多有用的数据。

Compiler MSVC, CPU Ivy Bridge, compiling 64-bit mode. 编译器MSVC,CPU Ivy Bridge,编译64位模式。

This is code 1, using the 32-bit ints (runs in 2600 CPU cycles): 这是代码1,使用32位整数(在2600个 CPU周期中运行):

#include <vector>
#include <iostream>

// Micro-benchmark, variant 1: dot products over three 500-element vectors of
// 32-bit unsigned integers, accumulated into doubles.
//
// Each of the 1000 outer iterations reads the time-stamp counter before and
// after the inner loop and prints the elapsed tick count followed by the two
// accumulated sums.
int main(){

    std::vector<unsigned int> x1;
    std::vector<unsigned int> x2;
    std::vector<unsigned int> x3;
    x1.resize(500);
    x2.resize(500);
    x3.resize(500);

    // Fill the inputs with small values: x1[i] = i, x2[i] = 2i, x3[i] = 4i.
    for(int i =0; i<500; i++){
        x1[i] = i;
        x2[i] = 2*i;
        x3[i] = 4*i;
    }


    int counter = 0;
    while(counter < 1000){
        // __rdtscp takes a pointer that receives an auxiliary value; the
        // benchmark never reads p or q, they only satisfy the signature.
        unsigned int p;
        unsigned int q;
        unsigned long long start = 0;
        unsigned long long end = 0;

        double m = 0;
        double n = 0;

        // Use the same intrinsic for both timestamps so the two readings are
        // taken the same way (the original mixed __rdtsc with an argument-less
        // __rdtscp call, which does not compile).
        start = __rdtscp(&p);

        for(int i=0; i < 500; i++){
            unsigned int a = x1[i];
            unsigned int b = x2[i];
            unsigned int g = x3[i];
            // The product is formed in unsigned int arithmetic and only then
            // converted to double for the addition, so every iteration pays
            // for two integer-to-double conversions.
            m = m + (a * g);
            n = n + (b * g);
        }

        end = __rdtscp(&q);

        // NOTE: m and n are streamed back-to-back with no separator, matching
        // the second variant of this benchmark.
        std::cout << (end-start) << "\t\t"<<m << n << std::endl;
        counter++;
    }
}

producing this asm (-Os): 产生这个asm(-Os):

start = __rdtscp(&p);
 rdtscp  
 lea         r8,[rbp+6Fh]  
 mov         dword ptr [r8],ecx  
 shl         rdx,20h  
 or          rax,rdx  
 mov         r10,rax  
        unsigned int p;
        unsigned int q;
        unsigned long long start = 0;
        unsigned long long end = 0;

        double m = 0;
 mov         r8,rbx  
 mov         r9d,1F4h  
            unsigned int a = x1[i];
            unsigned int b = x2[i];
            unsigned int g = x3[i];
 mov         edx,dword ptr [r8+r15]  
            m = m + (a * g);
 mov         ecx,edx  
 imul        ecx,dword ptr [r8+r14]  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rcx  
 addsd       xmm7,xmm0  
            n = n + (b * g);
 imul        edx,dword ptr [r8]  
 mov         eax,edx  
 xorps       xmm0,xmm0  
 cvtsi2sd    xmm0,rax  
 addsd       xmm8,xmm0  

        for(int i=0; i < 500; i++){
 add         r8,4  
 dec         r9  
 jne         main+0E5h (013F681261h)  
        }

        end = __rdtscp(&q);
 rdtscp  
        }

        end = __rdtscp(&q);
 lea         r8,[rbp+6Fh]  
 mov         dword ptr [r8],ecx  
 shl         rdx,20h  
 or          rdx,rax  

This is code 2, using the 64-bit doubles (code runs in 2000 CPU cycles): 这是代码2,使用64位双精度(代码在2000个 CPU周期中运行):

#include <vector>
#include <iostream>

// Micro-benchmark, variant 2: the same dot products as variant 1, but x1 and
// x2 hold 64-bit doubles and x3 holds 64-bit unsigned integers.
//
// Each of the 1000 outer iterations reads the time-stamp counter before and
// after the inner loop and prints the elapsed tick count followed by the two
// accumulated sums.
int main(){

    std::vector<double> x1;
    std::vector<double> x2;
    std::vector<unsigned long long> x3;
    x1.resize(500);
    x2.resize(500);
    x3.resize(500);

    // Fill the inputs: x1[i] = i, x2[i] = 2i (stored as double), x3[i] = 4i.
    for(int i =0; i<500; i++){
        x1[i] = i;
        x2[i] = 2*i;
        x3[i] = 4*i;
    }

    int counter = 0;
    while(counter < 1000){
        // p and q receive the auxiliary value written by __rdtscp; they are
        // never read afterwards.
        unsigned int p;
        unsigned int q;
        unsigned long long start = 0;
        unsigned long long end = 0;

        double m = 0;
        double n = 0;

        start = __rdtscp(&p);

        for(int i=0; i < 500; i++){
            double a = x1[i];
            double b = x2[i];
            unsigned long long g = x3[i];
            // g is converted to double for the mixed double * integer
            // multiplication; the multiply and add are done in double.
            m = m + (a * g);
            n = n + (b * g);
        }

        end = __rdtscp(&q);

        // NOTE: m and n are streamed back-to-back with no separator between
        // them, so the two sums run together in the output.
        std::cout << (end-start) << "\t\t"<<m << n << std::endl;
        counter++;
    }
}

and here is the asm (-Os) produced: 这里是asm(-Os)产生的:

start = __rdtscp(&p);
 rdtscp  
 lea         r8,[rbp+6Fh]  
 mov         dword ptr [r8],ecx  
 shl         rdx,20h  
 or          rax,rdx  
 mov         r9,rax  
        unsigned int p;
        unsigned int q;
        unsigned long long start = 0;
        unsigned long long end = 0;

        double m = 0;
 mov         rdx,rbx  
 mov         r8d,1F4h  
            double a = x1[i];
            double b = x2[i];
            unsigned long long g = x3[i];
 mov         rcx,qword ptr [rdx+r15]  
 xorps       xmm1,xmm1  
            m = m + (a * g);
 cvtsi2sd    xmm1,rcx  
 test        rcx,rcx  
 jns         main+120h (013F32129Ch)  
 addsd       xmm1,xmm9  
 movaps      xmm0,xmm1  
 mulsd       xmm0,mmword ptr [rdx+r14]  
 addsd       xmm6,xmm0  
            n = n + (b * g);
 mulsd       xmm1,mmword ptr [rdx]  
 addsd       xmm7,xmm1  

        for(int i=0; i < 500; i++){
 add         rdx,8  
 dec         r8  
 jne         main+10Ah (013F321286h)  
        }

        end = __rdtscp(&q);
 rdtscp  
        }

        end = __rdtscp(&q);
 lea         r8,[rbp+6Fh]  
 mov         dword ptr [r8],ecx  
 shl         rdx,20h  
 or          rdx,rax

The difference is the conversion of integers to doubles in the first code (the vectors contain unsigned int , the product is computed in integer arithmetic, but the accumulation uses double , so in the assembly each product needs its own cvtsi2sd instruction). 区别在于第一个代码中整数到双精度的转换(向量包含unsigned int ,乘积用整数运算计算,但累加使用double ,因此在汇编中每个乘积都需要一条cvtsi2sd指令)。

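In the second code, x1 and x2 already hold doubles, so only g has to be converted — one cvtsi2sd per iteration instead of two, and no integer multiplies — and the code runs faster. 在第二个代码中,x1和x2本身就是double,因此每次迭代只需转换g一次(一条cvtsi2sd而不是两条,也没有整数乘法),代码运行得更快。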

This difference would have been much more pronounced on a CPU that has a stricter distinction between the fixed and floating point processing units (the POWER platform is an example for this). 在定点与浮点处理单元区分更严格的CPU上(POWER平台就是一个例子),这种差异会更加明显。 The X86 platform is very forgiving in that respect. 在这方面,X86平台非常宽容。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

相关问题 使用 192/256 位整数求和无符号 64 位整数向量的点积的最快方法? - Fastest way to sum dot product of vector of unsigned 64 bit integers using 192/256 bit integer? 64位和32位整数 - 64-bit and 32-bit integers 如何将32位有符号整数放入64位无符号整数的高32位? - How to put 32-bit signed integer into higher 32 bits of 64-bit unsigned integer? 在Windows上,试用版代码的运行速度比32位快32倍,而在Linux上则高于64位 - Trial-division code runs 2x faster as 32-bit on Windows than 64-bit on Linux 与 32 位和 64 位平台的 int64_t 匹配的整数文字? - Integer literal that matches int64_t for both 32-bit and 64-bit platforms? 如何在一个 64 位整数中存储和使用两个 32 位带符号整数? - How to store and use two 32-bit signed int in one 64-bit int? 将有符号 32 位整数与无符号 64 位整数相加 - Sum signed 32-bit int with unsigned 64bit int uint32_t 作为向量索引在 64 位中比 size_t 具有更好的性能 - uint32_t as vector index has better performance than size_t in 64-bit 使用int32_t而不是double运行矢量点积是否更快? - Is it faster to run a vector dot product using int32_t instead of a double? 在32位Linux和64位Linux和MPFR上的long long int - long long int on 32-bit Linux vs 64-bit Linux and MPFR
 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM