简体   繁体   English

浮点数学矢量化,但整数数学不会

[英]Floating point math vectorizes, but integer math does not

I have a tight inner loop that is sucking up quite a bit of CPU power. 我有一个紧凑的内层循环,占用了相当多的CPU资源。 So I'm trying to optimize it. 所以我正在努力优化它。 I've got two versions of the code, one that operates on floating point numbers, the other on uint8_t. 我有两个版本的代码,一个用于浮点数,另一个用于uint8_t。 The floating point version is a bit faster, because it vectorizes, whereas the integer math does not. 浮点版本稍快一些,因为它会向量化,而整数运算则不会。 Is there any way to force the integer math to vectorize? 有没有办法强制整数运算进行向量化? Is that something that's even possible? 这有可能吗? Finally, would it be useful? 最后,这样做会有用吗? Or is integer math overrated? 还是说整数运算被高估了?

#include <algorithm>
#include <array>
using Vec3F = std::array<float, 3>;

// Spreads `source` into `dest`, attenuated by `drop`.
//
// If every channel of `source` is already <= the matching channel of `dest`
// plus `drop`, `dest` is returned unchanged. Otherwise each channel of `dest`
// becomes the larger of its current value and the source channel reduced by
// `drop * source[c] / peak`, where `peak` is the largest source channel (so the
// largest channel loses exactly `drop`, the others proportionally less).
//
// source: values to spread from (read only).
// dest:   current values; taken by value, updated and returned.
// drop:   amount subtracted from the largest source channel.
//
// NOTE: `peak` is not checked against zero; a zero peak divides by zero and
// yields the usual IEEE NaN/inf results.
Vec3F v3fSpread(Vec3F const& source, Vec3F dest, float drop) {
  // Early out: nothing to raise when every source channel is within `drop`
  // of dest. The `<=` form is kept so NaN inputs take the same path as before.
  bool withinDrop = true;
  for (Vec3F::size_type c = 0; c < source.size() && withinDrop; ++c) {
    withinDrop = source[c] <= dest[c] + drop;
  }
  if (withinDrop) {
    return dest;
  }

  const float peak = std::max(source[0], std::max(source[1], source[2]));
  for (Vec3F::size_type c = 0; c < dest.size(); ++c) {
    const float attenuated = source[c] - drop * source[c] / peak;
    dest[c] = std::max(attenuated, dest[c]);
  }
  return dest;
}

Which assembles into (56 lines): 它被汇编成(56行):

v3fSpread(std::array<float, 3ul> const&, std::array<float, 3ul>, float):
movq    %xmm0, -40(%rsp)
movaps  %xmm2, %xmm0
movd    %xmm1, %rax
movss   -40(%rsp), %xmm6
movl    %eax, -32(%rsp)
movss   (%rdi), %xmm1
addss   %xmm6, %xmm0
movss   -36(%rsp), %xmm7
movss   -32(%rsp), %xmm8
movss   4(%rdi), %xmm3
ucomiss %xmm1, %xmm0
jb  .L24
movaps  %xmm2, %xmm0
movss   8(%rdi), %xmm4
addss   %xmm7, %xmm0
ucomiss %xmm3, %xmm0
jae .L4
.L5:
movaps  %xmm4, %xmm0
movaps  %xmm1, %xmm5
maxss   %xmm3, %xmm0
mulss   %xmm2, %xmm5
maxss   %xmm1, %xmm0
divss   %xmm0, %xmm5
subss   %xmm5, %xmm1
movaps  %xmm2, %xmm5
mulss   %xmm3, %xmm5
mulss   %xmm4, %xmm2
maxss   %xmm1, %xmm6
divss   %xmm0, %xmm5
movss   %xmm6, -24(%rsp)
divss   %xmm0, %xmm2
subss   %xmm5, %xmm3
maxss   %xmm3, %xmm7
movss   %xmm7, -20(%rsp)
movq    -24(%rsp), %xmm0
subss   %xmm2, %xmm4
maxss   %xmm4, %xmm8
movss   %xmm8, -16(%rsp)
movd    -16(%rsp), %xmm1
ret
 .L24:
movss   8(%rdi), %xmm4
jmp .L5
.L4:
movaps  %xmm2, %xmm0
addss   %xmm8, %xmm0
ucomiss %xmm4, %xmm0
jb  .L5
movss   %xmm6, -24(%rsp)
movss   %xmm7, -20(%rsp)
movss   %xmm8, -16(%rsp)
movq    -24(%rsp), %xmm0
movd    -16(%rsp), %xmm1
ret

And: 和:

#include <algorithm>
#include <array>
#include <inttypes.h>
using Vec3B = std::array<uint8_t, 3>;
using Vec3I = std::array<int32_t, 3>;

// Spreads `source` into `dest`, attenuated by `drop` (8-bit integer version).
//
// All arithmetic is done on int32_t copies of the channels. If every source
// channel is already <= the matching dest channel plus `drop`, `dest` is
// returned unchanged. Otherwise each dest channel becomes the larger of its
// current value and the source channel reduced by `drop * source[c] / denom`
// (truncating integer division), where `denom` is the largest source channel.
//
// source: values to spread from (read only).
// dest:   current values; taken by value, updated and returned.
// drop:   amount subtracted from the largest source channel.
//
// When all source channels are 0 the reduction term is 0 for every channel, so
// `dest` is returned as-is instead of dividing by zero. That case is only
// reachable with a negative `drop`.
// NOTE: results are narrowed back to uint8_t without saturation; a negative
// `drop` can push a channel above 255, which then wraps.
Vec3B v3bSpread(Vec3B const& source, Vec3B dest, int32_t drop) {
  const Vec3I intSource = {source[0], source[1], source[2]};
  const Vec3I intDest = {dest[0], dest[1], dest[2]};

  if (intSource[0] <= intDest[0] + drop && intSource[1] <= intDest[1] + drop &&
      intSource[2] <= intDest[2] + drop) {
    return dest;
  }

  const int32_t denom = std::max(intSource[0], std::max(intSource[1], intSource[2]));
  if (denom == 0) {
    // Integer division by zero is undefined behaviour (SIGFPE on x86).
    return dest;
  }

  for (Vec3B::size_type c = 0; c < dest.size(); ++c) {
    const int32_t attenuated = intSource[c] - drop * intSource[c] / denom;
    dest[c] = static_cast<uint8_t>(std::max<int32_t>(attenuated, intDest[c]));
  }
  return dest;
}

Which assembles into (68 lines): 它被汇编成(68行):

v3bSpread(std::array<unsigned char, 3ul> const&, std::array<unsigned char, 3ul>, unsigned int):
pushq   %rbx
movzbl  %sil, %r11d
movl    %esi, %ebx
movzbl  (%rdi), %r8d
movzbl  %r11b, %eax
shrw    $8, %bx
addl    %edx, %eax
shrl    $16, %esi
movzbl  1(%rdi), %r10d
movl    %edx, %r9d
movzbl  2(%rdi), %edi
cmpl    %eax, %r8d
ja  .L4
movzbl  %bl, %eax
addl    %edx, %eax
cmpl    %eax, %r10d
jbe .L10
.L4:
cmpl    %edi, %r10d
movl    %edi, %ecx
movl    %r8d, %eax
cmovge  %r10d, %ecx
cmpl    %ecx, %r8d
cmovge  %r8d, %ecx
imull   %r9d, %eax
xorl    %edx, %edx
divl    %ecx
subl    %eax, %r8d
movl    %r10d, %eax
cmpl    %r11d, %r8d
cmovge  %r8d, %r11d
imull   %r9d, %eax
xorl    %edx, %edx
movb    %r11b, -32(%rsp)
divl    %ecx
movzbl  %bl, %edx
subl    %eax, %r10d
movl    %edi, %eax
cmpl    %edx, %r10d
cmovl   %edx, %r10d
imull   %r9d, %eax
xorl    %edx, %edx
movb    %r10b, -31(%rsp)
divl    %ecx
subl    %eax, %edi
movzbl  %sil, %eax
cmpl    %eax, %edi
cmovl   %eax, %edi
movb    %dil, -30(%rsp)
.L6:
movzbl  -31(%rsp), %eax
movzbl  -32(%rsp), %edx
movb    %al, %dh
movzbl  -30(%rsp), %eax
popq    %rbx
salq    $16, %rax
orq %rdx, %rax
ret
.L10:
movzbl  %sil, %eax
addl    %edx, %eax
cmpl    %eax, %edi
ja  .L4
movb    %r11b, -32(%rsp)
movb    %bl, -31(%rsp)
movb    %sil, -30(%rsp)
jmp .L6

What makes you think the generated floating-point code is vectorized? 是什么让你认为生成的浮点代码是矢量化的? All the SSE instructions I see are -ss suffixed, i.e., Scalar-Single, not Packed-Single. 我看到的所有SSE指令都是-ss后缀,即Scalar-Single,而不是Packed-Single。

As far as the possibility of vectorizing this code goes, I don't think it's possible to vectorize the integer code with SSEx, since there are no SSE integer division instructions. 至于矢量化这个代码的可能性,我不认为用SSEx矢量化整数代码是可能的,因为没有SSE整数除法指令。

Sometimes hand optimisation leads to no optimisation at all. 有时手工优化反而得不到任何优化。 Since those three vector component update statements are essentially an unrolled loop, you would have done better to keep them as a loop, as a hint to the compiler: 由于这三个向量分量更新语句本质上是一个展开的循环,您最好将它们保留为循环,以便给编译器提示:

// Per-component update written as a loop rather than three unrolled statements.
// Fragment only: relies on source, dest, drop and denom from the enclosing function.
for (int i = 0; i < 3; i++)
  dest[i] = std::max(source[i] - drop * source[i] / denom, dest[i]);

When in doubt whether GCC vectorised anything, make the tree vectoriser (much) more chatty with -ftree-vectorizer-verbose=7: 如果不确定GCC是否做了矢量化,请使用-ftree-vectorizer-verbose=7让树矢量化器输出(多得多的)诊断信息:

Integer version with loop: 带循环的整数版本:

$ gcc-4.7 -O3 -std=c++0x -msse4.2 -ftree-vectorizer-verbose=7 -funroll-loops -S vec_int.cpp

Analyzing loop at vec_int.cpp:13

...
13: not vectorized: relevant stmt not supported: D.46751_60 = D.46750_59 / prephitmp.65_135;

vec_int.cpp:6: note: vectorized 0 loops in function.

This simply indicates what mattst88 has already pointed out - no packed integer division instruction is available in SSE (nor in AVX). 这恰恰说明了mattst88已经指出的内容 - SSE(以及AVX)中都没有可用的打包整数除法指令。

Floating point version with loop: 带循环的浮点版本:

$ gcc-4.7 -O3 -std=c++0x -msse4.2 -ftree-vectorizer-verbose=7 -funroll-loops -S vec_float.cpp

Analyzing loop at vec_float.cpp:9

...
9: not vectorized: iteration count too small.
vec_float.cpp:4: note: vectorized 0 loops in function.

Again, no vectorisation - too few loop iterations. 同样,没有矢量化 - 循环迭代次数太少。 No vector luck with GCC. 用GCC没能实现矢量化。

On the other hand, the 13.0 beta version of the C++ compiler from that well-known chip vendor is able to vectorise the floating-point loop (previous versions are not able to do it) but not the integer one. 另一方面,来自那家知名芯片供应商的13.0 beta版C++编译器能够对浮点循环进行矢量化(以前的版本无法做到),但无法对整数循环进行矢量化。 Here is a sample of the assembly it produces: 以下是它生成的汇编代码示例:

# parameter 1: %rdi
# parameter 2: %xmm0 %xmm1
# parameter 3: %xmm2
movlps    %xmm0, -24(%rsp)
movss     (%rdi), %xmm9
addss     %xmm2, %xmm0
movss     %xmm1, -16(%rsp)
movss     4(%rdi), %xmm3
movss     8(%rdi), %xmm1
comiss    %xmm9, %xmm0
jae       ..B1.3        # Prob 22%
..B1.2:
lea       -16(%rsp), %rax
jmp       ..B1.6
..B1.3:
movss     -20(%rsp), %xmm0
lea       -16(%rsp), %rax
addss     %xmm2, %xmm0
comiss    %xmm3, %xmm0
jb        ..B1.6        # Prob 78%
..B1.4:
movss     (%rax), %xmm0
addss     %xmm2, %xmm0
comiss    %xmm1, %xmm0
jb        ..B1.6        # Prob 43%
..B1.5:
movsd     -24(%rsp), %xmm0
movss     -16(%rsp), %xmm1
ret
..B1.6:
movaps    %xmm2, %xmm5
maxss     %xmm1, %xmm3
movsd     (%rdi), %xmm7
maxss     %xmm3, %xmm9
movaps    %xmm9, %xmm3
; ------------------------- here starts the SIMD part
shufps    $0, %xmm3, %xmm3
rcpps     %xmm3, %xmm4
mulps     %xmm4, %xmm3
mulps     %xmm4, %xmm3
addps     %xmm4, %xmm4
shufps    $0, %xmm5, %xmm5
subps     %xmm3, %xmm4
mulps     %xmm7, %xmm5
mulps     %xmm4, %xmm5
movsd     -24(%rsp), %xmm6
subps     %xmm5, %xmm7
maxps     %xmm6, %xmm7
movlpd    %xmm7, -24(%rsp)
; ------------------------- here ends the SIMD part :)
movss     8(%rdi), %xmm8
mulss     %xmm8, %xmm2
divss     %xmm9, %xmm2
subss     %xmm2, %xmm8
maxss     (%rax), %xmm8
movss     %xmm8, (%rax)
movsd     -24(%rsp), %xmm0
movss     -16(%rsp), %xmm1
ret

(AVX code looks almost the same) (AVX代码看起来几乎一样)

Still, only two of the three iterations are vectorised, and there are some WTF parts in the generated code that make me scratch my head... 不过,三次迭代中仍然只有两次被矢量化,而且生成的代码中有一些莫名其妙的部分让我摸不着头脑......

Note that vectorisation doesn't come for free and sometimes carefully crafted serial code might be more efficient than the respective SIMD version. 请注意,矢量化不是免费的,有时精心设计的串行代码可能比相应的SIMD版本更有效。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM