Floating point math vectorizes, but integer math does not

Question

I have a tight inner loop that is sucking up quite a bit of CPU power, so I'm trying to optimize it. I've got two versions of the code: one that operates on floating-point numbers, the other on uint8_t. The floating-point version is a bit faster, because it vectorizes, whereas the integer math does not. Is there any way to force the integer math to vectorize? Is that something that's even possible? Finally, would it be useful? Or is integer math overrated?

#include <algorithm>
#include <array>
// Fixed-size vector of three floats.
using Vec3F = std::array<float, 3>;

// Combines `source` into `dest`, reducing `source` by `drop` first.
//
// If every component of `source` is <= the matching component of `dest` plus
// `drop`, `dest` is returned unchanged. Otherwise component i of the result is
//   max(source[i] - drop * source[i] / peak, dest[i])
// where `peak` is the largest component of `source`, so the largest component
// is lowered by exactly `drop` and the others proportionally less.
Vec3F v3fSpread(Vec3F const& source, Vec3F dest, float drop) {
  // Guard clause: nothing in `source` exceeds `dest` by more than `drop`.
  const bool covered = source[0] <= dest[0] + drop &&
                       source[1] <= dest[1] + drop &&
                       source[2] <= dest[2] + drop;
  if (covered) {
    return dest;
  }

  const float peak = std::max(source[0], std::max(source[1], source[2]));

  // The three updates are deliberately left unrolled, one per component.
  const float lowered0 = source[0] - drop * source[0] / peak;
  const float lowered1 = source[1] - drop * source[1] / peak;
  const float lowered2 = source[2] - drop * source[2] / peak;

  dest[0] = std::max(lowered0, dest[0]);
  dest[1] = std::max(lowered1, dest[1]);
  dest[2] = std::max(lowered2, dest[2]);
  return dest;
}

Which assembles into (56 lines):

v3fSpread(std::array<float, 3ul> const&, std::array<float, 3ul>, float):
movq    %xmm0, -40(%rsp)
movaps  %xmm2, %xmm0
movd    %xmm1, %rax
movss   -40(%rsp), %xmm6
movl    %eax, -32(%rsp)
movss   (%rdi), %xmm1
addss   %xmm6, %xmm0
movss   -36(%rsp), %xmm7
movss   -32(%rsp), %xmm8
movss   4(%rdi), %xmm3
ucomiss %xmm1, %xmm0
jb  .L24
movaps  %xmm2, %xmm0
movss   8(%rdi), %xmm4
addss   %xmm7, %xmm0
ucomiss %xmm3, %xmm0
jae .L4
.L5:
movaps  %xmm4, %xmm0
movaps  %xmm1, %xmm5
maxss   %xmm3, %xmm0
mulss   %xmm2, %xmm5
maxss   %xmm1, %xmm0
divss   %xmm0, %xmm5
subss   %xmm5, %xmm1
movaps  %xmm2, %xmm5
mulss   %xmm3, %xmm5
mulss   %xmm4, %xmm2
maxss   %xmm1, %xmm6
divss   %xmm0, %xmm5
movss   %xmm6, -24(%rsp)
divss   %xmm0, %xmm2
subss   %xmm5, %xmm3
maxss   %xmm3, %xmm7
movss   %xmm7, -20(%rsp)
movq    -24(%rsp), %xmm0
subss   %xmm2, %xmm4
maxss   %xmm4, %xmm8
movss   %xmm8, -16(%rsp)
movd    -16(%rsp), %xmm1
ret
 .L24:
movss   8(%rdi), %xmm4
jmp .L5
.L4:
movaps  %xmm2, %xmm0
addss   %xmm8, %xmm0
ucomiss %xmm4, %xmm0
jb  .L5
movss   %xmm6, -24(%rsp)
movss   %xmm7, -20(%rsp)
movss   %xmm8, -16(%rsp)
movq    -24(%rsp), %xmm0
movd    -16(%rsp), %xmm1
ret

And:

#include <algorithm>
#include <array>
#include <inttypes.h>
// Three unsigned bytes, and the 32-bit signed form used for intermediate math.
using Vec3B = std::array<uint8_t, 3>;
using Vec3I = std::array<int32_t, 3>;

// Integer counterpart of the float spread: combines `source` into `dest`,
// reducing `source` by `drop` first.
//
// Components are widened to int32_t before any arithmetic. If every widened
// component of `source` is <= the matching component of `dest` plus `drop`,
// `dest` is returned unchanged. Otherwise component i of the result is
//   max(source[i] - drop * source[i] / peak, dest[i])
// using truncating integer division, where `peak` is the largest component of
// `source`; the result is then narrowed back to uint8_t.
Vec3B v3bSpread(Vec3B const& source, Vec3B dest, int32_t drop) {
  const Vec3I wideSrc = {source[0], source[1], source[2]};
  const Vec3I wideDst = {dest[0], dest[1], dest[2]};

  // Guard clause: nothing in `source` exceeds `dest` by more than `drop`.
  const bool covered = wideSrc[0] <= wideDst[0] + drop &&
                       wideSrc[1] <= wideDst[1] + drop &&
                       wideSrc[2] <= wideDst[2] + drop;
  if (covered) {
    return dest;
  }

  const int32_t peak = std::max(wideSrc[0], std::max(wideSrc[1], wideSrc[2]));

  // The three updates are deliberately left unrolled, one per component.
  const int32_t lowered0 = wideSrc[0] - drop * wideSrc[0] / peak;
  const int32_t lowered1 = wideSrc[1] - drop * wideSrc[1] / peak;
  const int32_t lowered2 = wideSrc[2] - drop * wideSrc[2] / peak;

  dest[0] = static_cast<uint8_t>(std::max<int32_t>(lowered0, wideDst[0]));
  dest[1] = static_cast<uint8_t>(std::max<int32_t>(lowered1, wideDst[1]));
  dest[2] = static_cast<uint8_t>(std::max<int32_t>(lowered2, wideDst[2]));
  return dest;
}

Which assembles into (68 lines):

v3bSpread(std::array<unsigned char, 3ul> const&, std::array<unsigned char, 3ul>, unsigned int):
pushq   %rbx
movzbl  %sil, %r11d
movl    %esi, %ebx
movzbl  (%rdi), %r8d
movzbl  %r11b, %eax
shrw    $8, %bx
addl    %edx, %eax
shrl    $16, %esi
movzbl  1(%rdi), %r10d
movl    %edx, %r9d
movzbl  2(%rdi), %edi
cmpl    %eax, %r8d
ja  .L4
movzbl  %bl, %eax
addl    %edx, %eax
cmpl    %eax, %r10d
jbe .L10
.L4:
cmpl    %edi, %r10d
movl    %edi, %ecx
movl    %r8d, %eax
cmovge  %r10d, %ecx
cmpl    %ecx, %r8d
cmovge  %r8d, %ecx
imull   %r9d, %eax
xorl    %edx, %edx
divl    %ecx
subl    %eax, %r8d
movl    %r10d, %eax
cmpl    %r11d, %r8d
cmovge  %r8d, %r11d
imull   %r9d, %eax
xorl    %edx, %edx
movb    %r11b, -32(%rsp)
divl    %ecx
movzbl  %bl, %edx
subl    %eax, %r10d
movl    %edi, %eax
cmpl    %edx, %r10d
cmovl   %edx, %r10d
imull   %r9d, %eax
xorl    %edx, %edx
movb    %r10b, -31(%rsp)
divl    %ecx
subl    %eax, %edi
movzbl  %sil, %eax
cmpl    %eax, %edi
cmovl   %eax, %edi
movb    %dil, -30(%rsp)
.L6:
movzbl  -31(%rsp), %eax
movzbl  -32(%rsp), %edx
movb    %al, %dh
movzbl  -30(%rsp), %eax
popq    %rbx
salq    $16, %rax
orq %rdx, %rax
ret
.L10:
movzbl  %sil, %eax
addl    %edx, %eax
cmpl    %eax, %edi
ja  .L4
movb    %r11b, -32(%rsp)
movb    %bl, -31(%rsp)
movb    %sil, -30(%rsp)
jmp .L6

Answer 1

What makes you think the generated floating-point code is vectorized? All the SSE instructions I see are -ss suffixed, i.e., Scalar-Single, not Packed-Single.

As far as the possibility of vectorizing this code goes, I don't think it's possible to vectorize the integer code with SSEx, since there are no SSE integer division instructions.

Answer 2

Sometimes hand optimisation leads to no optimisation at all. Since those three vector component update statements are essentially an unrolled loop, you would have done better to keep them as a loop, as a hint to the compiler:

for (int i = 0; i < 3; i++)
  dest[i] = std::max(source[i] - drop * source[i] / denom, dest[i]);

When in doubt whether GCC vectorised anything, make the tree vectoriser (much) more chatty with -ftree-vectorizer-verbose=7:

Integer version with loop:

$ gcc-4.7 -O3 -std=c++0x -msse4.2 -ftree-vectorizer-verbose=7 -funroll-loops -S vec_int.cpp

Analyzing loop at vec_int.cpp:13

...
13: not vectorized: relevant stmt not supported: D.46751_60 = D.46750_59 / prephitmp.65_135;

vec_int.cpp:6: note: vectorized 0 loops in function.

This simply indicates what mattst88 has already pointed out - no packed integer division instruction is available in SSE (nor in AVX).

Floating point version with loop:

$ gcc-4.7 -O3 -std=c++0x -msse4.2 -ftree-vectorizer-verbose=7 -funroll-loops -S vec_float.cpp

Analyzing loop at vec_float.cpp:9

...
9: not vectorized: iteration count too small.
vec_float.cpp:4: note: vectorized 0 loops in function.

Again, no vectorisation - too few loop iterations. No vector luck with GCC.

On the other hand the 13.0 beta version of the C++ compiler from that well-known chip vendor is able to vectorise the floating-point loop (previous versions are not able to do it) but not the integer one. Here is a sample of what assembly it produces:

# parameter 1: %rdi
# parameter 2: %xmm0 %xmm1
# parameter 3: %xmm2
movlps    %xmm0, -24(%rsp)
movss     (%rdi), %xmm9
addss     %xmm2, %xmm0
movss     %xmm1, -16(%rsp)
movss     4(%rdi), %xmm3
movss     8(%rdi), %xmm1
comiss    %xmm9, %xmm0
jae       ..B1.3        # Prob 22%
..B1.2:
lea       -16(%rsp), %rax
jmp       ..B1.6
..B1.3:
movss     -20(%rsp), %xmm0
lea       -16(%rsp), %rax
addss     %xmm2, %xmm0
comiss    %xmm3, %xmm0
jb        ..B1.6        # Prob 78%
..B1.4:
movss     (%rax), %xmm0
addss     %xmm2, %xmm0
comiss    %xmm1, %xmm0
jb        ..B1.6        # Prob 43%
..B1.5:
movsd     -24(%rsp), %xmm0
movss     -16(%rsp), %xmm1
ret
..B1.6:
movaps    %xmm2, %xmm5
maxss     %xmm1, %xmm3
movsd     (%rdi), %xmm7
maxss     %xmm3, %xmm9
movaps    %xmm9, %xmm3
; ------------------------- here starts the SIMD part
shufps    $0, %xmm3, %xmm3
rcpps     %xmm3, %xmm4
mulps     %xmm4, %xmm3
mulps     %xmm4, %xmm3
addps     %xmm4, %xmm4
shufps    $0, %xmm5, %xmm5
subps     %xmm3, %xmm4
mulps     %xmm7, %xmm5
mulps     %xmm4, %xmm5
movsd     -24(%rsp), %xmm6
subps     %xmm5, %xmm7
maxps     %xmm6, %xmm7
movlpd    %xmm7, -24(%rsp)
; ------------------------- here ends the SIMD part :)
movss     8(%rdi), %xmm8
mulss     %xmm8, %xmm2
divss     %xmm9, %xmm2
subss     %xmm2, %xmm8
maxss     (%rax), %xmm8
movss     %xmm8, (%rax)
movsd     -24(%rsp), %xmm0
movss     -16(%rsp), %xmm1
ret

(AVX code looks almost the same)

Still only two of three iterations are vectorised and there are some WTF parts in the generated code that make me scratch my head though...

Note that vectorisation doesn't come for free and sometimes carefully crafted serial code might be more efficient than the respective SIMD version.

Floating point math vectorizes, but integer math does not

Question

2 answers

solution1
12 2012-07-03 04:34:08

solution2
8 ACCEPTED 2012-07-03 10:50:19

Floating point math vectorizes, but integer math does not

Question

2 answers

solution1 12 2012-07-03 04:34:08

solution2 8 ACCEPTED 2012-07-03 10:50:19

solution1
12 2012-07-03 04:34:08

solution2
8 ACCEPTED 2012-07-03 10:50:19