Visual Studio 2012 different values Release/Debug mode

Question

This code produces different values in MSVS 2012, Windows 7, when switching between Debug and Release mode:

#include <iostream>
using namespace std;

int A[20000];

int main() {

    int shift = 0;
    int Period = 30;
    //Fill array
    for(int i = 0; i < 20000; i++) {
        A[i] = i * 2 + 123;
    }

    int sumTotal = 0;
    int sum = 0;

    for(int bars = Period + 10; bars < 1000; bars++) {
        sum = 0;
        for(int i = 0; i< Period; i++) {
            sum += A[bars - i];
        }
        sumTotal += sum;
    }
    cout << sumTotal << endl;
}

Can you reproduce or find why? I've been testing with all kind of settings on project properties.

Debug (the correct result): 32630400
Release: 32814720

/GS /GL /analyze- /W3 /Gy /Zc:wchar_t /I"C:\\Program Files (x86)\\Visual Leak Detector\\include" /Z7 /Gm- /O2 /Fd"Release\\vc110.pdb" /fp:precise /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /Oi /MD /Fa"Release\\" /EHsc /nologo /Fo"Release\\" /Fp"Release\\Testing.pch"

Answer 1

I tested the "reduced" version of the code using VS2012 C compiler

int main()
{
  int A[12] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };

  int sum = 0;
  int i;

  for (i = 0; i < 12; ++i)
     sum += A[11 - i];

  printf("%d\n", sum);

  return 0;
}

I compiled it in x64 mode Release config optimized for speed. The bug is still there, but depending on other optimization and code generation settings it reveals itself differently. One version of the code generated "random" result, while another consistently produced 8 as the sum (instead of the proper 12 ).

This is what the generated code looks like for the version that consistently produces 8

000000013FC81DF0  mov         rax,rsp  
000000013FC81DF3  sub         rsp,68h  
000000013FC81DF7  movd        xmm1,dword ptr [rax-18h]  
000000013FC81DFC  movd        xmm2,dword ptr [rax-10h]  
000000013FC81E01  movd        xmm5,dword ptr [rax-0Ch]  
000000013FC81E06  xorps       xmm0,xmm0  
000000013FC81E09  xorps       xmm3,xmm3  

for (i = 0; i < 12; ++i)
000000013FC81E0C  xor         ecx,ecx  
000000013FC81E0E  mov         dword ptr [rax-48h],1  
000000013FC81E15  mov         dword ptr [rax-44h],1  
000000013FC81E1C  mov         dword ptr [rax-40h],1  
000000013FC81E23  punpckldq   xmm2,xmm1  
000000013FC81E27  mov         dword ptr [rax-3Ch],1  
000000013FC81E2E  mov         dword ptr [rax-38h],1  
000000013FC81E35  mov         dword ptr [rax-34h],1  
{
     sum += A[11 - i];
000000013FC81E3C  movdqa      xmm4,xmmword ptr [__xmm@00000001000000010000000100000001 (013FC83360h)]  
000000013FC81E44  paddd       xmm4,xmm0  
000000013FC81E48  movd        xmm0,dword ptr [rax-14h]  
000000013FC81E4D  mov         dword ptr [rax-30h],1  
000000013FC81E54  mov         dword ptr [rax-2Ch],1  
000000013FC81E5B  mov         dword ptr [rax-28h],1  
000000013FC81E62  mov         dword ptr [rax-24h],1  
000000013FC81E69  punpckldq   xmm5,xmm0  
000000013FC81E6D  punpckldq   xmm5,xmm2  
000000013FC81E71  paddd       xmm5,xmm3  
000000013FC81E75  paddd       xmm5,xmm4  
000000013FC81E79  mov         dword ptr [rax-20h],1  
000000013FC81E80  mov         dword ptr [rax-1Ch],1  
000000013FC81E87  mov         r8d,ecx  
000000013FC81E8A  movdqa      xmm0,xmm5  
000000013FC81E8E  psrldq      xmm0,8  
000000013FC81E93  paddd       xmm5,xmm0  
000000013FC81E97  movdqa      xmm0,xmm5  
000000013FC81E9B  lea         rax,[rax-40h]  
000000013FC81E9F  mov         r9d,2  
000000013FC81EA5  psrldq      xmm0,4  
000000013FC81EAA  paddd       xmm5,xmm0  
000000013FC81EAE  movd        edx,xmm5  
000000013FC81EB2  nop         word ptr [rax+rax]  
{
     sum += A[11 - i];
000000013FC81EC0  add         ecx,dword ptr [rax+4]  
000000013FC81EC3  add         r8d,dword ptr [rax]  
000000013FC81EC6  lea         rax,[rax-8]  
000000013FC81ECA  dec         r9  
000000013FC81ECD  jne         main+0D0h (013FC81EC0h)  
}

printf("%d\n", sum);
000000013FC81ECF  lea         eax,[r8+rcx]  
000000013FC81ED3  lea         rcx,[__security_cookie_complement+8h (013FC84040h)]  
000000013FC81EDA  add         edx,eax  
000000013FC81EDC  call        qword ptr [__imp_printf (013FC83140h)]  

return 0;
000000013FC81EE2  xor         eax,eax  
}
000000013FC81EE4  add         rsp,68h  
000000013FC81EE8  ret

There's a lot of strange and seemingly unnecessary mumbo-jumbo there left over by the code generator and optimizer, but what this code does can be briefly described as follows.

There are two independent algorithms there employed to produce the final sum, which are apparently supposed to process different portions of the array. I'd guess that two processing flows (non-SSE and SSE) are used to promote parallelism through instruction pipelining.

One algorithm is a simple cycle, which sums array elements, processing two elements per iteration. It can be extracted from the above "interleaved" code as follows

; Initialization
000000013F1E1E0C  xor         ecx,ecx                 ; ecx - odd element sum
000000013F1E1E87  mov         r8d,ecx                 ; r8 - even element sum
000000013F1E1E9B  lea         rax,[rax-40h]           ; start from i = 2
000000013F1E1E9F  mov         r9d,2                   ; do 2 iterations

; The cycle
000000013F1E1EC0  add         ecx,dword ptr [rax+4]   ; ecx += A[i + 1]
000000013F1E1EC3  add         r8d,dword ptr [rax]     ; r8d += A[i]
000000013F1E1EC6  lea         rax,[rax-8]             ; i -= 2
000000013F1E1ECA  dec         r9                      
000000013F1E1ECD  jne         main+0D0h (013F1E1EC0h) ; loop again if r9 is not zero

This algorithm begins adding elements from address rax - 40h , which in my experiment was equal to &A[2] and makes two iterations backwards skipping back over two elements. This accumulates the sum of A[0] and A[2] in register r8 and sum of A[1] and A[3] in register ecx . Thus this part of the algorithm processes 4 elements of the array and properly generates values 2 in both r8 and ecx .

The other part of the algorithm is written using SSE instructions and is apparently responsible for summing the remaining portion of the array. It can be extracted from the code as follows

; Initially xmm5 is zero
000000013F1E1E3C  movdqa      xmm4,xmmword ptr [__xmm@00000001000000010000000100000001 (013F1E3360h)]  
000000013F1E1E75  paddd       xmm5,xmm4  

000000013F1E1E8A  movdqa      xmm0,xmm5               ; copy
000000013F1E1E8E  psrldq      xmm0,8                  ; shift
000000013F1E1E93  paddd       xmm5,xmm0               ; and add

000000013F1E1E8A  movdqa      xmm0,xmm5               ; copy
000000013F1E1E8E  psrldq      xmm0,4                  ; shift
000000013F1E1E93  paddd       xmm5,xmm0               ; and add

000000013F1E1EAE  movd        edx,xmm5                ; edx - the sum

The general algorithm used by that part is simple: it places value 0x00000001000000010000000100000001 in 128-bit register xmm5 , then shifts it 8 bytes to the right ( 0x00000000000000000000000100000001 ) and adds it to the original value, producing 0x00000001000000010000000200000002 . This is again shifted 4 bytes to the right ( 0x00000000000000010000000100000002 ) and added to the previous value again, producing 0x00000001000000020000000300000004 . The last 32-bit word 0x00000004 of xmm5 is taken as the result and placed into register edx . Thus this algorithm produces 4 as its final result. It is pretty obvious that this algorithm simply performs the "parallel" addition of consecutive 32-bit words in a 128-bit register. Note, BTW that this algorithm does not even attempt to access A , it starts summing from an embedded constant produced by the compiler/optimizer.

Now, in the end the value of r8 + ecx + edx is reported as the final sum. Obviously, this is only 8 , instead of the correct 12 . It looks like one of these two algorithms forgot to do some of its work. I don't know which one, but judging by the abundance of "redundant" instructions it looks like it was the SSE algorithm that was supposed to generate 8 in edx instead of 4 . One suspicious instruction is this one

000000013FC81E71  paddd       xmm5,xmm3

At that moment xmm3 always contains zero. So, this instruction looks completely redundant and unnecessary. But if xmm3 actually contained another "magic" constant representing another 4 elements of the array (just like xmm4 did), then the algorithm would work properly and would produce the proper sum.

If one uses distinctive initial values for array elements

int A[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };

it can be clearly seen that the first (non-SSE) algorithm successfully sums 1, 2, 3, 4 , while the second (SSE) algorithm sums 9, 10, 11, 12 . 5, 6, 7, 8 remain excluded from consideration, resulting in 52 as final sum instead of the correct 78 .

This is definitely a compiler/optimizer bug.

PS The same project with the same settings imported into VS2013 Update 2 does not seem to suffer from this bug.

Answer 2

I believe you've found a bug in the optimizer. You can get a release build to give the same (correct) output as a debug build by either disabling optimizations or by adding extra code with side effects that can't be optimized away (eg cout << "hi" ) inside the innermost for loop (this presumably prevents whatever optimization is being incorrectly performed otherwise). I'd suggest reporting it to Microsoft.

Update: Microsoft confirms that this is a bug related to auto-vectorization and that it has been fixed in VS2013 update 2. A workaround in other versions is to disable vectorization by prefixing the loop with #pragma loop(no_vector) .

Further, they describe two different loop constructs that can trigger the bug. I'll just quote them:

There are two cases where the bug kicks in:

1) As user burzvingion mentioned, loops that get vectorized of the form:

for (int i=0; ...) { sum = A[...] - sum; }

2) Loops that get vectorized of the form:

for (int i=0; ...) { sum = sum + A[ - i]; }

They also give the following suggestion for locating vulnerable code:

If you are looking through your source code to attempt to find these cases, I recommend starting off by throwing /Qvec-report:1 to find all the loops that got vectorized, and going from there. To workaround the bugs, put #pragma loop(no_vector) above the for-loops.

Answer 3

The code which yields an optimization error can be reduced to the following:

#include <iostream>
using namespace std;

#define SIZE 12

int main()
{
    int A[SIZE] = {0};

    int sum = 0;
    for (int i=0; i<SIZE; i++)
        sum += A[SIZE-1-i];
    cout << sum << endl;

    return 0;
}

The optimization error can be removed by applying either one of the following changes:

Change the definition of SIZE to a value lower than 12
Change the expression A[SIZE-1-i] to A[SIZE-i-1]
Move the operation cout << sum << endl into the loop

So in order to diagnose the problem, we can simply apply either one of these changes, and then compare between the disassemby of the code before the change and the disassemby of the code after the change.

Answer 4

I compared the asm code for both cases(in VC++ 2013 express), in the release build, the asm code in the release build of the for loop

for (int i = 0; i< Period; i++)

is below, and it is very different to the that in the debug build

$LL6@main:

; 23   :        sum = 0;
; 24   :        for (int i = 0; i< Period; i++){

    xorps   xmm5, xmm5
    lea eax, DWORD PTR [edi+88]
    xorps   xmm4, xmm4
    mov ecx, 3
    npad    2
$LL3@main:

; 25   :            //cout << "hi";
; 26   :            sum += A[bars - i];

    movd    xmm2, DWORD PTR [eax-4]
    lea eax, DWORD PTR [eax-32]
    movd    xmm0, DWORD PTR [eax+32]
    movd    xmm1, DWORD PTR [eax+36]
    movd    xmm3, DWORD PTR [eax+40]
    punpckldq xmm3, xmm0
    movd    xmm0, DWORD PTR [eax+48]
    punpckldq xmm1, xmm2
    movd    xmm2, DWORD PTR [eax+44]
    punpckldq xmm3, xmm1
    movd    xmm1, DWORD PTR [eax+52]
    paddd   xmm5, xmm3
    movd    xmm3, DWORD PTR [eax+56]
    punpckldq xmm3, xmm0
    punpckldq xmm1, xmm2
    punpckldq xmm3, xmm1
    paddd   xmm4, xmm3
    dec ecx
    jne SHORT $LL3@main

; 23   :        sum = 0;
; 24   :        for (int i = 0; i< Period; i++){

    paddd   xmm4, xmm5
    xor edx, edx
    movdqa  xmm0, xmm4
    mov eax, edi
    psrldq  xmm0, 8
    mov esi, 3
    paddd   xmm4, xmm0
    movdqa  xmm0, xmm4
    psrldq  xmm0, 4
    paddd   xmm4, xmm0
    movd    ebx, xmm4
    npad    7
$LL30@main:

; 25   :            //cout << "hi";
; 26   :            sum += A[bars - i];

    add ecx, DWORD PTR [eax]
    lea eax, DWORD PTR [eax-8]
    add edx, DWORD PTR [eax+4]
    dec esi
    jne SHORT $LL30@main

; 27   :    

}

As you can from the asm code, SSE instructions are used here. So I checked the compiler options for SSE instructions in VC++, then I specified /arch:IA32 to disable generation of SSE and SSE2 instructions for x86 processors in the release build, then I got the same result as the debug build.

I'm not familiar with SSE, I hope someone can explain more based on my findings.

Visual Studio 2012 different values Release/Debug mode

Question

4 answers

solution1
18 2014-06-10 17:34:42

solution2
15 2014-06-10 14:50:35

solution3
5 2014-06-10 16:43:19

solution4
4 2014-06-10 15:41:02

Visual Studio 2012 different values Release/Debug mode

Question

4 answers

solution1 18 2014-06-10 17:34:42

solution2 15 2014-06-10 14:50:35

solution3 5 2014-06-10 16:43:19

solution4 4 2014-06-10 15:41:02

solution1
18 2014-06-10 17:34:42

solution2
15 2014-06-10 14:50:35

solution3
5 2014-06-10 16:43:19

solution4
4 2014-06-10 15:41:02