为什么godbolt生成的asm输出与我在Visual Studio中得到的实际asm代码不同？

Question

Here's the code generated by godbolt . 这是godbolt生成的代码。

Here's the same code generated by Visual studio on my main.asm file (enabled by Project->C/C++->Output Files->Assembly With Source Code (/FAs) under Assembler Output field): 这是Visual Studio在我的main.asm文件中生成的相同代码（由Project-> C / C ++ - >输出文件 - >汇编源代码（/ FAs）在汇编程序输出字段下启用）：

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 

    TITLE   c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

EXTRN   __imp____std_terminate:PROC
EXTRN   @__security_check_cookie@4:PROC
EXTRN   __imp____CxxFrameHandler3:PROC
PUBLIC  ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z       ; std::less<void>::operator()<double const &,double const &>
PUBLIC  ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC  ??$clamp@N@std@@YAABNABN00@Z            ; std::clamp<double>
PUBLIC  _main
PUBLIC  ?ProcessOptimized@MyPlugin@@QAEXH@Z     ; MyPlugin::ProcessOptimized
PUBLIC  ?Process@MyPlugin@@QAEXH@Z          ; MyPlugin::Process
PUBLIC  ??1MyPlugin@@QAE@XZ             ; MyPlugin::~MyPlugin
PUBLIC  ??0MyPlugin@@QAE@XZ             ; MyPlugin::MyPlugin
PUBLIC  ?ProcessOptimized@Param@@QAEXHH@Z       ; Param::ProcessOptimized
PUBLIC  ?Process@Param@@QAEXHH@Z            ; Param::Process
PUBLIC  ??0Param@@QAE@XZ                ; Param::Param
PUBLIC  __real@3ff0000000000000
PUBLIC  __real@400921fb54442d18
PUBLIC  __real@4024000000000000
PUBLIC  __real@406fe00000000000
PUBLIC  __xmm@00000003000000020000000100000000
PUBLIC  __xmm@400921fb54442d18400921fb54442d18
PUBLIC  __xmm@406fe00000000000406fe00000000000
EXTRN   __chkstk:PROC
EXTRN   ___security_cookie:DWORD
EXTRN   __fltused:DWORD
; Read-only floating-point and integer constants referenced by the code below.
; Each symbol name encodes the constant value in hexadecimal; the DB forms
; list the same value byte by byte in little-endian order.
;   COMDAT __xmm@406fe00000000000406fe00000000000
; 16 bytes: 0x406fe00000000000 (the double 255.0) in both 64-bit lanes.
; In the DB list the ASCII letter o is 6fH and the ASCII @ sign is 40H.
CONST   SEGMENT
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
    DB  '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST   ENDS
;   COMDAT __xmm@400921fb54442d18400921fb54442d18
; 16 bytes: 0x400921fb54442d18 (the double 3.14159..., see the scalar copy
; below) in both 64-bit lanes. Used as the memory operand of minpd.
CONST   SEGMENT
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
    DB  018H, '-DT', 0fbH, '!', 09H, '@'
CONST   ENDS
;   COMDAT __xmm@00000003000000020000000100000000
; 16 bytes: the four 32-bit integers 0, 1, 2, 3 (lowest lane first).
CONST   SEGMENT
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
    DB  00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST   ENDS
;   COMDAT __real@406fe00000000000
; Scalar double constants; the trailing comment on each DQ gives the value.
CONST   SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r   ; 255
CONST   ENDS
;   COMDAT __real@4024000000000000
CONST   SEGMENT
__real@4024000000000000 DQ 04024000000000000r   ; 10
CONST   ENDS
;   COMDAT __real@400921fb54442d18
CONST   SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r   ; 3.14159
CONST   ENDS
;   COMDAT __real@3ff0000000000000
CONST   SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r   ; 1
CONST   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0Param@@QAE@XZ
; Param::Param(): default constructor. Receives this in ecx, stores five
; doubles at 16-byte strides (offsets 0, 16, 32, 48, 64) and returns this
; in eax. The same offsets and values appear in the MyPlugin constructor
; listing annotated as mPhase, mPhaseOptimized, mNoteFrequency, mHostPitch
; and mRadiansPerSample.
_TEXT   SEGMENT
??0Param@@QAE@XZ PROC                   ; Param::Param, COMDAT
; _this$ = ecx

; 23   :    Param() { }

    ; xmm0 = 0.0; eax = this (the constructor return value)
    xorps   xmm0, xmm0
    mov eax, ecx
    ; [this+0] = 0.0 and [this+16] = 0.0
    movsd   QWORD PTR [ecx], xmm0
    movsd   QWORD PTR [ecx+16], xmm0
    ; [this+32] = 10.0
    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [ecx+32], xmm0
    ; [this+48] = 1.0 and [this+64] = 1.0 (one load serves both stores)
    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [ecx+48], xmm0
    movsd   QWORD PTR [ecx+64], xmm0
    ret 0
??0Param@@QAE@XZ ENDP                   ; Param::Param
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@Param@@QAEXHH@Z
; Param::Process(voiceIndex, blockSize): scalar accumulation loop for
; main.cpp lines 25-38. this arrives in ecx; both arguments are on the stack
; and are popped by the callee (ret 8).
; Register roles inside the loop:
;   esi  = this
;   edx  = address of pC[sampleIndex]; pB[sampleIndex] is read at [edx-2048]
;   ecx  = remaining iterations, fixed at 256 (the blockSize stack slot is
;          tagged $dead$ and is never read)
;   xmm1 = phase accumulator, xmm2 = PI, xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0), xmm4 = [this+64]
; Stack temporaries: $T1 holds PI, $T2 holds 0.0, $T3 holds the unclamped
; value; std::clamp returns a reference, hence the address selection below.
_TEXT   SEGMENT
$T1 = -24                       ; size = 8
$T3 = -16                       ; size = 8
$T2 = -8                        ; size = 8
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?Process@Param@@QAEXHH@Z PROC               ; Param::Process, COMDAT
; _this$ = ecx

; 25   :    inline void Process(int voiceIndex, int blockSize) {

    ; Standard ebp frame with 24 bytes of locals for the three temporaries.
    push    ebp
    mov ebp, esp
    sub esp, 24                 ; 00000018H

; 26   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebp]
    xorps   xmm5, xmm5

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    push    esi
    mov esi, ecx
    ; eax = voiceIndex * 2048, the byte offset of one 256-double row
    shl eax, 11                 ; 0000000bH
    push    edi
    movsd   QWORD PTR $T1[ebp], xmm2
    mov ecx, 256                ; 00000100H
    movsd   QWORD PTR $T2[ebp], xmm5
    movsd   xmm3, QWORD PTR [esi+48]
    ; edx = this + 2128 + voiceIndex * 2048 (completed by the add below)
    lea edx, DWORD PTR [esi+2128]
    movsd   xmm1, QWORD PTR [esi]
    add edx, eax
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    11
$LL4@Process:
    ; xmm0 = ([edx-2048] * xmm3 + [edx]) * xmm4, then compared against PI
    movsd   xmm0, QWORD PTR [edx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [edx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2
    ; spill the value so that a pointer to it can be selected below
    movsd   QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value above PI: result is PI; otherwise go on to the lower-bound check
    jbe SHORT $LN10@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; eax = address of 0.0 by default, replaced by the address of the value
    ; when 0.0 is below or equal to it; the result is loaded through eax
    lea eax, DWORD PTR $T2[ebp]
    lea edi, DWORD PTR $T3[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    ; advance to the next double
    add edx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0
    sub ecx, 1
    jne SHORT $LL4@Process

; 35   :        }
; 36   : 
; 37   :        mPhase = phase;
; 38   :    }

    pop edi
    ; write the accumulated phase back to [this+0]
    movsd   QWORD PTR [esi], xmm1
    pop esi
    mov esp, ebp
    pop ebp
    ret 8
?Process@Param@@QAEXHH@Z ENDP               ; Param::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
; Param::ProcessOptimized(voiceIndex, blockSize): SSE2 intrinsics version
; (main.cpp lines 39-88). this arrives in ecx; the two stack arguments are
; popped by the callee (ret 8).
; The prologue realigns the stack to 16 bytes so that v_phase can be spilled
; with movaps; ebx keeps the pre-alignment esp and is used to reach the
; arguments and to restore esp in the epilogue.
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 + voiceIndex * 2048, advanced by 16 per iteration;
;          loads use [eax-2048], [eax-2040], [eax] and [eax+8]
;   ecx  = remaining iterations, fixed at 128 (the blockSize slot is tagged
;          $dead$ and is never read)
;   xmm5 = [this+48] * [this+32] * [this+64] in both lanes
;          (v_radiansPerSampleBp0)
;   xmm6 = [this+64] in both lanes (v_radiansPerSample)
;   xmm7 = 0.0 (v_boundLower); the upper bound is a memory operand of minpd
;   xmm4 / xmm1 = v_pB0 / v_pC0, xmm3 / xmm2 = v_pB1 / v_pC1
;   xmm0 = v_phase
_TEXT   SEGMENT
_v_phase$ = -16                     ; size = 16
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC          ; Param::ProcessOptimized, COMDAT
; _this$ = ecx

; 39   :    inline void ProcessOptimized(int voiceIndex, int blockSize) {

    ; Save ebx and keep the incoming esp in it, then align esp down to 16.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    ; Build an ebp frame inside the aligned area; [ebx+4] is the return
    ; address, copied next to the saved ebp (presumably so the aligned frame
    ; still looks like a conventional ebp frame to stack walkers).
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp

; 40   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebx]
    mov edx, ecx
    ; eax = voiceIndex * 2048, the byte offset of one 256-double row
    shl eax, 11                 ; 0000000bH
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    ; reserve the 16-byte aligned slot for v_phase
    sub esp, 16                 ; 00000010H
    xorps   xmm7, xmm7
    mov ecx, 128                ; 00000080H

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm5, QWORD PTR [edx+48]
    mulsd   xmm5, QWORD PTR [edx+32]

; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm6, QWORD PTR [edx+64]

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    ; Note that the _mm_load_pd calls are emitted as unaligned movups loads.
    ; unpcklpd onto a zeroed register yields (0.0, loaded value).
    movsd   xmm0, QWORD PTR [eax+edx+80]
    movups  xmm4, XMMWORD PTR [eax+edx+80]
    movups  xmm1, XMMWORD PTR [eax+edx+2128]
    mulsd   xmm5, xmm6
    unpcklpd xmm3, xmm0

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [eax+edx+2128]
    ; eax becomes this + 2136 + voiceIndex * 2048 (completed by add eax, edx)
    add eax, 2136               ; 00000858H
    unpcklpd xmm2, xmm0
    add eax, edx

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    ; unpcklpd reg, reg duplicates the low double into both lanes (set1_pd)
    movsd   xmm0, QWORD PTR [edx+16]
    unpcklpd xmm5, xmm5
    unpcklpd xmm6, xmm6
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5
    mulpd   xmm2, xmm6
    unpcklpd xmm0, xmm0
    npad    2
$LL4@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; The sums are formed in the v_pC registers (xmm1, xmm2), which are
    ; reloaded later in the iteration; loads for the next iteration are
    ; interleaved with the clamp and accumulate of the current one.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    ; clamp both sums to the range 0.0 .. PI
    maxpd   xmm1, xmm7
    maxpd   xmm2, xmm7
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm0, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm0, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    ; step two doubles forward
    add eax, 16                 ; 00000010H
    ; v_phase is written to its stack slot on every iteration
    movaps  XMMWORD PTR _v_phase$[ebp], xmm0
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm6
    sub ecx, 1
    jne SHORT $LL4@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The element index is resolved at compile time: the high lane (+8) of
    ; the spilled v_phase is read and stored to [this+16].
    movsd   xmm0, QWORD PTR _v_phase$[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 88   :    }

    ; Tear down the aligned frame, then restore the original esp from ebx.
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP          ; Param::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0MyPlugin@@QAE@XZ
; MyPlugin::MyPlugin(): receives this in ecx and returns it in eax.
; Three steps are visible: (1) the five double members at offsets 0..64 are
; initialized, (2) 256 doubles starting at this+80 are filled with
; sampleIndex / 255.0 using a loop that produces four values per iteration,
; (3) 2048 bytes starting at this+2128 are zeroed with rep stosd.
; No outer voiceIndex loop appears in the emitted code (presumably voiceSize
; is 1 in the source - confirm against main.cpp).
_TEXT   SEGMENT
??0MyPlugin@@QAE@XZ PROC                ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; xmm2 = integers (0, 1, 2, 3); xmm3 = (255.0, 255.0); edx = sampleIndex
    movaps  xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
    xorps   xmm0, xmm0
    movaps  xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
    xor edx, edx
    push    esi
    mov esi, ecx
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR [esi], xmm0

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; ecx = store cursor; it is bumped by 32 before the stores, which then
    ; use offsets -40 .. -16, so the first store lands at this+80
    lea ecx, DWORD PTR [esi+88]

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR [esi+16], xmm0

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [esi+32], xmm0

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [esi+48], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    ; First pair: broadcast edx, add (0, 1) from the low half of xmm2,
    ; convert the two integers to doubles and divide by 255.0.
    ; eax = edx + 2 is captured for the second pair before edx advances.
    movd    xmm0, edx
    lea eax, DWORD PTR [edx+2]
    pshufd  xmm1, xmm0, 0
    lea ecx, DWORD PTR [ecx+32]
    movq    xmm0, xmm2
    add edx, 4
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movlpd  QWORD PTR [ecx-40], xmm0
    movhpd  QWORD PTR [ecx-32], xmm0
    ; Second pair: same sequence starting from eax (index + 2).
    movd    xmm0, eax
    pshufd  xmm1, xmm0, 0
    movq    xmm0, xmm2
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3
    movlpd  QWORD PTR [ecx-24], xmm0
    movhpd  QWORD PTR [ecx-16], xmm0
    cmp edx, 256                ; 00000100H
    jl  SHORT $LL7@MyPlugin

; 103  :            }
; 104  :        }
; 105  : 
; 106  :        // fill c
; 107  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; The zero-fill loop is emitted as rep stosd: 512 dwords (2048 bytes)
    ; of zero written starting at this+2128.
    lea edi, DWORD PTR [esi+2128]
    xor eax, eax
    mov ecx, 512                ; 00000200H
    rep stosd

; 109  :                double value = 0.0;
; 110  : 
; 111  :                mParam1.c[voiceIndex][sampleIndex] = value;
; 112  :            }
; 113  :        }
; 114  :    }

    pop edi
    ; return this
    mov eax, esi
    pop esi
    ret 0
??0MyPlugin@@QAE@XZ ENDP                ; MyPlugin::MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??1MyPlugin@@QAE@XZ
; MyPlugin::~MyPlugin(): the destructor body is empty, so the emitted code
; is a bare return; the this pointer in ecx is tagged $dead$ (unused).
_TEXT   SEGMENT
??1MyPlugin@@QAE@XZ PROC                ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx

; 115  :    ~MyPlugin() { }

    ret 0
??1MyPlugin@@QAE@XZ ENDP                ; MyPlugin::~MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@MyPlugin@@QAEXH@Z
; MyPlugin::Process(blockSize): the body carries main.cpp line annotations
; 26-37, i.e. the Param::Process loop appears inlined here rather than
; called, which is why the same source lines show up again in this listing.
; this arrives in ecx; the single stack argument is popped by ret 4.
; Register roles inside the loop:
;   esi  = this
;   ecx  = address of pC[sampleIndex] (starts at this+2128);
;          pB[sampleIndex] is read at [ecx-2048]
;   edx  = remaining iterations, fixed at 256 (the blockSize slot is tagged
;          $dead$ and is never read)
;   xmm1 = phase accumulator, xmm2 = PI, xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0), xmm4 = [this+64]
; Stack temporaries: $T2 holds PI, $T3 holds 0.0, $T4 holds the unclamped
; value.
_TEXT   SEGMENT
$T2 = -28                       ; size = 8
$T4 = -20                       ; size = 8
$T3 = -12                       ; size = 8
_blockSize$dead$ = 8                    ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC             ; MyPlugin::Process, COMDAT
; _this$ = ecx

; 117  :    void Process(int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 28                 ; 0000001cH

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    xorps   xmm5, xmm5

; 117  :    void Process(int blockSize) {

    push    esi
    mov esi, ecx

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[ebp], xmm2

; 117  :    void Process(int blockSize) {

    push    edi

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[ebp], xmm5
    mov edx, 256                ; 00000100H
    movsd   xmm3, QWORD PTR [esi+48]

; 27   :        double *pC = c[voiceIndex];

    lea ecx, DWORD PTR [esi+2128]

; 28   :        double phase = mPhase;
; 29   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm1, QWORD PTR [esi]
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    3
$LL9@Process:

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; xmm0 = ([ecx-2048] * xmm3 + [ecx]) * xmm4, then compared against PI
    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2
    ; spill the value so that a pointer to it can be selected below
    movsd   QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value above PI: result is PI; otherwise go on to the lower-bound check
    jbe SHORT $LN15@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; eax = address of 0.0 by default, replaced by the address of the value
    ; when 0.0 is below or equal to it; the result is loaded through eax
    lea eax, DWORD PTR $T3[ebp]
    lea edi, DWORD PTR $T4[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    ; advance to the next double
    add ecx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0
    sub edx, 1
    jne SHORT $LL9@Process

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop edi

; 37   :        mPhase = phase;

    ; write the accumulated phase back to [this+0]
    movsd   QWORD PTR [esi], xmm1

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop esi
    mov esp, ebp
    pop ebp
    ret 4
?Process@MyPlugin@@QAEXH@Z ENDP             ; MyPlugin::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
; MyPlugin::ProcessOptimized(blockSize): the body carries main.cpp line
; annotations 40-87, i.e. the Param::ProcessOptimized loop appears inlined
; here rather than called. this arrives in ecx; the single stack argument is
; popped by ret 4.
; The prologue realigns the stack to 16 bytes so that v_phase can be spilled
; with movaps; ebx keeps the pre-alignment esp for the epilogue.
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136, advanced by 16 per iteration; loads use [eax-2048],
;          [eax-2040], [eax] and [eax+8]
;   ecx  = remaining iterations, fixed at 128 (the blockSize slot is tagged
;          $dead$ and is never read)
;   xmm6 = [this+48] * [this+32] * [this+64] in both lanes
;          (v_radiansPerSampleBp0)
;   xmm7 = [this+64] in both lanes (v_radiansPerSample)
;   xmm0 = 0.0 (v_boundLower); the upper bound is a memory operand of minpd
;   xmm4 / xmm1 = v_pB0 / v_pC0, xmm3 / xmm2 = v_pB1 / v_pC1
;   xmm5 = v_phase
_TEXT   SEGMENT
_v_phase$31 = -16                   ; size = 16
_blockSize$dead$ = 8                    ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC        ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx

; 122  :    void ProcessOptimized(int blockSize) {

    ; Save ebx and keep the incoming esp in it, then align esp down to 16.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    ; Build an ebp frame inside the aligned area; [ebx+4] is the return
    ; address, copied next to the saved ebp.
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp
    mov edx, ecx
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    ; reserve the 16-byte aligned slot for v_phase
    sub esp, 16                 ; 00000010H

; 40   :        double *pB = b[voiceIndex];

    mov ecx, 128                ; 00000080H
    movsd   xmm6, QWORD PTR [edx+48]
    lea eax, DWORD PTR [edx+2136]
    mulsd   xmm6, QWORD PTR [edx+32]

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;
; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm7, QWORD PTR [edx+64]

; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    ; Note that the _mm_load_pd calls are emitted as unaligned movups loads.
    ; unpcklpd onto a zeroed register yields (0.0, loaded value).
    movsd   xmm0, QWORD PTR [edx+80]
    movsd   xmm5, QWORD PTR [edx+16]
    movups  xmm4, XMMWORD PTR [edx+80]
    movups  xmm1, XMMWORD PTR [edx+2128]
    mulsd   xmm6, xmm7
    unpcklpd xmm3, xmm0

; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [edx+2128]
    ; unpcklpd reg, reg duplicates the low double into both lanes (set1_pd)
    unpcklpd xmm7, xmm7
    unpcklpd xmm6, xmm6
    unpcklpd xmm2, xmm0
    ; xmm0 is free again and becomes the 0.0 lower bound
    xorps   xmm0, xmm0

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);

    mulpd   xmm4, xmm6

; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

    mulpd   xmm1, xmm7

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);

    mulpd   xmm3, xmm6

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7

; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    unpcklpd xmm5, xmm5
    npad    13
$LL9@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; The sums are formed in the v_pC registers (xmm1, xmm2), which are
    ; reloaded later in the iteration; loads for the next iteration are
    ; interleaved with the clamp and accumulate of the current one.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    ; clamp both sums to the range 0.0 .. PI
    maxpd   xmm1, xmm0
    maxpd   xmm2, xmm0
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm5, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm5, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    ; step two doubles forward
    add eax, 16                 ; 00000010H
    ; v_phase is written to its stack slot on every iteration
    movaps  XMMWORD PTR _v_phase$31[ebp], xmm5
    mulpd   xmm4, xmm6
    mulpd   xmm1, xmm7
    mulpd   xmm3, xmm6

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7
    sub ecx, 1
    jne SHORT $LL9@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The element index is resolved at compile time: the high lane (+8) of
    ; the spilled v_phase is read and stored to [this+16].
    movsd   xmm0, QWORD PTR _v_phase$31[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 123  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124  :            mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125  :        }
; 126  :    }

    ; Tear down the aligned frame, then restore the original esp from ebx.
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP        ; MyPlugin::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT _main
_TEXT   SEGMENT
_counterProcessing$1$ = -4304               ; size = 4
_counterProcessing$ = -4304             ; size = 8
_bp0$1$ = -4296                     ; size = 8
_v_radiansPerSample$1$ = -4288              ; size = 16
$T3 = -4264                     ; size = 8
_v_phase$38 = -4256                 ; size = 16
$T4 = -4256                     ; size = 8
$T2 = -4232                     ; size = 8
tv1040 = -4224                      ; size = 16
tv1039 = -4208                      ; size = 16
_myPlugin$ = -4192                  ; size = 4176
__$ArrayPad$ = -4                   ; size = 4
_main   PROC                        ; COMDAT

; 129  : int main() {

    push    ebp
    mov ebp, esp
    and esp, -16                ; fffffff0H
    mov eax, 4312               ; 000010d8H
    call    __chkstk
    mov eax, DWORD PTR ___security_cookie
    xor eax, esp
    mov DWORD PTR __$ArrayPad$[esp+4312], eax

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR _myPlugin$[esp+4392]
    movsd   xmm1, QWORD PTR __real@406fe00000000000
    xorps   xmm2, xmm2

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   QWORD PTR _myPlugin$[esp+4344], xmm0

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000

; 129  : int main() {

    push    esi
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4320], xmm2

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4336], xmm2

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4368], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
    movd    xmm0, eax

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR [ecx+8]

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    cvtdq2pd xmm0, xmm0
    inc eax
    divsd   xmm0, xmm1

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movsd   QWORD PTR [ecx-8], xmm0
    cmp eax, 256                ; 00000100H
    jl  SHORT $LL11@main

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm6, QWORD PTR __real@400921fb54442d18

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea edi, DWORD PTR _myPlugin$[esp+6448]
    mov ecx, 512                ; 00000200H

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[esp+4320], xmm6

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[esp+4320], xmm2

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    rep stosd
    movsd   xmm3, QWORD PTR _myPlugin$[esp+4352]
    xorps   xmm0, xmm0
    mulsd   xmm3, QWORD PTR _myPlugin$[esp+4368]

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movaps  xmm4, xmm2
    movsd   xmm1, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm5, QWORD PTR _myPlugin$[esp+4336]

; 130  :    MyPlugin myPlugin;
; 131  : 
; 132  :    long long numProcessing = 5;
; 133  :    long long counterProcessing = 0;

    movlpd  QWORD PTR _counterProcessing$[esp+4320], xmm0

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR _myPlugin$[esp+4400]
    movaps  xmm7, xmm3
    mulsd   xmm7, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    mov edi, DWORD PTR _counterProcessing$[esp+4324]
    mov esi, DWORD PTR _counterProcessing$[esp+4320]
    unpcklpd xmm4, xmm0
    movsd   xmm0, QWORD PTR _myPlugin$[esp+6448]
    movups  XMMWORD PTR tv1040[esp+4320], xmm4
    movaps  xmm4, xmm2
    unpcklpd xmm1, xmm1
    unpcklpd xmm4, xmm0
    movups  XMMWORD PTR tv1039[esp+4320], xmm4
    movsd   xmm4, QWORD PTR _myPlugin$[esp+4320]
    movsd   QWORD PTR _bp0$1$[esp+4320], xmm3
    unpcklpd xmm7, xmm7
    movaps  XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
    npad    8
$LL2@main:

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    add esi, 1

; 26   :        double *pB = b[voiceIndex];

    lea ecx, DWORD PTR _myPlugin$[esp+6448]

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    mov DWORD PTR _counterProcessing$1$[esp+4320], esi

; 26   :        double *pB = b[voiceIndex];

    mov edx, 256                ; 00000100H

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    adc edi, 0
    npad    10
$LL29@main:

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, QWORD PTR _myPlugin$[esp+4384]
    comisd  xmm0, xmm6
    movsd   QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    jbe SHORT $LN35@main
    movaps  xmm0, xmm6
    jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    lea eax, DWORD PTR $T3[esp+4320]
    lea esi, DWORD PTR $T4[esp+4320]
    cmovbe  eax, esi
    movsd   xmm0, QWORD PTR [eax]

// ...

(Note: I've removed some lines because Stack Overflow limits the post length.) （注意：由于Stack Overflow限制了帖子长度，我删除了一些行。）

It's pretty different. 差别相当大。 Also, I see that the code generated by VS is a bit redundant, i.e. if you search for the string phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); there are many occurrences. 另外，我发现VS生成的代码有些冗余：搜索字符串phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);会发现它出现了很多次。

Which settings am I missing? 我遗漏了哪些设置？ I've matched the same MSVC version (19.15) on an x86 build, and also entered the optimization flags I actually use. 我在x86构建上匹配了相同的MSVC版本（19.15），并且填入了我实际使用的优化选项。

Answer 1

It doesn't seem that you're using the same compiler flags. 您似乎没有使用相同的编译器标志。 The assembly dump from Visual Studio shows that each function was optimized with the flags /Ogtp, which are used internally when you specify /Og on the command line. Visual Studio的汇编转储显示每个函数都是使用标志/Ogtp优化的，这些标志是在命令行中指定/Og时在内部使用的。 On the other hand, in the godbolt version, you used /Ot /O2, which internally correspond to /Ogtpy. 另一方面，在godbolt版本中，您使用了/Ot /O2，它们在内部对应于/Ogtpy。 If I manually add the /Oy flag, the code becomes slightly different, but still not the same as the one generated by Visual Studio. 如果我手动添加/Oy标志，代码会略有不同，但仍然与Visual Studio生成的代码不同。

I realize that the compiler versions are not exactly the same, but the difference between 19.15.26726.0 and 19.15.26732.1 is very minor and probably only includes bug fixes. 我意识到编译器版本并不完全相同，但19.15.26726.0和19.15.26732.1之间的区别非常小，可能只包括错误修复。 I think there are other flags that are different. 我认为还有其他标志不同。 You can go to the Property Pages of your project and find all the compiler options that have been used in the "All Options" and "Additional Options" panes. 您可以转到项目的属性页，在“所有选项”和“其他选项”窗格中找到已使用的所有编译器选项。 In the Release build, many options are used other than /arch:SSE2 /Ot /O2. 在发布版本中，除了/arch:SSE2 /Ot /O2之外还使用了许多其他选项。 Note that /arch:SSE2 is the default, so you don't have to explicitly specify it. 请注意/arch:SSE2是默认值，因此您不必明确指定它。 Also, /O2 implies /Ot. 此外，/O2隐含了/Ot。 So /arch:SSE2 /Ot /O2 is equivalent to /O2. 所以/arch:SSE2 /Ot /O2相当于/O2。

Answer 2

There are multiple paths to a destination. 目的地有多条路径。

Roger Orr gave a good talk at an ACCU conference on the quirks of compiling. Roger Orr 在ACCU会议上就编译的怪癖发表了精彩的演讲。 For example, a simple "hello world" will generate 98 lines of asm in GCC but 6,704 in MSVC. 例如，一个简单的“hello world”将在GCC中生成98行asm，但在MSVC中生成6,704行。

To quickly and simply answer your question: in your Godbolt link the version is 19.15.26726.0 and your main.asm file 19.15.26732.1 快速简单地回答你的问题：在你的Godbolt链接中，版本是19.15.26726.0和你的main.asm文件19.15.26732.1

Close, but maybe different enough to cause this difference? 很接近，但也许这点差别已足以造成这种差异？

19.15.26726.0

19.15.26732.1

MSVC is particularly weird, you can output asm with GCC then use that asm to pass through GCC again and get the same machine code. MSVC特别奇怪，您可以使用GCC输出asm然后再使用asm再次通过GCC并获得相同的机器代码。 You can't in MSVC. 你不能在MSVC。 So maybe if the versions were exactly the same, you would still get different asm, that'd be a fun experiment to run, this article shows you how to run two different versions of MSVC side by side in Visual Studio. 因此，如果版本完全相同，你仍然会得到不同的asm，这将是一个有趣的实验，本文将向您展示如何在Visual Studio中并排运行两个不同版本的MSVC。

为什么godbolt生成不同的asm输出而不是我在Visual Studio中的实际asm代码？

问题描述

2 个解决方案

解决方案1
2 2019-07-21 06:18:51

解决方案2
0 2019-07-20 20:56:02

为什么godbolt生成不同的asm输出而不是我在Visual Studio中的实际asm代码？

问题描述

2 个解决方案

解决方案1 2 2019-07-21 06:18:51

解决方案2 0 2019-07-20 20:56:02

解决方案1
2 2019-07-21 06:18:51

解决方案2
0 2019-07-20 20:56:02