[英]Why does godbolt generate different asm output than my actual asm code in Visual Studio?
Here's the code generated by godbolt . 这是godbolt生成的代码。
Here's the same code generated by Visual Studio in my main.asm file (enabled by Project->C/C++->Output Files->Assembly With Source Code (/FAs) under the Assembler Output field): 这是Visual Studio在我的main.asm文件中生成的相同代码(通过 Project->C/C++->输出文件->汇编程序输出 字段下的“带源代码的程序集 (/FAs)”启用):
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1
TITLE c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
EXTRN __imp____std_terminate:PROC
EXTRN @__security_check_cookie@4:PROC
EXTRN __imp____CxxFrameHandler3:PROC
PUBLIC ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z ; std::less<void>::operator()<double const &,double const &>
PUBLIC ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC ??$clamp@N@std@@YAABNABN00@Z ; std::clamp<double>
PUBLIC _main
PUBLIC ?ProcessOptimized@MyPlugin@@QAEXH@Z ; MyPlugin::ProcessOptimized
PUBLIC ?Process@MyPlugin@@QAEXH@Z ; MyPlugin::Process
PUBLIC ??1MyPlugin@@QAE@XZ ; MyPlugin::~MyPlugin
PUBLIC ??0MyPlugin@@QAE@XZ ; MyPlugin::MyPlugin
PUBLIC ?ProcessOptimized@Param@@QAEXHH@Z ; Param::ProcessOptimized
PUBLIC ?Process@Param@@QAEXHH@Z ; Param::Process
PUBLIC ??0Param@@QAE@XZ ; Param::Param
PUBLIC __real@3ff0000000000000
PUBLIC __real@400921fb54442d18
PUBLIC __real@4024000000000000
PUBLIC __real@406fe00000000000
PUBLIC __xmm@00000003000000020000000100000000
PUBLIC __xmm@400921fb54442d18400921fb54442d18
PUBLIC __xmm@406fe00000000000406fe00000000000
EXTRN __chkstk:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; COMDAT __xmm@406fe00000000000406fe00000000000
CONST SEGMENT
; 16-byte constant: two packed doubles, each with bit pattern 406fe00000000000H (255.0).
; Bytes are listed least-significant first; printable bytes appear as characters
; in the listing ('o' = 6fH, '@' = 40H).
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
DB '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST ENDS
; COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST SEGMENT
; 16-byte constant: two packed doubles, each with bit pattern 400921fb54442d18H (PI).
; '-DT' = 2dH 44H 54H and '!' = 21H, again least-significant byte first.
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
DB 018H, '-DT', 0fbH, '!', 09H, '@'
CONST ENDS
; COMDAT __xmm@00000003000000020000000100000000
CONST SEGMENT
; 16-byte constant: four packed 32-bit integers 0, 1, 2, 3 (lowest lane first).
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
DB 00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST ENDS
; Scalar double constants. The trailing 'r' marks the hex digits as a raw
; floating-point encoding; the compiler's own comment gives the decimal value.
; COMDAT __real@406fe00000000000
CONST SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r ; 255
CONST ENDS
; COMDAT __real@4024000000000000
CONST SEGMENT
__real@4024000000000000 DQ 04024000000000000r ; 10
CONST ENDS
; COMDAT __real@400921fb54442d18
CONST SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r ; 3.14159
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
CONST ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0Param@@QAE@XZ
_TEXT SEGMENT
; Param::Param() -- default constructor.
; In:  ecx = this
; Out: eax = this
; Writes five doubles at 16-byte strides from this: 0.0 at +0 and +16,
; 10.0 at +32, 1.0 at +48 and +64. Only xmm0 and eax are modified.
??0Param@@QAE@XZ PROC ; Param::Param, COMDAT
; _this$ = ecx
; 23 : Param() { }
xorps xmm0, xmm0 ; xmm0 = 0.0
mov eax, ecx ; return value = this
movsd QWORD PTR [ecx], xmm0
movsd QWORD PTR [ecx+16], xmm0
movsd xmm0, QWORD PTR __real@4024000000000000 ; 10.0
movsd QWORD PTR [ecx+32], xmm0
movsd xmm0, QWORD PTR __real@3ff0000000000000 ; 1.0, reused for the next two stores
movsd QWORD PTR [ecx+48], xmm0
movsd QWORD PTR [ecx+64], xmm0
ret 0
??0Param@@QAE@XZ ENDP ; Param::Param
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; In:  ecx = this, [ebp+8] = voiceIndex, [ebp+12] = blockSize (marked dead: never read)
; Out: nothing; stores the accumulated phase back to [this]. Callee pops 8 bytes (ret 8).
; The loop count is the constant 256 rather than blockSize.
; Stack temporaries (operands of the clamp, which is resolved through pointers):
;   $T1 = copy of PI (upper bound), $T2 = copy of 0.0 (lower bound), $T3 = value being clamped
; Register roles inside the loop:
;   esi = this            edx = &c-row element ([edx-2048] is the matching b-row element)
;   ecx = iterations left xmm1 = phase accumulator
;   xmm2 = PI             xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0)
;   xmm4 = [this+64] (mRadiansPerSample)
$T1 = -24 ; size = 8
$T3 = -16 ; size = 8
$T2 = -8 ; size = 8
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?Process@Param@@QAEXHH@Z PROC ; Param::Process, COMDAT
; _this$ = ecx
; 25 : inline void Process(int voiceIndex, int blockSize) {
push ebp
mov ebp, esp
sub esp, 24 ; 00000018H
; 26 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebp]
xorps xmm5, xmm5 ; xmm5 = 0.0 (lower clamp bound)
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
push esi
mov esi, ecx
; voiceIndex * 2048 = byte offset of one row of 256 doubles
shl eax, 11 ; 0000000bH
push edi
movsd QWORD PTR $T1[ebp], xmm2
mov ecx, 256 ; 00000100H
movsd QWORD PTR $T2[ebp], xmm5
movsd xmm3, QWORD PTR [esi+48]
; edx = this + 2128 + voiceIndex*2048 (completed by the add below)
lea edx, DWORD PTR [esi+2128]
movsd xmm1, QWORD PTR [esi] ; phase = value currently stored at [this]
add edx, eax
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
; npad: padding ahead of the loop label (presumably the NOP-padding macro from listing.inc)
npad 11
$LL4@Process:
; xmm0 = ([edx-2048] * bp0 + [edx]) * mRadiansPerSample
movsd xmm0, QWORD PTR [edx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [edx]
mulsd xmm0, xmm4
comisd xmm0, xmm2 ; compare value with PI
movsd QWORD PTR $T3[ebp], xmm0 ; spill value so it can be selected by address below
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN10@Process
; value > PI: result is PI
movaps xmm0, xmm2
jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm5, xmm0 ; compare 0.0 with value
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &$T2 (0.0) by default; replaced by &$T3 (value) when 0.0 <= value
lea eax, DWORD PTR $T2[ebp]
lea edi, DWORD PTR $T3[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
add edx, 8 ; advance one double
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub ecx, 1
jne SHORT $LL4@Process
; 35 : }
; 36 :
; 37 : mPhase = phase;
; 38 : }
pop edi
movsd QWORD PTR [esi], xmm1
pop esi
mov esp, ebp
pop ebp
ret 8
?Process@Param@@QAEXHH@Z ENDP ; Param::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 intrinsic version.
; In:  ecx = this, [ebx+8] = voiceIndex, [ebx+12] = blockSize (marked dead: never read)
; Out: nothing; stores the high lane of v_phase to [this+16]. Callee pops 8 bytes (ret 8).
; The loop count is the constant 128 (two doubles per iteration).
; Frame: ebx addresses the incoming arguments, esp is realigned to 16 bytes, and
; ebp addresses the 16-byte local _v_phase$.
; Register roles inside the loop:
;   edx = this             eax = this + 2136 + voiceIndex*2048, advanced 16 per iteration
;   ecx = iterations left  xmm0 = v_phase
;   xmm4 = v_pB0           xmm1 = v_pC0 / v_phaseAcc1
;   xmm3 = v_pB1           xmm2 = v_pC1 / v_phaseAcc2
;   xmm5 = broadcast of mRadiansPerSample * bp0
;   xmm6 = broadcast of mRadiansPerSample
;   xmm7 = 0.0 (lower bound); the upper bound PI is read from memory by minpd
_v_phase$ = -16 ; size = 16
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC ; Param::ProcessOptimized, COMDAT
; _this$ = ecx
; 39 : inline void ProcessOptimized(int voiceIndex, int blockSize) {
; Realigning prologue: keep the original esp in ebx, then align esp to 16 bytes.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
; [ebx+4] is the return address; it is copied next to the saved ebp
; (presumably so the ebp chain still looks like an ordinary frame).
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
; 40 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebx]
mov edx, ecx
; voiceIndex * 2048 = byte offset of one row of 256 doubles
shl eax, 11 ; 0000000bH
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
xorps xmm7, xmm7
mov ecx, 128 ; 00000080H
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm5, QWORD PTR [edx+48]
mulsd xmm5, QWORD PTR [edx+32]
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm6, QWORD PTR [edx+64]
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
; Note: the _mm_load_pd calls above are emitted as unaligned loads (movups).
movsd xmm0, QWORD PTR [eax+edx+80]
movups xmm4, XMMWORD PTR [eax+edx+80]
movups xmm1, XMMWORD PTR [eax+edx+2128]
mulsd xmm5, xmm6
unpcklpd xmm3, xmm0 ; xmm3 = (0.0, pB[0])
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [eax+edx+2128]
; eax becomes this + 2136 + voiceIndex*2048, i.e. 8 bytes past the start of the c row
add eax, 2136 ; 00000858H
unpcklpd xmm2, xmm0 ; xmm2 = (0.0, pC[0])
add eax, edx
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
movsd xmm0, QWORD PTR [edx+16]
unpcklpd xmm5, xmm5 ; broadcast low lane to both lanes
unpcklpd xmm6, xmm6
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
mulpd xmm2, xmm6
unpcklpd xmm0, xmm0
; npad: padding ahead of the loop label (presumably the NOP-padding macro from listing.inc)
npad 2
$LL4@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the next iteration are interleaved with this iteration's clamp and
; accumulate: [eax-2040] is pB+2, [eax-2048] is pB+1, [eax+8] is pC+2, [eax] is pC+1.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
maxpd xmm1, xmm7
maxpd xmm2, xmm7
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm0, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm0, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is written to its stack slot on every iteration; the slot is read back after the loop
movaps XMMWORD PTR _v_phase$[ebp], xmm0
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm6
sub ecx, 1
jne SHORT $LL4@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; Only the high lane (+8) is read: the lane selection has been resolved at compile time.
movsd xmm0, QWORD PTR _v_phase$[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 88 : }
; Undo the realigned frame: restore ebp, then the original esp from ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP ; Param::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::MyPlugin() -- constructor with the member initialisers and both fill loops inlined.
; In:  ecx = this
; Out: eax = this
; Steps: (1) store the five double members at this+0/+16/+32/+48/+64,
;        (2) fill 256 doubles starting at this+80 with sampleIndex / 255.0, four per iteration,
;        (3) zero 2048 bytes starting at this+2128 with rep stosd.
; No outer per-voice loop is emitted, which suggests voiceSize is 1 in this build -- confirm against main.cpp.
; Register roles in the fill loop:
;   esi = this   edx = sampleIndex (steps by 4)   ecx = store cursor (steps by 32 bytes)
;   xmm2 = packed ints {0,1,2,3}   xmm3 = packed doubles {255.0, 255.0}
??0MyPlugin@@QAE@XZ PROC ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
movaps xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
xorps xmm0, xmm0
movaps xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
xor edx, edx
push esi
mov esi, ecx
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR [esi], xmm0
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; The cursor starts at this+88 and is advanced by 32 before the stores,
; so the first store ([ecx-40]) lands at this+80.
lea ecx, DWORD PTR [esi+88]
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR [esi+16], xmm0
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [esi+32], xmm0
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [esi+48], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
; First pair: broadcast edx, add {0,1}, convert the two low ints to doubles, divide by 255.0.
movd xmm0, edx
lea eax, DWORD PTR [edx+2] ; eax = index of the second pair, taken before edx is advanced
pshufd xmm1, xmm0, 0
lea ecx, DWORD PTR [ecx+32]
movq xmm0, xmm2 ; low 64 bits of xmm2 = ints {0,1}; upper half cleared
add edx, 4
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movlpd QWORD PTR [ecx-40], xmm0
movhpd QWORD PTR [ecx-32], xmm0
; Second pair: same sequence starting from eax (= previous edx + 2).
movd xmm0, eax
pshufd xmm1, xmm0, 0
movq xmm0, xmm2
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
movlpd QWORD PTR [ecx-24], xmm0
movhpd QWORD PTR [ecx-16], xmm0
cmp edx, 256 ; 00000100H
jl SHORT $LL7@MyPlugin
; 103 : }
; 104 : }
; 105 :
; 106 : // fill c
; 107 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; The zero-fill loop is replaced by a block store: 512 dwords (2048 bytes) of zero at this+2128.
lea edi, DWORD PTR [esi+2128]
xor eax, eax
mov ecx, 512 ; 00000200H
rep stosd
; 109 : double value = 0.0;
; 110 :
; 111 : mParam1.c[voiceIndex][sampleIndex] = value;
; 112 : }
; 113 : }
; 114 : }
pop edi
mov eax, esi ; return value = this
pop esi
ret 0
??0MyPlugin@@QAE@XZ ENDP ; MyPlugin::MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??1MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor: returns immediately without reading 'this'
; (the compiler marks _this$ as dead).
??1MyPlugin@@QAE@XZ PROC ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx
; 115 : ~MyPlugin() { }
ret 0
??1MyPlugin@@QAE@XZ ENDP ; MyPlugin::~MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::Process(int blockSize) -- the body of Param::Process is inlined here
; (the interleaved source-line annotations 26-37 come from that function); no call is emitted.
; In:  ecx = this, [ebp+8] = blockSize (marked dead: never read)
; Out: nothing; stores the accumulated phase back to [this]. Callee pops 4 bytes (ret 4).
; The loop count is the constant 256, and there is no per-voice loop or offset.
; Stack temporaries (operands of the clamp, which is resolved through pointers):
;   $T2 = copy of PI (upper bound), $T3 = copy of 0.0 (lower bound), $T4 = value being clamped
; Register roles inside the loop:
;   esi = this            ecx = cursor starting at this+2128 ([ecx-2048] is the paired element)
;   edx = iterations left xmm1 = phase accumulator
;   xmm2 = PI             xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0)
;   xmm4 = [this+64] (mRadiansPerSample)
$T2 = -28 ; size = 8
$T4 = -20 ; size = 8
$T3 = -12 ; size = 8
_blockSize$dead$ = 8 ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC ; MyPlugin::Process, COMDAT
; _this$ = ecx
; 117 : void Process(int blockSize) {
push ebp
mov ebp, esp
sub esp, 28 ; 0000001cH
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
xorps xmm5, xmm5 ; xmm5 = 0.0 (lower clamp bound)
; 117 : void Process(int blockSize) {
push esi
mov esi, ecx
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[ebp], xmm2
; 117 : void Process(int blockSize) {
push edi
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[ebp], xmm5
mov edx, 256 ; 00000100H
movsd xmm3, QWORD PTR [esi+48]
; 27 : double *pC = c[voiceIndex];
lea ecx, DWORD PTR [esi+2128]
; 28 : double phase = mPhase;
; 29 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm1, QWORD PTR [esi]
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
; npad: padding ahead of the loop label (presumably the NOP-padding macro from listing.inc)
npad 3
$LL9@Process:
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm0 = ([ecx-2048] * bp0 + [ecx]) * mRadiansPerSample
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, xmm4
comisd xmm0, xmm2 ; compare value with PI
movsd QWORD PTR $T4[ebp], xmm0 ; spill value so it can be selected by address below
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN15@Process
; value > PI: result is PI
movaps xmm0, xmm2
jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm5, xmm0 ; compare 0.0 with value
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &$T3 (0.0) by default; replaced by &$T4 (value) when 0.0 <= value
lea eax, DWORD PTR $T3[ebp]
lea edi, DWORD PTR $T4[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
add ecx, 8 ; advance one double
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub edx, 1
jne SHORT $LL9@Process
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop edi
; 37 : mPhase = phase;
movsd QWORD PTR [esi], xmm1
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop esi
mov esp, ebp
pop ebp
ret 4
?Process@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the body of Param::ProcessOptimized is inlined here
; (the interleaved source-line annotations 40-87 come from that function); no call is emitted.
; In:  ecx = this, blockSize on the stack (marked dead: never read)
; Out: nothing; stores the high lane of v_phase to [this+16]. Callee pops 4 bytes (ret 4).
; The loop count is the constant 128 (two doubles per iteration).
; Frame: ebx holds the entry esp, esp is realigned to 16 bytes, and ebp addresses
; the 16-byte local _v_phase$31.
; Register roles inside the loop:
;   edx = this             eax = this + 2136, advanced 16 per iteration
;   ecx = iterations left  xmm5 = v_phase
;   xmm4 = v_pB0           xmm1 = v_pC0 / v_phaseAcc1
;   xmm3 = v_pB1           xmm2 = v_pC1 / v_phaseAcc2
;   xmm6 = broadcast of mRadiansPerSample * bp0
;   xmm7 = broadcast of mRadiansPerSample
;   xmm0 = 0.0 (lower bound); the upper bound PI is read from memory by minpd
_v_phase$31 = -16 ; size = 16
_blockSize$dead$ = 8 ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx
; 122 : void ProcessOptimized(int blockSize) {
; Realigning prologue: keep the original esp in ebx, then align esp to 16 bytes.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
; [ebx+4] is the return address; it is copied next to the saved ebp
; (presumably so the ebp chain still looks like an ordinary frame).
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
mov edx, ecx
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
; 40 : double *pB = b[voiceIndex];
mov ecx, 128 ; 00000080H
movsd xmm6, QWORD PTR [edx+48]
; eax = this + 2136, i.e. 8 bytes past this+2128
lea eax, DWORD PTR [edx+2136]
mulsd xmm6, QWORD PTR [edx+32]
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm7, QWORD PTR [edx+64]
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR [edx+80]
movsd xmm5, QWORD PTR [edx+16]
; Note: the source's _mm_load_pd calls are emitted as unaligned loads (movups).
movups xmm4, XMMWORD PTR [edx+80]
movups xmm1, XMMWORD PTR [edx+2128]
mulsd xmm6, xmm7
unpcklpd xmm3, xmm0 ; xmm3 = (0.0, pB[0])
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [edx+2128]
unpcklpd xmm7, xmm7 ; broadcast low lane to both lanes
unpcklpd xmm6, xmm6
unpcklpd xmm2, xmm0 ; xmm2 = (0.0, pC[0])
xorps xmm0, xmm0 ; xmm0 = 0.0 from here on (lower bound)
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
mulpd xmm4, xmm6
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
mulpd xmm1, xmm7
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
mulpd xmm3, xmm6
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
unpcklpd xmm5, xmm5
; npad: padding ahead of the loop label (presumably the NOP-padding macro from listing.inc)
npad 13
$LL9@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the next iteration are interleaved with this iteration's clamp and
; accumulate: [eax-2040] is pB+2, [eax-2048] is pB+1, [eax+8] is pC+2, [eax] is pC+1.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
maxpd xmm1, xmm0
maxpd xmm2, xmm0
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm5, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm5, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is written to its stack slot on every iteration; the slot is read back after the loop
movaps XMMWORD PTR _v_phase$31[ebp], xmm5
mulpd xmm4, xmm6
mulpd xmm1, xmm7
mulpd xmm3, xmm6
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
sub ecx, 1
jne SHORT $LL9@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; Only the high lane (+8) is read: the lane selection has been resolved at compile time.
movsd xmm0, QWORD PTR _v_phase$31[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 123 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124 : mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125 : }
; 126 : }
; Undo the realigned frame: restore ebp, then the original esp from ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT _main
_TEXT SEGMENT
_counterProcessing$1$ = -4304 ; size = 4
_counterProcessing$ = -4304 ; size = 8
_bp0$1$ = -4296 ; size = 8
_v_radiansPerSample$1$ = -4288 ; size = 16
$T3 = -4264 ; size = 8
_v_phase$38 = -4256 ; size = 16
$T4 = -4256 ; size = 8
$T2 = -4232 ; size = 8
tv1040 = -4224 ; size = 16
tv1039 = -4208 ; size = 16
_myPlugin$ = -4192 ; size = 4176
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; 129 : int main() {
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
mov eax, 4312 ; 000010d8H
call __chkstk
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+4312], eax
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR _myPlugin$[esp+4392]
movsd xmm1, QWORD PTR __real@406fe00000000000
xorps xmm2, xmm2
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd QWORD PTR _myPlugin$[esp+4344], xmm0
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
; 129 : int main() {
push esi
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR _myPlugin$[esp+4320], xmm2
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR _myPlugin$[esp+4336], xmm2
; 17 : alignas(16) double mHostPitch = 1.0;
movsd QWORD PTR _myPlugin$[esp+4368], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
movd xmm0, eax
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR [ecx+8]
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
cvtdq2pd xmm0, xmm0
inc eax
divsd xmm0, xmm1
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movsd QWORD PTR [ecx-8], xmm0
cmp eax, 256 ; 00000100H
jl SHORT $LL11@main
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm6, QWORD PTR __real@400921fb54442d18
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea edi, DWORD PTR _myPlugin$[esp+6448]
mov ecx, 512 ; 00000200H
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[esp+4320], xmm6
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[esp+4320], xmm2
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
rep stosd
movsd xmm3, QWORD PTR _myPlugin$[esp+4352]
xorps xmm0, xmm0
mulsd xmm3, QWORD PTR _myPlugin$[esp+4368]
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movaps xmm4, xmm2
movsd xmm1, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm5, QWORD PTR _myPlugin$[esp+4336]
; 130 : MyPlugin myPlugin;
; 131 :
; 132 : long long numProcessing = 5;
; 133 : long long counterProcessing = 0;
movlpd QWORD PTR _counterProcessing$[esp+4320], xmm0
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR _myPlugin$[esp+4400]
movaps xmm7, xmm3
mulsd xmm7, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
mov edi, DWORD PTR _counterProcessing$[esp+4324]
mov esi, DWORD PTR _counterProcessing$[esp+4320]
unpcklpd xmm4, xmm0
movsd xmm0, QWORD PTR _myPlugin$[esp+6448]
movups XMMWORD PTR tv1040[esp+4320], xmm4
movaps xmm4, xmm2
unpcklpd xmm1, xmm1
unpcklpd xmm4, xmm0
movups XMMWORD PTR tv1039[esp+4320], xmm4
movsd xmm4, QWORD PTR _myPlugin$[esp+4320]
movsd QWORD PTR _bp0$1$[esp+4320], xmm3
unpcklpd xmm7, xmm7
movaps XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
npad 8
$LL2@main:
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
add esi, 1
; 26 : double *pB = b[voiceIndex];
lea ecx, DWORD PTR _myPlugin$[esp+6448]
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
mov DWORD PTR _counterProcessing$1$[esp+4320], esi
; 26 : double *pB = b[voiceIndex];
mov edx, 256 ; 00000100H
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
adc edi, 0
npad 10
$LL29@main:
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, QWORD PTR _myPlugin$[esp+4384]
comisd xmm0, xmm6
movsd QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN35@main
movaps xmm0, xmm6
jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
lea eax, DWORD PTR $T3[esp+4320]
lea esi, DWORD PTR $T4[esp+4320]
cmovbe eax, esi
movsd xmm0, QWORD PTR [eax]
// ...
(Note: I've removed some lines because Stack Overflow limits the post length.) (注意:我删除了一些行,因为Stack Overflow限制了帖子长度。)
It's pretty different. 它非常不同。 Also, I see the code generated by VS is a bit redundant, i.e. search for the string
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
另外,我看到VS生成的代码有点多余,即搜索字符串
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
: there are many. : 有许多。
Which settings am I missing? 我错过了哪些设置? I've matched the same MSVC version (19.15), on an x86 build, and also set the same optimization flags I actually use.
我在X86版本上匹配了相同的MSVC版本(19.15),同时也放置了我的实际优化。
It doesn't seem that you're using the same compiler flags. 您似乎没有使用相同的编译器标志。 The assembly dump from Visual Studio shows that each function was optimized with the flags
/Ogtp
, which are used internally when you specify /Og
in the command line. Visual Studio中的程序集转储显示每个函数都使用flags
/Ogtp
进行了优化,这些内容在命令行中指定/Og
在内部使用。 On the other hand, in the godbolt version, you used /Ot /O2
, which internally correspond to /Ogtpy
. 另一方面,在godbolt版本中,您使用了
/Ot /O2
,它在内部对应于/Ogtpy
。 If I manually add the /Oy
flag, the code becomes slightly different, but still not the same as the one generated by Visual Studio. 如果我手动添加
/Oy
标志,代码会略有不同,但仍然与Visual Studio生成的代码不同。
I realize that the compiler versions are not exactly the same, but the difference between 19.15.26726.0 and 19.15.26732.1 is very minor and probably only includes bug fixes. 我意识到编译器版本并不完全相同,但19.15.26726.0和19.15.26732.1之间的区别非常小,可能只包括错误修复。 I think there are other flags that are different.
我认为还有其他标志不同。 You can go to the Property Pages of your project and find all the compiler options that have been used in the "All Options" and "Additional Options" panes.
您可以转到项目的属性页,找到已在“所有选项”和“其他选项”窗格中使用的所有编译器选项。 In the Release build, many options are used other than
/arch:SSE2 /Ot /O2
. 在发布版本中,除了
/arch:SSE2 /Ot /O2
之外还使用了许多选项
。 Note that /arch:SSE2
is the default , so you don't have to explicitly specify it. 请注意
/arch:SSE2
是默认值 ,因此您不必明确指定它。 Also, /O2
implies /Ot
. 此外,
/O2
暗示/Ot
。 So /arch:SSE2 /Ot /O2
is equivalent to /O2
. 所以
/arch:SSE2 /Ot /O2
相当于/O2
。
There are multiple paths to a destination. 目的地有多条路径。
Roger Orr gave a good talk at an ACCU conference on the quirks of compiling. Roger Orr 在ACCU会议上就编译的怪癖发表了精彩的演讲 。 For example, a simple "hello world" will generate 98 lines of asm in GCC but 6,704 in MSVC.
例如,一个简单的“hello world”将在GCC中生成98行asm,但在MSVC中生成6,704行。
To quickly and simply answer your question: in your Godbolt link the version is 19.15.26726.0
and your main.asm file 19.15.26732.1
快速简单地回答你的问题:在你的Godbolt链接中,版本是
19.15.26726.0
和你的main.asm文件19.15.26732.1
Close, but maybe enough to make this difference? 很接近,但也许足以造成这种差异?
19.15.26726.0
19.15.26732.1
MSVC is particularly weird, you can output asm with GCC then use that asm to pass through GCC again and get the same machine code. MSVC特别奇怪,您可以使用GCC输出asm然后再使用asm再次通过GCC并获得相同的机器代码。 You can't in MSVC.
你不能在MSVC。 So maybe if the versions were exactly the same, you would still get different asm, that'd be a fun experiment to run, this article shows you how to run two different versions of MSVC side by side in Visual Studio.
因此,也许即使版本完全相同,你仍然会得到不同的asm,这将是一个值得一试的有趣实验。本文将向您展示如何在Visual Studio中并排运行两个不同版本的MSVC。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.