GCC生成SSE指令而不是AVX

Question

我是這樣調用GCC的：

$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s

其中重要的選項是：

-S                      : output assembly
-ftree-vectorize        : vectorize loops
-march=core-avx-i       : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2,
                        : AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C
                        : instruction set support."

這是編譯之前的源代碼：

#include "collision.h"

// Global grid that buildField() rewrites on every call: all bytes are first
// cleared to 0x00, then bytes are set to 0x01 for each object.
// NOTE(review): FIELD_W / FIELD_H are not defined in this listing --
// presumably they come from collision.h; confirm there.
int8_t currentField[FIELD_W][FIELD_H];

// Clear and rebuild the field based on the objects with a gravity well
//
// body: gravity well to read. The function uses body->object_count and, for
//       every index below it, body->stuff[i].pos.{x,y,w,h}.
//
// Returns nothing; the result is written into the global currentField.
//
// NOTE(review): none of the computed row indices or memset lengths are
// checked against FIELD_W / FIELD_H, so an object near or outside the edge
// of the field would make memset write outside currentField -- confirm that
// callers guarantee every object lies fully inside the field.
void buildField (const gravityWell *body) {
    int x, y;           // x: object index (also row index while clearing); y: row index while filling
    int w, h, Cx, Cy;   // half-extents (after the /2 below) and centre of the current object
    int Vx[2], Vy[2];   // [0] = lower bound, [1] = upper bound of the bounding rectangle

    // Clear the field
    // One memset per row of FIELD_H bytes, for all FIELD_W rows.
    for (x = 0; x < FIELD_W; x++) {
        memset (currentField[x], 0x00, FIELD_H);
    }

    // Rebuild the field
    for (x = 0; x < body->object_count; x++) {
        // Fetch the position and dimensions of the object and convert
        // them to ints.
        // NOTE(review): the assembly listing for this function loads
        // pos.x / pos.y with vmovss and converts them with vcvttss2si,
        // i.e. they are floats and the conversion truncates toward zero
        // rather than rounding to nearest.
        Cx =    body->stuff[x].pos.x;
        Cy =    body->stuff[x].pos.y;
        w = body->stuff[x].pos.w;
        h = body->stuff[x].pos.h;

        // Calculate the lower left and upper right edges of a
        // rectangle encompassing the object
        // (w and h become half-extents; integer division truncates, so an
        // odd width/height loses its last unit here.)
        w = w / 2;
        h = h / 2;
        Vx[0] = Cx - w;
        Vx[1] = Cx + w;
        Vy[0] = Cy - h;
        Vy[1] = Cy + h;

        // Add in the offset for array accesses
        // (shifts coordinates by half the field size so that the origin maps
        // to the middle of currentField)
        Vx[0] += FIELD_W / 2;
        Vx[1] += FIELD_W / 2;
        Vy[0] += FIELD_H / 2;
        Vy[1] += FIELD_H / 2;

        // Turn the upper bounds into exclusive ends, so the loop below
        // (y < Vx[1]) and the length (Vy[1] - Vy[0]) include the upper edge.
        Vx[1]++;
        Vy[1]++;

        // Set the area occupied by the object to ones
        // Despite its name, y walks the first (FIELD_W) index of currentField
        // over the range Vx[0] .. Vx[1]-1.
        // NOTE(review): each memset starts at the beginning of the row
        // (currentField[y][0]); Vy[0] only contributes to the byte count and
        // not to the starting column. If the marked span is meant to begin at
        // column Vy[0], the destination would need that offset -- confirm
        // the intent.
        for (y = Vx[0]; y < Vx[1]; y++) {
            memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
        }
    }

    return;
}

這是生成的匯編代碼（GAS語法）：

    .file   "collision.c"
# GNU C (Ubuntu/Linaro 4.8.1-10ubuntu9) version 4.8.1 (x86_64-linux-gnu)
#   compiled by GNU C version 4.8.1, GMP version 5.1.2, MPFR version 3.1.1-p2, MPC version 1.0.1
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -I /usr/include/SDL2 -I build -I .
# -imultiarch x86_64-linux-gnu -D _REENTRANT algo/collision.c -m64 -mpc64
# -mfpmath=both -march=core-avx-i -auxbase-strip build/collision.s -O2
# -fverbose-asm -fipa-pta -floop-interchange -floop-strip-mine -floop-block
# -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops
# -ftree-vectorize -fstack-protector -Wformat -Wformat-security
# options enabled:  -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcompare-elim
# -fcprop-registers -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-profile -fipa-pta -fipa-pure-const -fipa-reference -fipa-sra
# -fira-hoist-pressure -fira-share-save-slots -fira-share-spill-slots
# -fivopts -fkeep-static-consts -fleading-underscore -floop-block
# -floop-interchange -floop-strip-mine -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeephole -fpeephole2 -fprefetch-loop-arrays -free
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fshrink-wrap -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-wide-types -fstack-protector -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-distribution -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop
# -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -maccumulate-outgoing-args -maes
# -malign-stringops -mavx -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mcx16 -mf16c -mfancy-math-387
# -mfp-ret-in-387 -mfsgsbase -mfxsr -mglibc -mieee-fp -mlong-double-80
# -mmmx -mpc64 -mpclmul -mpopcnt -mpush-args -mrdrnd -mred-zone -msahf
# -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3
# -mtls-direct-seg-refs -mvzeroupper -mxsave -mxsaveopt

    .text
    .p2align 4,,15
    .globl  buildField
    .type   buildField, @function
    #-----------------------------------------------------------------------
    # void buildField(const gravityWell *body)
    # Syntax: GAS, AT&T operand order (source, destination).
    # ABI:    System V AMD64 (the compiler banner names x86_64-linux-gnu).
    # In:     rdi = body
    # Out:    nothing; writes the global currentField through memset calls.
    #
    # Register roles (all five are callee-saved, hence the pushes; they hold
    # values that must survive the calls to memset):
    #   r13  = body
    #   rbx  = clear loop: pointer to the row being zeroed
    #          fill loop:  end pointer (one row past the last row to fill)
    #   r12d = x, the object index
    #   r14  = byte offset of object x inside body->stuff (x * 128)
    #   rbp  = byte count passed to each fill memset (Vy[1] - Vy[0])
    #
    # Stack: five pushes (40 bytes) on top of the return address bring rsp
    # to a 16-byte boundary, which the ABI requires at each call.
    #
    # No packed (vector) instructions appear in this function: the only
    # instructions touching SIMD registers are the scalar, VEX-encoded
    # vmovss and vcvttss2si. All bulk work is done inside memset.
    #-----------------------------------------------------------------------
buildField:
.LFB24:
    .cfi_startproc
    pushq   %r14    #
    .cfi_def_cfa_offset 16
    .cfi_offset 14, -16
    pushq   %r13    #
    .cfi_def_cfa_offset 24
    .cfi_offset 13, -24
    # Keep the argument in a callee-saved register; rdi is reused as the
    # first argument of every memset call.
    movq    %rdi, %r13  # body, body
    pushq   %r12    #
    .cfi_def_cfa_offset 32
    .cfi_offset 12, -32
    pushq   %rbp    #
    .cfi_def_cfa_offset 40
    .cfi_offset 6, -40
    pushq   %rbx    #
    .cfi_def_cfa_offset 48
    .cfi_offset 3, -48
    # rbx = &currentField[0]. This is a 32-bit absolute address (non-PIC
    # code); writing ebx zero-extends into rbx.
    movl    $currentField, %ebx #, ivtmp.26
    .p2align 4,,10
    .p2align 3
    # --- Clear loop: memset(row, 0, 4000) for every 4000-byte row until the
    # --- pointer reaches currentField + 16000000 (i.e. 4000 rows).
.L3:
    xorl    %esi, %esi  #
    movq    %rbx, %rdi  # ivtmp.26,
    movl    $4000, %edx #,
    call    memset  #
    addq    $4000, %rbx #, ivtmp.26
    cmpq    $currentField+16000000, %rbx    #, ivtmp.26
    jne .L3 #,
    # --- Rebuild: x = 0, offset = 0; skip everything if object_count <= 0.
    # object_count is read from offset 8 of *body.
    movl    8(%r13), %eax   # body_11(D)->object_count,
    xorl    %r14d, %r14d    # ivtmp.19
    xorl    %r12d, %r12d    # x
    testl   %eax, %eax  #
    jle .L12    #,
    .p2align 4,,10
    .p2align 3
    # --- Per-object loop body.
.L11:
    # rax = address of body->stuff[x]: the stuff pointer (offset 0 of *body,
    # reloaded on every iteration) plus the running byte offset in r14.
    movq    %r14, %rax  # ivtmp.19, D.2657
    addq    0(%r13), %rax   # body_11(D)->stuff, D.2657
    # pos.w / pos.h are loaded as 32-bit integers, pos.x / pos.y as
    # single-precision floats.
    movl    96(%rax), %edx  # _16->pos.w, w
    vmovss  88(%rax), %xmm0 # _16->pos.x,
    vmovss  92(%rax), %xmm1 # _16->pos.y,
    movl    100(%rax), %eax # _16->pos.h, h
    # float -> int conversion with truncation (the extra 't' in cvtt).
    vcvttss2si  %xmm0, %esi #, Cx
    # The shrl $31 / addl / sarl sequences below implement the signed
    # divisions w / 2 and h / 2: the sign bit is added first so that the
    # arithmetic shift rounds toward zero as C requires. The two sequences
    # are interleaved by the scheduler.
    movl    %edx, %edi  # w, tmp125
    vcvttss2si  %xmm1, %ecx #, Cy
    shrl    $31, %edi   #, tmp125
    addl    %edi, %edx  # tmp125, tmp127
    movl    %eax, %edi  # h, tmp128
    sarl    %edx    # tmp127
    shrl    $31, %edi   #, tmp128
    movl    %ecx, %r8d  # Cy, D.2655
    addl    %edi, %eax  # tmp128, tmp130
    movl    %esi, %edi  # Cx, D.2655
    sarl    %eax    # tmp130
    # From here: edx = w/2, eax = h/2, esi = edi = Cx, ecx = r8d = Cy.
    # edi = Cx - w/2 and edx = Cx + w/2
    subl    %edx, %edi  # tmp127, D.2655
    addl    %esi, %edx  # Cx, D.2655
    # ebp = Cy + h/2 + 2000 + 1 = Vy[1] (2000 is the FIELD_H / 2 offset)
    leal    2001(%rcx,%rax), %ebp   #, D.2655
    # r8d = Cy - h/2 + 2000 = Vy[0]
    subl    %eax, %r8d  # tmp130, D.2655
    # esi = Cx - w/2 + 2000 = Vx[0], the initial value of y
    leal    2000(%rdi), %esi    #, y
    addl    $2000, %r8d #, D.2655
    # eax = Cx + w/2 + 2000 + 1 = Vx[1]
    leal    2001(%rdx), %eax    #, D.2655
    # Skip the fill loop when y >= Vx[1] (signed compare).
    cmpl    %eax, %esi  # D.2655, y
    jge .L8 #,
    # Fill-loop setup.
    #   rax = y sign-extended to 64 bits
    #   edx = (Cx + w/2) - (Cx - w/2) = Vx[1] - Vx[0] - 1
    #   ebp = Vy[1] - Vy[0], the memset length
    #   rbx = index of the last row to fill
    movslq  %esi, %rax  # y, D.2660
    subl    %edi, %edx  # D.2655, D.2654
    subl    %r8d, %ebp  # D.2655, D.2655
    leaq    (%rdx,%rax), %rbx   #, D.2654
    movslq  %ebp, %rbp  # D.2655, D.2661
    # Scale both row indices by the 4000-byte row size and add the array
    # base: rcx = &currentField[y] (first row), rbx = address one row past
    # the last row, used as the loop's end sentinel.
    imulq   $4000, %rax, %rcx   #, D.2660, D.2660
    imulq   $4000, %rbx, %rbx   #, D.2654, D.2654
    addq    $currentField, %rcx #, ivtmp.12
    addq    $currentField+4000, %rbx    #, D.2654
    .p2align 4,,10
    .p2align 3
    # --- Fill loop: memset(row, 1, rbp) for each row in [rcx, rbx).
.L9:
    movq    %rcx, %rdi  # ivtmp.12,
    movq    %rbp, %rdx  # D.2661,
    movl    $1, %esi    #,
    call    memset  #
    # rcx is caller-saved and does not survive the call; memset returns its
    # destination pointer in rax, so the row pointer is recovered from there.
    movq    %rax, %rcx  #, ivtmp.12
    addq    $4000, %rcx #, ivtmp.12
    cmpq    %rbx, %rcx  # D.2654, ivtmp.12
    jne .L9 #,
    # --- Advance to the next object.
.L8:
    addl    $1, %r12d   #, x
    # Adds 128 to the byte offset, so consecutive body->stuff elements are
    # 128 bytes apart. Written as a subtraction of -128 because -128 fits a
    # sign-extended 8-bit immediate while +128 does not.
    subq    $-128, %r14 #, ivtmp.19
    # object_count is re-read from memory on every iteration; loop again
    # while object_count > x (signed).
    cmpl    %r12d, 8(%r13)  # x, body_11(D)->object_count
    jg  .L11    #,
    # --- Epilogue: restore callee-saved registers in reverse push order.
.L12:
    popq    %rbx    #
    .cfi_def_cfa_offset 40
    popq    %rbp    #
    .cfi_def_cfa_offset 32
    popq    %r12    #
    .cfi_def_cfa_offset 24
    popq    %r13    #
    .cfi_def_cfa_offset 16
    popq    %r14    #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   buildField, .-buildField
    # currentField is emitted as a common symbol: 16000000 bytes, 32-byte
    # aligned.
    .comm   currentField,16000000,32
    .ident  "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
    # Empty .note.GNU-stack section: tells the linker this object does not
    # need an executable stack.
    .section    .note.GNU-stack,"",@progbits

GCC使用的是SSE指令而不是AVX指令，尤其是它使用了SSE的128位%xmm寄存器，而不是AVX的256位%ymm寄存器。

為什麼會這樣？更重要的是，我如何強制GCC使用AVX而不是SSE？

Answer 1

你的代碼執行的全部是整數運算；AVX擴展中沒有整數操作。整數操作是在AVX2中才加入的，而你尚未啟用AVX2。

在你把所有代碼改寫為使用float、或者購買支持AVX2的處理器之前，我應該指出：你使用的似乎是「結構體數組」（array of structures）內存布局，這種布局會難倒許多自動矢量化器，所以即使有整數操作可用，你的代碼也未必能利用AVX。你可能需要考慮改用「數組的結構體」（structure of arrays）布局，不過這也可能是一個侵入性相對較大的改動。

GCC生成SSE指令而不是AVX

問題描述

1 個解決方案

解決方案1
11 已采納 2014-02-17 22:44:08

GCC生成SSE指令而不是AVX

問題描述

1 個解決方案

解決方案1 11 已采納 2014-02-17 22:44:08

解決方案1
11 已采納 2014-02-17 22:44:08