繁体   English   中英

gcc 的 -O3 选项为什么能让程序运行得这么快?

[英]How can gcc -O3 option make the run so fast?

[问题]我使用 -O3 选项编译并运行了下面的代码,发现使用 -O3 编译的代码比不使用 -O3 编译的代码快 9 倍。

编辑:我想知道的是这里用到了哪些关键的优化技术,而不只是变快的原因,这才是我的问题。我没有接触过 x86 汇编,因此很难读懂 x86 汇编代码,这就是我发布此问题的原因。或者,您能为我解释一下使用 -O3 选项生成的代码吗?

[C代码]该代码仅执行加法。

/*
 * Scale a raw feature sum by dividing it by 255.0 * OFFSET.
 *
 * x: unsigned sum to scale.
 * Returns x / (255.0 * OFFSET). Because 255.0 is a double constant the
 * division is carried out in double; the quotient is narrowed to float by
 * the return type.
 *
 * OFFSET is not defined in this snippet and has to come from elsewhere.
 */
float minmax_scale(unsigned int x) {

    // x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
    return (x/(255.0 * OFFSET));
}

/*
 * Benchmark driver: for each of the TSIZE output slots, add up OFFSET-spaced
 * groups of eight bytes from ibuffer and store the scaled sum in H.
 *
 * argc, argv: unused.
 * Always returns 0.
 *
 * NOTE(review): ibuffer is never written before it is read, so the sums are
 * built from indeterminate values. H is written but never read afterwards,
 * so nothing computed here reaches the program's output or exit status.
 * INPUT_FEATURE, TSIZE and OFFSET are not defined in this snippet.
 */
int main(int argc, char** argv) {
  char ibuffer[INPUT_FEATURE];
  double H[TSIZE];

  // feature summation and scale
  // k indexes H; i indexes ibuffer and advances by OFFSET per iteration.
  for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
            // The cast binds only to ibuffer[i]; the other seven bytes are
            // promoted to int and converted to unsigned by the additions.
            // The float result is widened to double by the assignment.
            H[k] = minmax_scale(
                   (unsigned int)ibuffer[i]
                   + ibuffer[i+1]
                   + ibuffer[i+2]
                   + ibuffer[i+3]
                   + ibuffer[i+4]
                   + ibuffer[i+5]
                   + ibuffer[i+6]
                   + ibuffer[i+7]
                  );
  }

  return 0;
}

[使用 -O3 生成的汇编]

    .file   "measure_fs_simple.c"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB1:
    .text
.LHOTB1:
    .p2align 4,,15
    .globl  minmax_scale
    .type   minmax_scale, @function
# float minmax_scale(unsigned int x) -- compiler output with -O3, AT&T syntax.
# In:  %edi  = x
# Out: %xmm0 = x divided by the 8-byte constant at .LC0, narrowed to float
# No stack frame is set up; the body is straight-line register code.
minmax_scale:
.LFB0:
    .cfi_startproc
    # Clear %xmm0 before the conversion below writes its low 64 bits.
    pxor    %xmm0, %xmm0
    # A 32-bit move zeroes bits 63:32 of %rdi, i.e. x is zero-extended.
    movl    %edi, %edi
    # Convert the 64-bit integer in %rdi to a double in %xmm0.
    cvtsi2sdq   %rdi, %xmm0
    # Divide by the constant, used directly as a memory operand.
    divsd   .LC0(%rip), %xmm0
    # Narrow the double quotient to the float return value.
    cvtsd2ss    %xmm0, %xmm0
    ret
    .cfi_endproc
.LFE0:
    .size   minmax_scale, .-minmax_scale
    .section    .text.unlikely
.LCOLDE1:
    .text
.LHOTE1:
    .section    .text.unlikely
.LCOLDB2:
    .section    .text.startup,"ax",@progbits
.LHOTB2:
    .p2align 4,,15
    .globl  main
    .type   main, @function
# int main(int argc, char **argv) -- compiler output with -O3.
# The entire body is "return 0": no loop, no load from ibuffer, no store to H
# and no call to minmax_scale appears in this function.
main:
.LFB1:
    .cfi_startproc
    # %eax = 0, the return value.
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE1:
    .size   main, .-main
    .section    .text.unlikely
.LCOLDE2:
    .section    .text.startup
.LHOTE2:
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
# 8-byte divisor used by minmax_scale, emitted as two 32-bit words with the
# low word first. 1084219392 == 0x409FE000, so the 64-bit pattern is
# 0x409FE00000000000, which is the IEEE-754 double 2040.0.
.LC0:
    .long   0
    .long   1084219392
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits

[不使用 -O3 生成的汇编]

 .file   "measure_fs_simple.c"
    .text
    .globl  minmax_scale
    .type   minmax_scale, @function
# float minmax_scale(unsigned int x) -- unoptimized compiler output, AT&T syntax.
# In:  %edi  = x
# Out: %xmm0 = x divided by the 8-byte constant at .LC0, narrowed to float
# Uses a frame pointer and spills x to the stack before using it.
minmax_scale:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Spill x to -4(%rbp) and reload it. %rsp is not lowered, so the slot
    # sits below the stack pointer; this function makes no calls.
    movl    %edi, -4(%rbp)
    # The 32-bit load zero-extends x into %rax.
    movl    -4(%rbp), %eax
    # Branch on bit 63 of %rax. After the zero-extending load above that bit
    # is 0, so the fall-through path is the one taken for a 32-bit x.
    testq   %rax, %rax
    js  .L2
    # Value fits in a signed 64-bit integer: convert it directly.
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rax, %xmm0
    jmp .L3
.L2:
    # Path for values with bit 63 set: halve the value, OR the dropped low
    # bit back into bit 0, convert the halved value, then double the result.
    movq    %rax, %rdx
    shrq    %rdx
    andl    $1, %eax
    orq %rax, %rdx
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rdx, %xmm0
    addsd   %xmm0, %xmm0
.L3:
    # Load the divisor into %xmm1, divide, then narrow double -> float.
    movsd   .LC0(%rip), %xmm1
    divsd   %xmm1, %xmm0
    cvtsd2ss    %xmm0, %xmm0
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   minmax_scale, .-minmax_scale
    .globl  main
    .type   main, @function
# int main(int argc, char **argv) -- unoptimized compiler output, AT&T syntax.
# Every local lives in the stack frame and is reloaded from memory on each use.
# Frame slots, as used by the code below:
#   -2084(%rbp)  argc, spilled from %edi
#   -2096(%rbp)  argv, spilled from %rsi
#   -2072(%rbp)  k: loop counter and index into H
#   -2068(%rbp)  i: byte index into ibuffer
#   -2064(%rbp)  base of H       (indexed with scale 8)
#   -1040(%rbp)  base of ibuffer (indexed with scale 1)
#      -8(%rbp)  copy of the guard value read from %fs:40
main:
.LFB1:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $2096, %rsp
    movl    %edi, -2084(%rbp)
    movq    %rsi, -2096(%rbp)
    # Save the guard value from %fs:40; it is re-checked before returning.
    movq    %fs:40, %rax
    movq    %rax, -8(%rbp)
    xorl    %eax, %eax
    # k = 0, i = 0, then jump to the loop condition at .L6.
    movl    $0, -2072(%rbp)
    movl    $0, -2068(%rbp)
    jmp .L6
.L7:
    # Loop body. Each of the eight groups below reloads i, adds a byte
    # offset of 0..7, loads one byte of ibuffer and sign-extends it to 32
    # bits. %edx carries the running sum.
    # ibuffer[i] starts the sum.
    movl    -2068(%rbp), %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %edx
    # + ibuffer[i+1]
    movl    -2068(%rbp), %eax
    addl    $1, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+2]
    movl    -2068(%rbp), %eax
    addl    $2, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+3]
    movl    -2068(%rbp), %eax
    addl    $3, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+4]
    movl    -2068(%rbp), %eax
    addl    $4, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+5]
    movl    -2068(%rbp), %eax
    addl    $5, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+6]
    movl    -2068(%rbp), %eax
    addl    $6, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+7]; the final sum ends up in %eax.
    movl    -2068(%rbp), %eax
    addl    $7, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %edx, %eax
    # Pass the sum as the first argument and call minmax_scale.
    movl    %eax, %edi
    call    minmax_scale
    # Widen the float result to double and store it to H[k].
    cvtss2sd    %xmm0, %xmm0
    movl    -2072(%rbp), %eax
    cltq
    movsd   %xmm0, -2064(%rbp,%rax,8)
    # i += 8; k += 1, both updated directly in memory.
    addl    $8, -2068(%rbp)
    addl    $1, -2072(%rbp)
.L6:
    # Loop condition: continue while k <= 127.
    cmpl    $127, -2072(%rbp)
    jle .L7
    # Return value 0.
    movl    $0, %eax
    # Compare the saved guard with %fs:40; on mismatch call __stack_chk_fail.
    movq    -8(%rbp), %rcx
    xorq    %fs:40, %rcx
    je  .L9
    call    __stack_chk_fail
.L9:
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE1:
    .size   main, .-main
    .section    .rodata
    .align 8
# 8-byte divisor loaded by minmax_scale, emitted as two 32-bit words with the
# low word first. 1084219392 == 0x409FE000, so the 64-bit pattern is
# 0x409FE00000000000, which is the IEEE-754 double 2040.0.
.LC0:
    .long   0
    .long   1084219392
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits

您的代码没有可观察到的副作用,因此优化器只是丢弃了大部分代码。

使用 -O3 时,您的 main 函数被编译为:

# Excerpt of main: set the return value in %eax to 0 and return.
main:
    xorl    %eax, %eax
    ret

等效于:

/* Does nothing and returns 0. */
int main()
{
    return 0;
}

这说明微基准测试(microbenchmark)代码很难写对。

编辑:

正如下面的评论所指出的,所发布的代码没有初始化 ibuffer[INPUT_FEATURE]。读取未初始化的变量属于未定义行为,这会使整个程序变得不合法。这是一个实实在在的问题:在这种情况下,并不要求代码产生合理的结果。谢谢 @chqrlie

我按照您的回答修改了代码并重新进行了测试,如下所示。结果与之前相同:使用 -O3 选项仍然比不使用优化选项要快。

#define OFFSET                  (8)
#define INPUT_FEATURE           (1024)
#define TSIZE                   (INPUT_FEATURE/OFFSET)
#include<stdio.h>

/*
 * Scale a raw feature sum by dividing it by 255.0 * OFFSET.
 *
 * x: unsigned sum to scale.
 * Returns x / (255.0 * OFFSET). Because 255.0 is a double constant the
 * division is carried out in double; the quotient is narrowed to float by
 * the return type.
 */
float minmax_scale(unsigned int x) {

    // x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
    return (x/(255.0 * OFFSET));
}

/*
 * Revised benchmark driver: zero H, fill it with scaled sums of eight bytes
 * taken from ibuffer, then print every element of H.
 *
 * argc, argv: unused.
 * Always returns 0.
 *
 * NOTE(review): ibuffer is still never written before the summation loop
 * reads it, so the printed values are built from indeterminate data.
 */
int main(int argc, char** argv) {
  char ibuffer[INPUT_FEATURE];
  double H[TSIZE];

  // Zero every element of H. i is advanced but not used in this loop.
  for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
      H[k] = 0.0;
  }

  // feature summation and scale
  // k indexes H; i indexes ibuffer and advances by OFFSET per iteration.
  for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
            // The cast binds only to ibuffer[i]; the other seven bytes are
            // promoted to int and converted to unsigned by the additions.
            // The float result is widened to double by the assignment.
            H[k] = minmax_scale(
                   (unsigned int)ibuffer[i]
                   + ibuffer[i+1]
                   + ibuffer[i+2]
                   + ibuffer[i+3]
                   + ibuffer[i+4]
                   + ibuffer[i+5]
                   + ibuffer[i+6]
                   + ibuffer[i+7]
                  );
  }

  // Print each element of H. The format has no separator or newline, so
  // consecutive values are written back to back. i is unused here as well.
  for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
      printf("%lf",H[k]);
  }

  return 0;
}

[使用 -O3 选项生成的汇编代码]

     .file   "measure_fs_simple.c"
        .section    .text.unlikely,"ax",@progbits
    .LCOLDB1:
        .text
    .LHOTB1:
        .p2align 4,,15
        .globl  minmax_scale
        .type   minmax_scale, @function
    # float minmax_scale(unsigned int x) -- compiler output with -O3, AT&T syntax.
    # In:  %edi  = x
    # Out: %xmm0 = x divided by the 8-byte constant at .LC0, narrowed to float
    # No stack frame is set up; the body is straight-line register code.
    minmax_scale:
    .LFB23:
        .cfi_startproc
        # Clear %xmm0 before the conversion below writes its low 64 bits.
        pxor    %xmm0, %xmm0
        # A 32-bit move zeroes bits 63:32 of %rdi, i.e. x is zero-extended.
        movl    %edi, %edi
        # Convert the 64-bit integer in %rdi to a double in %xmm0.
        cvtsi2sdq   %rdi, %xmm0
        # Divide by the constant, used directly as a memory operand.
        divsd   .LC0(%rip), %xmm0
        # Narrow the double quotient to the float return value.
        cvtsd2ss    %xmm0, %xmm0
        ret
        .cfi_endproc
    .LFE23:
        .size   minmax_scale, .-minmax_scale
        .section    .text.unlikely
    .LCOLDE1:
        .text
    .LHOTE1:
        .section    .rodata.str1.1,"aMS",@progbits,1
    .LC5:
        .string "%lf"
        .section    .text.unlikely
    .LCOLDB6:
        .section    .text.startup,"ax",@progbits
    .LHOTB6:
        .p2align 4,,15
        .globl  main
        .type   main, @function
    main:
    .LFB24:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        pushq   %rbx
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
        movl    $128, %ecx
        pxor    %xmm12, %xmm12

[不使用优化选项生成的汇编代码]

.file   "measure_fs_simple.c"
    .text
    .globl  minmax_scale
    .type   minmax_scale, @function
# float minmax_scale(unsigned int x) -- unoptimized compiler output, AT&T syntax.
# In:  %edi  = x
# Out: %xmm0 = x divided by the 8-byte constant at .LC0, narrowed to float
# Uses a frame pointer and spills x to the stack before using it.
minmax_scale:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Spill x to -4(%rbp) and reload it. %rsp is not lowered, so the slot
    # sits below the stack pointer; this function makes no calls.
    movl    %edi, -4(%rbp)
    # The 32-bit load zero-extends x into %rax.
    movl    -4(%rbp), %eax
    # Branch on bit 63 of %rax. After the zero-extending load above that bit
    # is 0, so the fall-through path is the one taken for a 32-bit x.
    testq   %rax, %rax
    js  .L2
    # Value fits in a signed 64-bit integer: convert it directly.
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rax, %xmm0
    jmp .L3
.L2:
    # Path for values with bit 63 set: halve the value, OR the dropped low
    # bit back into bit 0, convert the halved value, then double the result.
    movq    %rax, %rdx
    shrq    %rdx
    andl    $1, %eax
    orq %rax, %rdx
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rdx, %xmm0
    addsd   %xmm0, %xmm0
.L3:
    # Load the divisor into %xmm1, divide, then narrow double -> float.
    movsd   .LC0(%rip), %xmm1
    divsd   %xmm1, %xmm0
    cvtsd2ss    %xmm0, %xmm0
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   minmax_scale, .-minmax_scale
    .section    .rodata
.LC2:
    .string "%lf"
    .text
    .globl  main
    .type   main, @function
# int main(int argc, char **argv) -- unoptimized compiler output for the
# revised program (zero H, fill H, print H). AT&T syntax.
# Every local lives in the stack frame and is reloaded from memory on each use.
# Frame slots, as used by the code below:
#   -2100(%rbp)  argc, spilled from %edi
#   -2112(%rbp)  argv, spilled from %rsi
#   -2088(%rbp) / -2084(%rbp)  k / i of the first loop  (zero H)
#   -2080(%rbp) / -2076(%rbp)  k / i of the second loop (fill H)
#   -2072(%rbp) / -2068(%rbp)  k / i of the third loop  (print H)
#   -2120(%rbp)  scratch copy of H[k] on its way to %xmm0
#   -2064(%rbp)  base of H       (indexed with scale 8)
#   -1040(%rbp)  base of ibuffer (indexed with scale 1)
#      -8(%rbp)  copy of the guard value read from %fs:40
main:
.LFB1:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $2128, %rsp
    movl    %edi, -2100(%rbp)
    movq    %rsi, -2112(%rbp)
    # Save the guard value from %fs:40; it is re-checked before returning.
    movq    %fs:40, %rax
    movq    %rax, -8(%rbp)
    xorl    %eax, %eax
    # ---- Loop 1: H[k] = 0.0 for k = 0..127 ----
    movl    $0, -2088(%rbp)
    movl    $0, -2084(%rbp)
    jmp .L6
.L7:
    # Store an all-zero %xmm0 (0.0) to H[k].
    movl    -2088(%rbp), %eax
    cltq
    pxor    %xmm0, %xmm0
    movsd   %xmm0, -2064(%rbp,%rax,8)
    # i += 8 (i is never read in this loop); k += 1.
    addl    $8, -2084(%rbp)
    addl    $1, -2088(%rbp)
.L6:
    cmpl    $127, -2088(%rbp)
    jle .L7
    # ---- Loop 2: H[k] = minmax_scale(sum of ibuffer[i..i+7]) ----
    movl    $0, -2080(%rbp)
    movl    $0, -2076(%rbp)
    jmp .L8
.L9:
    # Each of the eight groups below reloads i, adds a byte offset of 0..7,
    # loads one byte of ibuffer and sign-extends it to 32 bits. %edx carries
    # the running sum.
    # ibuffer[i] starts the sum.
    movl    -2076(%rbp), %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %edx
    # + ibuffer[i+1]
    movl    -2076(%rbp), %eax
    addl    $1, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+2]
    movl    -2076(%rbp), %eax
    addl    $2, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+3]
    movl    -2076(%rbp), %eax
    addl    $3, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+4]
    movl    -2076(%rbp), %eax
    addl    $4, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+5]
    movl    -2076(%rbp), %eax
    addl    $5, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+6]
    movl    -2076(%rbp), %eax
    addl    $6, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %eax, %edx
    # + ibuffer[i+7]; the final sum ends up in %eax.
    movl    -2076(%rbp), %eax
    addl    $7, %eax
    cltq
    movzbl  -1040(%rbp,%rax), %eax
    movsbl  %al, %eax
    addl    %edx, %eax
    # Pass the sum as the first argument and call minmax_scale.
    movl    %eax, %edi
    call    minmax_scale
    # Widen the float result to double and store it to H[k].
    cvtss2sd    %xmm0, %xmm0
    movl    -2080(%rbp), %eax
    cltq
    movsd   %xmm0, -2064(%rbp,%rax,8)
    # i += 8; k += 1.
    addl    $8, -2076(%rbp)
    addl    $1, -2080(%rbp)
.L8:
    cmpl    $127, -2080(%rbp)
    jle .L9
    # ---- Loop 3: printf(.LC2, H[k]) for k = 0..127 ----
    movl    $0, -2072(%rbp)
    movl    $0, -2068(%rbp)
    jmp .L10
.L11:
    # Copy H[k] through the scratch slot into %xmm0.
    movl    -2072(%rbp), %eax
    cltq
    movq    -2064(%rbp,%rax,8), %rax
    movq    %rax, -2120(%rbp)
    movsd   -2120(%rbp), %xmm0
    # First argument: address of the format string at .LC2.
    movl    $.LC2, %edi
    # Variadic call: %eax = 1 because one vector register (%xmm0) carries an
    # argument.
    movl    $1, %eax
    call    printf
    # i += 8 (i is never read in this loop); k += 1.
    addl    $8, -2068(%rbp)
    addl    $1, -2072(%rbp)
.L10:
    cmpl    $127, -2072(%rbp)
    jle .L11
    # Return value 0.
    movl    $0, %eax
    # Compare the saved guard with %fs:40; on mismatch call __stack_chk_fail.
    movq    -8(%rbp), %rcx
    xorq    %fs:40, %rcx
    je  .L13
    call    __stack_chk_fail
.L13:
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE1:
    .size   main, .-main
    .section    .rodata
    .align 8
# 8-byte divisor loaded by minmax_scale, emitted as two 32-bit words with the
# low word first. 1084219392 == 0x409FE000, so the 64-bit pattern is
# 0x409FE00000000000, which is the IEEE-754 double 2040.0.
.LC0:
    .long   0
    .long   1084219392
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM