如果在 Intel Skylake CPU 上调用 function，为什么我的空循环运行速度会快两倍？

Question

I was running some tests to compare C to Java and ran into something interesting.我当时正在运行一些测试来比较 C 和 Java，结果遇到了一件有趣的事情。 Running my exactly identical benchmark code with optimization level 1 (-O1) in a function called by main, rather than in main itself, resulted in roughly double performance.以优化级别 1 (-O1) 编译完全相同的基准代码，把它放在由 main 调用的函数中运行，而不是放在 main 本身中运行，性能大约翻了一倍。 I'm printing out the size of size_t to verify beyond any doubt that the code is being compiled to x64.我打印了 size_t 的大小，以确凿无疑地验证代码确实被编译为 x64。

I sent the executables to my friend who's running an i7-7700HQ and got similar results.我把可执行文件发给了使用 i7-7700HQ 的朋友，得到了类似的结果。 I'm running an i7-6700.我自己用的是 i7-6700。

Here's the slower code:这是较慢的代码：

#include <stdio.h>
#include <time.h>
#include <stdint.h>

/*
 * "Slow" variant of the benchmark: the timed empty loop sits directly in main().
 * Prints sizeof(size_t), then the number of clock() ticks spent in the loop.
 */
int main() {
    /* %I64u is a Microsoft-runtime length modifier, not standard C.
     * NOTE(review): %zu is the standard specifier for size_t -- confirm the
     * target C runtime supports it before changing. */
    printf("Size = %I64u\n", sizeof(size_t));
    /* NOTE(review): clock() returns clock_t; storing it in an int narrows the
     * value wherever clock_t is wider than int. */
    int start = clock();
    /* Empty body on purpose: only the loop overhead of 10^10 iterations is timed. */
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    /* Elapsed time in clock() ticks (divide by CLOCKS_PER_SEC for seconds). */
    printf("%ld\n", clock() - start);
    return 0;
}

And the faster:这是较快的代码：

#include <stdio.h>
#include <time.h>
#include <stdint.h>

/*
 * "Fast" variant of the benchmark: the same statements as the slow variant,
 * but placed in a separate function that main() calls.
 * Prints sizeof(size_t), then the number of clock() ticks spent in the loop.
 */
void test() {
    /* %I64u is a Microsoft-runtime length modifier, not standard C.
     * NOTE(review): %zu is the standard specifier for size_t -- confirm the
     * target C runtime supports it before changing. */
    printf("Size = %I64u\n", sizeof(size_t));
    /* NOTE(review): clock() returns clock_t; storing it in an int narrows the
     * value wherever clock_t is wider than int. */
    int start = clock();
    /* Empty body on purpose: only the loop overhead of 10^10 iterations is timed. */
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    /* Elapsed time in clock() ticks (divide by CLOCKS_PER_SEC for seconds). */
    printf("%ld\n", clock() - start);
}

int main() {
    test();
    return 0;
}

I'll also provide the assembly code for you to dig in to.我还将提供汇编代码供您深入研究。 I don't know assembly.我不懂汇编。 Slower:较慢的版本：

    # Compiler output (see the .ident line: GCC 8.1.0, MinGW-W64) for the "slow"
    # variant, where the timed loop lives in main itself.
    # AT&T syntax (source operand first); Windows x64 conventions: arguments in
    # %rcx/%rdx, 32 bytes of caller-allocated shadow space, .seh_* unwind directives.
    .file   "dummy.c"
    .text
    .def    __main; .scl    2;  .type   32; .endef
    # Read-only data: the two printf format strings.
    # \12 is an octal escape for newline, \0 is the string terminator.
    .section .rdata,"dr"
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
main:
    # Prologue: save %rbx (used below to hold the start time across calls) and
    # reserve 32 bytes of shadow space for the functions called from here.
    pushq   %rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp
    .seh_stackalloc 32
    .seh_endprologue
    # NOTE(review): __main is only declared in this listing; presumably the
    # MinGW runtime start-up hook -- confirm.
    call    __main
    # printf("Size = %I64u\n", 8): format string in %rcx, sizeof(size_t)
    # already folded to the constant 8 in %edx.
    movl    $8, %edx
    leaq    .LC0(%rip), %rcx
    call    printf
    # start = clock(); only the low 32 bits are kept, in %ebx.
    call    clock
    movl    %eax, %ebx
    # Loop counter: starts at 10^10 and counts down, whereas the C source counts up.
    movabsq $10000000000, %rax
.L2:
    # The entire benchmark loop: decrement, branch back while non-zero.
    # The listing carries no addresses; the answer in this document attributes the
    # slowdown to where this sub/jne pair landed relative to a 32-byte boundary.
    subq    $1, %rax
    jne .L2
    # printf("%ld\n", clock() - start): difference computed in 32 bits, passed in %edx.
    call    clock
    subl    %ebx, %eax
    movl    %eax, %edx
    leaq    .LC1(%rip), %rcx
    call    printf
    # return 0, then undo the prologue.
    movl    $0, %eax
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

Faster:较快的版本：

    # Compiler output for the "fast" variant: the timed loop lives in test(),
    # which main calls.
    # AT&T syntax (source operand first); Windows x64 conventions: arguments in
    # %rcx/%rdx, 32 bytes of caller-allocated shadow space, .seh_* unwind directives.
    .file   "dummy.c"
    .text
    # Read-only data: the two printf format strings.
    # \12 is an octal escape for newline, \0 is the string terminator.
    .section .rdata,"dr"
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  test
    .def    test;   .scl    2;  .type   32; .endef
    .seh_proc   test
test:
    # Prologue: save %rbx (used below to hold the start time across calls) and
    # reserve 32 bytes of shadow space for the functions called from here.
    pushq   %rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp
    .seh_stackalloc 32
    .seh_endprologue
    # printf("Size = %I64u\n", 8): format string in %rcx, sizeof(size_t)
    # already folded to the constant 8 in %edx.
    movl    $8, %edx
    leaq    .LC0(%rip), %rcx
    call    printf
    # start = clock(); only the low 32 bits are kept, in %ebx.
    call    clock
    movl    %eax, %ebx
    # Loop counter: starts at 10^10 and counts down, whereas the C source counts up.
    movabsq $10000000000, %rax
.L2:
    # The entire benchmark loop: the same two instructions as in the slow listing.
    # Only their placement in the final executable differs, which the listing
    # itself does not show.
    subq    $1, %rax
    jne .L2
    # printf("%ld\n", clock() - start): difference computed in 32 bits, passed in %edx.
    call    clock
    subl    %ebx, %eax
    movl    %eax, %edx
    leaq    .LC1(%rip), %rcx
    call    printf
    # test() returns void, so no return value is loaded here; the compiler emitted
    # a nop in this slot. NOTE(review): likely so the call is not immediately
    # followed by the epilogue -- confirm.
    nop
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    # main for the "fast" variant: calls __main, runs test(), returns 0.
    .def    __main; .scl    2;  .type   32; .endef
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
main:
    # 40 = 32 bytes of shadow space + 8 bytes so %rsp is 16-byte aligned at each
    # call (on entry %rsp is 8 past a 16-byte boundary because of the return address).
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    # NOTE(review): __main is only declared in this listing; presumably the
    # MinGW runtime start-up hook -- confirm.
    call    __main
    call    test
    # return 0
    movl    $0, %eax
    addq    $40, %rsp
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

Here's my batch script for compilation:这是我的编译批处理脚本：

@echo off
rem Build-and-run helper: asks for a source file name (without .c), compiles it
rem with GCC at -O1 into compiled.exe, runs the result, then waits for a key press.
rem Read the base file name from the user into %file%.
set /p file= File to compile: 
rem Delete any previous build; presumably so a failed compile cannot leave a stale
rem compiled.exe to be run below.
del compiled.exe
rem -O1 is the optimization level the question is about; -std=c17 selects the C standard.
gcc -Wall -Wextra -std=c17 -O1 -o compiled.exe %file%.c
compiled.exe
rem Wait for a key press before the script ends.
PAUSE

And for compilation to assembly:对于编译到汇编：

@echo off
rem Assembly-listing helper: asks for a source file name (without .c) and has GCC
rem emit %file%.s instead of an executable, using the same warning/standard/-O1 flags.
rem Read the base file name from the user into %file%.
set /p file= File to compile: 
rem Delete any previous listing before regenerating it.
del %file%.s
rem -S stops after compilation proper and writes the assembly output.
gcc -S -Wall -Wextra -std=c17 -O1 %file%.c
rem Wait for a key press before the script ends.
PAUSE

Answer 1

The slow version:慢速版：

Note that the sub rax, 1 \ jne pair goes right across the ..80 boundary (which is a 32-byte boundary).请注意，sub rax, 1 \ jne 这对指令正好跨越了 ..80 处的边界（这是一个 32 字节边界）。 This is one of the cases mentioned in Intel's document regarding this issue, as shown in this diagram:这是英特尔关于此问题的文档中提到的情况之一，如下图所示：

So this op/branch pair is affected by the fix for the JCC erratum (which would cause it to not be cached in the µop cache).所以这个 op/branch 指令对受到 JCC erratum（JCC 勘误）修复的影响（这会导致它不被缓存在 µop 缓存中）。 I'm not sure if that is the reason; there are other things at play too, but it's a thing.我不确定这是否就是原因，还有其他因素在起作用，但这确实是其中一个因素。

In the fast version, the branch is not "touching" a 32byte boundary, so it is not affected.在快速版本中，分支没有“触及” 32 字节边界，因此不受影响。

There may be other effects that apply.可能还有其他因素在起作用。 Still due to crossing a 32-byte boundary, in the slow case the loop is spread across 2 chunks in the µop cache; even without the fix for the JCC erratum, that may cause it to run at 2 cycles per iteration if the loop cannot execute from the Loop Stream Detector (which is disabled on some processors by another fix for another erratum, SKL150).同样由于跨越了 32 字节边界，在较慢的情况下，循环分布在 µop 缓存的 2 个块中；即使没有 JCC erratum 的修复，如果循环无法从 Loop Stream Detector（循环流检测器）执行（在某些处理器上，它被针对另一个勘误 SKL150 的另一个修复禁用了），这也可能导致每次迭代需要 2 个时钟周期。 See e.g. this answer about loop performance.例如，参见这个关于循环性能的答案。

To address the various comments saying they cannot reproduce this: yes, there are various ways that could happen:针对各种声称无法重现此现象的评论：是的，无法重现的原因可能有多种：

Whichever effect was responsible for the slowdown, it is likely caused by the exact placement of the op/branch pair across a 32byte boundary, which happened by pure accident.无论哪种影响导致速度变慢，都可能是由于操作/分支对跨 32 字节边界的确切位置造成的，这纯属偶然。 Compiling from source is unlikely to reproduce the same circumstances, unless you use the same compiler with the same setup as was used by the original poster.从源代码编译不太可能重现相同的情况，除非您使用与原始发布者使用相同设置的相同编译器。
Even using the same binary, regardless of which of the effects is responsible, the weird effect would only happen on particular processors.即使使用相同的二进制文件，无论哪个效果负责，奇怪的效果只会发生在特定的处理器上。

如果在 Intel Skylake CPU 上调用 function，为什么我的空循环运行速度会快两倍？

问题描述

1 个解决方案

解决方案1
16 已采纳 2021-06-07 21:19:31

如果在 Intel Skylake CPU 上调用 function，为什么我的空循环运行速度会快两倍？

问题描述

1 个解决方案

解决方案1 16 已采纳 2021-06-07 21:19:31

解决方案1
16 已采纳 2021-06-07 21:19:31