[英]Why ARM NEON code slower than native C code
I'm implementing a dequantize operation in ARM NEON (ARMv8-A architecture).我正在 ARM NEON(ARM8-A 架构)中实现去量化操作。 But I ran into a strange result: the ARM NEON version (11 ms) is slower than the C version (4.75 ms).
但我遇到了一个奇怪的问题,即 ARM NEON 版本(11 毫秒)比 C 版本(4.75 毫秒)慢。
Here is our code,这是我们的代码,
NEON ASM code: NEON ASM 代码:
.arch armv8-a+crc
.file "dequantize.c"
.text
.align 2
.global retinaface_dequantize_v3
.type retinaface_dequantize_v3, %function
retinaface_dequantize_v3:
.LFB3836:
.cfi_startproc
mov x6, x0 // ivtmp.39, in_cls
add x0, x0, 266240 // _214, in_cls,
add x0, x0, 2560 // _214, _214,
adrp x7, .LC0 // tmp294,
ldr q21, [x7, #:lo12:.LC0] // tmp222,
adrp x7, .LC1 // tmp295,
ldr q20, [x7, #:lo12:.LC1] // tmp227,
adrp x7, .LC2 // tmp296,
ldr q19, [x7, #:lo12:.LC2] // tmp229,
adrp x7, .LC3 // tmp297,
ldr q1, [x7, #:lo12:.LC3] // tmp246,
adrp x7, .LC4 // tmp298,
ldr q0, [x7, #:lo12:.LC4] // tmp248,
.L2:
ldr q28, [x6] // _108,* ivtmp.39
ldr q27, [x6, 16] // _107,
ldr q26, [x1] // _106,* ivtmp.41
ldr q25, [x1, 16] // _105,
ldr q24, [x1, 32] // _104,
ldr q23, [x1, 48] // _103,
ldr q22, [x2] // _102,* ivtmp.45
ldr q18, [x2, 16] // _101,
ldr q17, [x2, 32] // _100,
ldr q16, [x2, 48] // _99,
ldr q7, [x2, 64] // _98,
ldr q6, [x2, 80] // _97,
ldr q5, [x2, 96] // _96,
ldr q4, [x2, 112] // _95,
ldr q3, [x2, 128] // _94,
ldr q2, [x2, 144] // _93,
fmulx v28.2d, v28.2d, v21.2d // tmp221, _108, tmp222
fmulx v27.2d, v27.2d, v21.2d // tmp224, _107, tmp222
fmulx v26.2d, v26.2d, v19.2d // tmp228, tmp226, tmp229
fmulx v25.2d, v25.2d, v19.2d // tmp233, tmp231, tmp229
fmulx v24.2d, v24.2d, v19.2d // tmp238, tmp236, tmp229
fmulx v23.2d, v23.2d, v19.2d // tmp243, tmp241, tmp229
fmulx v22.2d, v22.2d, v0.2d // tmp247, tmp245, tmp248
fmulx v18.2d, v18.2d, v0.2d // tmp252, tmp250, tmp248
fmulx v17.2d, v17.2d, v0.2d // tmp257, tmp255, tmp248
fmulx v16.2d, v16.2d, v0.2d // tmp262, tmp260, tmp248
fmulx v7.2d, v7.2d, v0.2d // tmp267, tmp265, tmp248
fmulx v6.2d, v6.2d, v0.2d // tmp272, tmp270, tmp248
fmulx v5.2d, v5.2d, v0.2d // tmp277, tmp275, tmp248
fmulx v4.2d, v4.2d, v0.2d // tmp282, tmp280, tmp248
fmulx v3.2d, v3.2d, v0.2d // tmp287, tmp285, tmp248
fmulx v2.2d, v2.2d, v0.2d // tmp292, tmp290, tmp248
fadd v26.2d, v26.2d, v20.2d // tmp226, _106, tmp227
fadd v25.2d, v25.2d, v20.2d // tmp231, _105, tmp227
fadd v24.2d, v24.2d, v20.2d // tmp236, _104, tmp227
fadd v23.2d, v23.2d, v20.2d // tmp241, _103, tmp227
fadd v22.2d, v22.2d, v1.2d // tmp245, _102, tmp246
fadd v18.2d, v18.2d, v1.2d // tmp250, _101, tmp246
fadd v17.2d, v17.2d, v1.2d // tmp255, _100, tmp246
fadd v16.2d, v16.2d, v1.2d // tmp260, _99, tmp246
fadd v7.2d, v7.2d, v1.2d // tmp265, _98, tmp246
fadd v6.2d, v6.2d, v1.2d // tmp270, _97, tmp246
fadd v5.2d, v5.2d, v1.2d // tmp275, _96, tmp246
fadd v4.2d, v4.2d, v1.2d // tmp280, _95, tmp246
fadd v3.2d, v3.2d, v1.2d // tmp285, _94, tmp246
fadd v2.2d, v2.2d, v1.2d // tmp290, _93, tmp246
str q28, [x3] // tmp221,* ivtmp.55
str q27, [x3, 16] // tmp224,
str q26, [x4] // tmp228,* ivtmp.57
str q25, [x4, 16] // tmp233,
str q24, [x4, 32] // tmp238,
str q23, [x4, 48] // tmp243,
str q22, [x5] // tmp247,* ivtmp.61
str q18, [x5, 16] // tmp252,
str q17, [x5, 32] // tmp257,
str q16, [x5, 48] // tmp262,
str q7, [x5, 64] // tmp267,
str q6, [x5, 80] // tmp272,
str q5, [x5, 96] // tmp277,
str q4, [x5, 112] // tmp282,
str q3, [x5, 128] // tmp287,
str q2, [x5, 144] // tmp292,
add x6, x6, 32 // ivtmp.39, ivtmp.39,
add x1, x1, 64 // ivtmp.41, ivtmp.41,
add x2, x2, 160 // ivtmp.45, ivtmp.45,
add x3, x3, 32 // ivtmp.55, ivtmp.55,
add x4, x4, 64 // ivtmp.57, ivtmp.57,
add x5, x5, 160 // ivtmp.61, ivtmp.61,
cmp x6, x0 // ivtmp.39, _214
bne .L2 //,
// dequantize.c:475: }
ret
.cfi_endproc
.LFE3836:
.size retinaface_dequantize_v3, .-retinaface_dequantize_v3
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC0:
.word 0
.word 1064304640
.word 0
.word 1064304640
.LC1:
.word 0
.word -1067417600
.word 0
.word -1067417600
.LC2:
.word 536870912
.word 1068027667
.word 536870912
.word 1068027667
.LC3:
.word 0
.word -1067515904
.word 0
.word -1067515904
.LC4:
.word 3758096384
.word 1069039660
.word 3758096384
.word 1069039660
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",@progbits
C code: C 代码:
void retinaface_dequantize_v0(uint8_t *in_cls, uint8_t *in_bbox, uint8_t *in_ldm, double *out_cls, double *out_bbox, double *out_ldm, uint64_t length)
{
double const dequan_cls = 0.00390625;
double const dequan_bbox = 0.048454854637384415;
double const dequan_ldm = 0.0947292372584343;
const uint8_t bbox_minus = 132;
const uint8_t ldm_minus = 124;
for (int64_t i = 16799;i>=0;i--)
{
//cls
out_cls[i*2] = dequan_cls * (uint8_t)in_cls[i*2];
out_cls[i*2+1] = dequan_cls * (uint8_t)in_cls[i*2+1];
//bbox
out_bbox[i*4] = dequan_bbox * ((uint8_t)in_bbox[i*4] - bbox_minus);
out_bbox[i*4+1] = dequan_bbox * ((uint8_t)in_bbox[i*4+1] - bbox_minus);
out_bbox[i*4+2] = dequan_bbox * ((uint8_t)in_bbox[i*4+2] - bbox_minus);
out_bbox[i*4+3] = dequan_bbox * ((uint8_t)in_bbox[i*4+3] - bbox_minus);
//ldm
out_ldm[i*10] = dequan_ldm * ((uint8_t)in_ldm[i*10] - ldm_minus);
out_ldm[i*10+1] = dequan_ldm * ((uint8_t)in_ldm[i*10+1] - ldm_minus);
out_ldm[i*10+2] = dequan_ldm * ((uint8_t)in_ldm[i*10+2] - ldm_minus);
out_ldm[i*10+3] = dequan_ldm * ((uint8_t)in_ldm[i*10+3] - ldm_minus);
out_ldm[i*10+4] = dequan_ldm * ((uint8_t)in_ldm[i*10+4] - ldm_minus);
out_ldm[i*10+5] = dequan_ldm * ((uint8_t)in_ldm[i*10+5] - ldm_minus);
out_ldm[i*10+6] = dequan_ldm * ((uint8_t)in_ldm[i*10+6] - ldm_minus);
out_ldm[i*10+7] = dequan_ldm * ((uint8_t)in_ldm[i*10+7] - ldm_minus);
out_ldm[i*10+8] = dequan_ldm * ((uint8_t)in_ldm[i*10+8] - ldm_minus);
out_ldm[i*10+9] = dequan_ldm * ((uint8_t)in_ldm[i*10+9] - ldm_minus)
}
}
The code looks like code that a good compiler will vectorize completely.该代码看起来像一个好的编译器将完全矢量化的代码。 Disassemble the code to see what the compiler has generated.
反汇编代码以查看编译器生成了什么。
If it has not, use NEON compiler intrinsics to generate fully vectorized code, instead of assembler.如果没有,请使用 NEON 编译器内在函数来生成完全矢量化的代码,而不是汇编程序。 The advantage of using compiler intrinsics is that the compiler can reorder ARM, and ARM NEON instructions in order to prevent pipeline stalls, and maximize concurrent execution.
使用编译器内在函数的优点是编译器可以重新排序 ARM 和 ARM NEON 指令,以防止流水线停顿,并最大化并发执行。 And it does a very good job.
它做得很好。
The most obvious problem with your code: you need to interleave arithmetic operations and ARM assembler with vld1/vst1 load/store instructions in order to prevent the pipeline from stalling while waiting for consecutive loads/stores to complete.您的代码最明显的问题:您需要将算术运算和 ARM 汇编器与 vld1/vst1 加载/存储指令交错,以防止管道在等待连续加载/存储完成时停止。 The ARM instructions at the head of the loop need to be pushed down between the initial load instructions as much as possible, where they will execute in parallel, and the NEON math instructions need to be scattered between the initial load instructions so that they execute concurrently with stalled memory reads (and writes).
循环头部的 ARM 指令需要在初始加载指令之间尽可能下推,并行执行,NEON 数学指令需要分散在初始加载指令之间,以便它们同时执行停止 memory 读取(和写入)。
Compilers LOVE doing that sort of instruction scheduling, and do a better job than even the most expert of assembler coders can do by hand.编译器喜欢做这种指令调度,并且比最专业的汇编编码器手工做的还要好。 An expert would start with the compiler-generated code, and use the (very expensive and difficult to use) ARM profiling tools to look for places where the compiler has mis-scheduled operations.
专家将从编译器生成的代码开始,并使用(非常昂贵且难以使用)ARM 分析工具来查找编译器错误调度操作的位置。 And would be happy to get a 5 or 10% improvement over well-written C/C++ code.
并且很高兴能比编写良好的 C/C++ 代码提高 5% 或 10%。
An added bonus when using intrinsics, is that you can tell the compiler which specific ARM processor you're using, and the compiler will optimize scheduling for that particular processor.使用内在函数时的另一个好处是,您可以告诉编译器您正在使用哪个特定的 ARM 处理器,并且编译器将为该特定处理器优化调度。 So you won't end up rewriting the code from scratch when you migrate from an a52 to an a76, or to an Arm9 processor.
因此,当您从 a52 迁移到 a76 或 Arm9 处理器时,您最终不会从头开始重写代码。
fwiw, all the vld1 data reads will be cache misses. fwiw,所有 vld1 数据读取都将是缓存未命中。 Reasonably optimized code should be spending 99% of its time waiting for the memory reads, with pretty much all of the rest of the code executing concurrently while waiting for the reads to complete.
合理优化的代码应该花费 99% 的时间等待 memory 读取,几乎所有代码的 rest 在等待读取完成时同时执行。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.