
Why is ARM NEON code slower than native C code?

I'm implementing a dequantize operation in ARM NEON (Armv8-A architecture), but I ran into something strange: the ARM NEON version (11 ms) is slower than the C version (4.75 ms).

Here is our code:

NEON ASM code:

.arch armv8-a+crc
    .file   "dequantize.c"
    .text
    .align  2
    .global retinaface_dequantize_v3
    .type   retinaface_dequantize_v3, %function
retinaface_dequantize_v3:
.LFB3836:
    .cfi_startproc
    mov x6, x0  // ivtmp.39, in_cls
    add x0, x0, 266240  // _214, in_cls,
    add x0, x0, 2560    // _214, _214,

    adrp    x7, .LC0    // tmp294,
    ldr q21, [x7, #:lo12:.LC0]  // tmp222,

    adrp    x7, .LC1    // tmp295,
    ldr q20, [x7, #:lo12:.LC1]  // tmp227,

    adrp    x7, .LC2    // tmp296,
    ldr q19, [x7, #:lo12:.LC2]  // tmp229,

    adrp    x7, .LC3    // tmp297,
    ldr q1, [x7, #:lo12:.LC3]   // tmp246,

    adrp    x7, .LC4    // tmp298,
    ldr q0, [x7, #:lo12:.LC4]   // tmp248,
.L2:
    ldr q28, [x6]   // _108,* ivtmp.39
    ldr q27, [x6, 16]   // _107,
    ldr q26, [x1]   // _106,* ivtmp.41
    ldr q25, [x1, 16]   // _105,
    ldr q24, [x1, 32]   // _104,
    ldr q23, [x1, 48]   // _103,
    ldr q22, [x2]   // _102,* ivtmp.45
    ldr q18, [x2, 16]   // _101,
    ldr q17, [x2, 32]   // _100,
    ldr q16, [x2, 48]   // _99,
    ldr q7, [x2, 64]    // _98,
    ldr q6, [x2, 80]    // _97,
    ldr q5, [x2, 96]    // _96,
    ldr q4, [x2, 112]   // _95,
    ldr q3, [x2, 128]   // _94,
    ldr q2, [x2, 144]   // _93,

    fmulx   v28.2d, v28.2d, v21.2d  // tmp221, _108, tmp222
    fmulx   v27.2d, v27.2d, v21.2d  // tmp224, _107, tmp222
    fmulx   v26.2d, v26.2d, v19.2d  // tmp228, tmp226, tmp229
    fmulx   v25.2d, v25.2d, v19.2d  // tmp233, tmp231, tmp229
    fmulx   v24.2d, v24.2d, v19.2d  // tmp238, tmp236, tmp229
    fmulx   v23.2d, v23.2d, v19.2d  // tmp243, tmp241, tmp229
    fmulx   v22.2d, v22.2d, v0.2d   // tmp247, tmp245, tmp248
    fmulx   v18.2d, v18.2d, v0.2d   // tmp252, tmp250, tmp248
    fmulx   v17.2d, v17.2d, v0.2d   // tmp257, tmp255, tmp248
    fmulx   v16.2d, v16.2d, v0.2d   // tmp262, tmp260, tmp248
    fmulx   v7.2d, v7.2d, v0.2d // tmp267, tmp265, tmp248
    fmulx   v6.2d, v6.2d, v0.2d // tmp272, tmp270, tmp248
    fmulx   v5.2d, v5.2d, v0.2d // tmp277, tmp275, tmp248
    fmulx   v4.2d, v4.2d, v0.2d // tmp282, tmp280, tmp248
    fmulx   v3.2d, v3.2d, v0.2d // tmp287, tmp285, tmp248
    fmulx   v2.2d, v2.2d, v0.2d // tmp292, tmp290, tmp248

    fadd    v26.2d, v26.2d, v20.2d  // tmp226, _106, tmp227
    fadd    v25.2d, v25.2d, v20.2d  // tmp231, _105, tmp227
    fadd    v24.2d, v24.2d, v20.2d  // tmp236, _104, tmp227
    fadd    v23.2d, v23.2d, v20.2d  // tmp241, _103, tmp227
    fadd    v22.2d, v22.2d, v1.2d   // tmp245, _102, tmp246
    fadd    v18.2d, v18.2d, v1.2d   // tmp250, _101, tmp246
    fadd    v17.2d, v17.2d, v1.2d   // tmp255, _100, tmp246
    fadd    v16.2d, v16.2d, v1.2d   // tmp260, _99, tmp246
    fadd    v7.2d, v7.2d, v1.2d // tmp265, _98, tmp246
    fadd    v6.2d, v6.2d, v1.2d // tmp270, _97, tmp246
    fadd    v5.2d, v5.2d, v1.2d // tmp275, _96, tmp246
    fadd    v4.2d, v4.2d, v1.2d // tmp280, _95, tmp246
    fadd    v3.2d, v3.2d, v1.2d // tmp285, _94, tmp246
    fadd    v2.2d, v2.2d, v1.2d // tmp290, _93, tmp246

    str q28, [x3]   // tmp221,* ivtmp.55
    str q27, [x3, 16]   // tmp224,
    str q26, [x4]   // tmp228,* ivtmp.57
    str q25, [x4, 16]   // tmp233,
    str q24, [x4, 32]   // tmp238,
    str q23, [x4, 48]   // tmp243,
    str q22, [x5]   // tmp247,* ivtmp.61
    str q18, [x5, 16]   // tmp252,
    str q17, [x5, 32]   // tmp257,
    str q16, [x5, 48]   // tmp262,
    str q7, [x5, 64]    // tmp267,
    str q6, [x5, 80]    // tmp272,
    str q5, [x5, 96]    // tmp277,
    str q4, [x5, 112]   // tmp282,
    str q3, [x5, 128]   // tmp287,
    str q2, [x5, 144]   // tmp292,

    add x6, x6, 32  // ivtmp.39, ivtmp.39,
    add x1, x1, 64  // ivtmp.41, ivtmp.41,
    add x2, x2, 160 // ivtmp.45, ivtmp.45,
    add x3, x3, 32  // ivtmp.55, ivtmp.55,
    add x4, x4, 64  // ivtmp.57, ivtmp.57,
    add x5, x5, 160 // ivtmp.61, ivtmp.61,
    cmp x6, x0  // ivtmp.39, _214
    bne .L2     //,
// dequantize.c:475: }
    ret 
    .cfi_endproc
.LFE3836:
    .size   retinaface_dequantize_v3, .-retinaface_dequantize_v3
    .section    .rodata.cst16,"aM",@progbits,16
    .align  4
.LC0:
    .word   0
    .word   1064304640
    .word   0
    .word   1064304640
.LC1:
    .word   0
    .word   -1067417600
    .word   0
    .word   -1067417600
.LC2:
    .word   536870912
    .word   1068027667
    .word   536870912
    .word   1068027667
.LC3:
    .word   0
    .word   -1067515904
    .word   0
    .word   -1067515904
.LC4:
    .word   3758096384
    .word   1069039660
    .word   3758096384
    .word   1069039660
    .ident  "GCC: (Debian 8.3.0-6) 8.3.0"
    .section    .note.GNU-stack,"",@progbits

C code:

#include <stdint.h>

void retinaface_dequantize_v0(uint8_t *in_cls, uint8_t *in_bbox, uint8_t *in_ldm, double *out_cls, double *out_bbox, double *out_ldm, uint64_t length)
{

    double const dequan_cls     = 0.00390625;
    double const dequan_bbox    = 0.048454854637384415;
    double const dequan_ldm     = 0.0947292372584343;
    const uint8_t bbox_minus    = 132;
    const uint8_t ldm_minus     = 124;

    for (int64_t i = 16799;i>=0;i--)
    {
        //cls
        out_cls[i*2]    = dequan_cls * (uint8_t)in_cls[i*2];
        out_cls[i*2+1]  = dequan_cls * (uint8_t)in_cls[i*2+1];
        //bbox
        out_bbox[i*4]   = dequan_bbox * ((uint8_t)in_bbox[i*4] - bbox_minus);
        out_bbox[i*4+1] = dequan_bbox * ((uint8_t)in_bbox[i*4+1] - bbox_minus);
        out_bbox[i*4+2] = dequan_bbox * ((uint8_t)in_bbox[i*4+2] - bbox_minus);
        out_bbox[i*4+3] = dequan_bbox * ((uint8_t)in_bbox[i*4+3] - bbox_minus);
        //ldm
        out_ldm[i*10]       = dequan_ldm * ((uint8_t)in_ldm[i*10] - ldm_minus);
        out_ldm[i*10+1]     = dequan_ldm * ((uint8_t)in_ldm[i*10+1] - ldm_minus);
        out_ldm[i*10+2]     = dequan_ldm * ((uint8_t)in_ldm[i*10+2] - ldm_minus);
        out_ldm[i*10+3]     = dequan_ldm * ((uint8_t)in_ldm[i*10+3] - ldm_minus);
        out_ldm[i*10+4]     = dequan_ldm * ((uint8_t)in_ldm[i*10+4] - ldm_minus);
        out_ldm[i*10+5]     = dequan_ldm * ((uint8_t)in_ldm[i*10+5] - ldm_minus);
        out_ldm[i*10+6]     = dequan_ldm * ((uint8_t)in_ldm[i*10+6] - ldm_minus);
        out_ldm[i*10+7]     = dequan_ldm * ((uint8_t)in_ldm[i*10+7] - ldm_minus);
        out_ldm[i*10+8]     = dequan_ldm * ((uint8_t)in_ldm[i*10+8] - ldm_minus);
        out_ldm[i*10+9]     = dequan_ldm * ((uint8_t)in_ldm[i*10+9] - ldm_minus);

    }
}

The C code looks like something a good compiler will vectorize completely. Disassemble the compiled code to see what the compiler has generated.
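For example, with a GCC toolchain you can either emit the generated assembly directly or disassemble the object file (the file names here are just illustrative):

    gcc -O3 -S dequantize.c -o dequantize.s
    objdump -d dequantize.o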

If it has not, use NEON compiler intrinsics to generate fully vectorized code instead of hand-written assembler. The advantage of using compiler intrinsics is that the compiler can reorder the scalar ARM and NEON instructions to prevent pipeline stalls and maximize concurrent execution, and it does a very good job of that.
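As a rough sketch (not your original routine), the cls channel could be dequantized with AArch64 NEON intrinsics along these lines; the function name, the 8-bytes-per-iteration stride, and the assumption that the element count is a multiple of 8 are only illustrative:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch only: dequantize the cls channel, 8 input bytes per iteration.
   Assumes AArch64 (double-precision NEON) and count being a multiple of 8. */
static void dequantize_cls_neon(const uint8_t *in, double *out, size_t count)
{
    const float64x2_t scale = vdupq_n_f64(0.00390625);
    for (size_t i = 0; i < count; i += 8) {
        uint8x8_t  b  = vld1_u8(in + i);                /* 8 x u8            */
        uint16x8_t w  = vmovl_u8(b);                    /* widen to 8 x u16  */
        uint32x4_t lo = vmovl_u16(vget_low_u16(w));     /* 4 x u32 (low)     */
        uint32x4_t hi = vmovl_u16(vget_high_u16(w));    /* 4 x u32 (high)    */
        /* widen to u64, convert to f64, scale, and store two doubles at a time */
        vst1q_f64(out + i,     vmulq_f64(vcvtq_f64_u64(vmovl_u32(vget_low_u32(lo))),  scale));
        vst1q_f64(out + i + 2, vmulq_f64(vcvtq_f64_u64(vmovl_u32(vget_high_u32(lo))), scale));
        vst1q_f64(out + i + 4, vmulq_f64(vcvtq_f64_u64(vmovl_u32(vget_low_u32(hi))),  scale));
        vst1q_f64(out + i + 6, vmulq_f64(vcvtq_f64_u64(vmovl_u32(vget_high_u32(hi))), scale));
    }
}

Compiled with -O3, the compiler is then free to unroll this loop, interleave the loads with the conversions, and schedule the result for the target core.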

The most obvious problem with your code: you need to interleave the arithmetic operations and scalar ARM instructions with the vld1/vst1 load/store instructions to prevent the pipeline from stalling while it waits for consecutive loads/stores to complete. The ARM instructions at the head of the loop need to be pushed down between the initial load instructions as much as possible, where they will execute in parallel, and the NEON math instructions need to be scattered between the loads so that they execute concurrently with the stalled memory reads (and writes).

Compilers LOVE doing that sort of instruction scheduling, and do a better job than even the most expert assembler coders can do by hand. An expert would start with the compiler-generated code and use the (very expensive and difficult to use) ARM profiling tools to look for places where the compiler has mis-scheduled operations, and would be happy to get a 5 or 10% improvement over well-written C/C++ code.

An added bonus of using intrinsics is that you can tell the compiler which specific ARM processor you're targeting, and it will optimize the scheduling for that particular core. So you won't end up rewriting the code from scratch when you migrate from, say, a Cortex-A53 to a Cortex-A76, or to an Armv9 core.
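With GCC or Clang the target core is typically selected with the -mcpu option; the core name below is just an example and must be one your compiler version actually supports:

    gcc -O3 -mcpu=cortex-a72 -c dequantize.c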

FWIW, all of the vld1 data reads will be cache misses. Reasonably optimized code should spend 99% of its time waiting for those memory reads, with pretty much all of the rest of the code executing concurrently while the reads complete.
