[英]Why ARM NEON code slower than native C code
I'm implementing a dequantize operation in ARM NEON (ARMv8-A architecture).我正在 ARM NEON(ARM8-A 架构)中实现去量化操作。 But I ran into a strange result: the ARM NEON version (11 ms) is slower than the C version (4.75 ms).
但我遇到了一个奇怪的问题,即 ARM NEON 版本(11 毫秒)比 C 版本(4.75 毫秒)慢。
Here is our code,这是我们的代码,
NEON ASM code: NEON ASM 代码:
.arch armv8-a+crc
.file "dequantize.c"
.text
.align 2
.global retinaface_dequantize_v3
.type retinaface_dequantize_v3, %function
retinaface_dequantize_v3:
.LFB3836:
.cfi_startproc
mov x6, x0 // ivtmp.39, in_cls
add x0, x0, 266240 // _214, in_cls,
add x0, x0, 2560 // _214, _214,
adrp x7, .LC0 // tmp294,
ldr q21, [x7, #:lo12:.LC0] // tmp222,
adrp x7, .LC1 // tmp295,
ldr q20, [x7, #:lo12:.LC1] // tmp227,
adrp x7, .LC2 // tmp296,
ldr q19, [x7, #:lo12:.LC2] // tmp229,
adrp x7, .LC3 // tmp297,
ldr q1, [x7, #:lo12:.LC3] // tmp246,
adrp x7, .LC4 // tmp298,
ldr q0, [x7, #:lo12:.LC4] // tmp248,
.L2:
ldr q28, [x6] // _108,* ivtmp.39
ldr q27, [x6, 16] // _107,
ldr q26, [x1] // _106,* ivtmp.41
ldr q25, [x1, 16] // _105,
ldr q24, [x1, 32] // _104,
ldr q23, [x1, 48] // _103,
ldr q22, [x2] // _102,* ivtmp.45
ldr q18, [x2, 16] // _101,
ldr q17, [x2, 32] // _100,
ldr q16, [x2, 48] // _99,
ldr q7, [x2, 64] // _98,
ldr q6, [x2, 80] // _97,
ldr q5, [x2, 96] // _96,
ldr q4, [x2, 112] // _95,
ldr q3, [x2, 128] // _94,
ldr q2, [x2, 144] // _93,
fmulx v28.2d, v28.2d, v21.2d // tmp221, _108, tmp222
fmulx v27.2d, v27.2d, v21.2d // tmp224, _107, tmp222
fmulx v26.2d, v26.2d, v19.2d // tmp228, tmp226, tmp229
fmulx v25.2d, v25.2d, v19.2d // tmp233, tmp231, tmp229
fmulx v24.2d, v24.2d, v19.2d // tmp238, tmp236, tmp229
fmulx v23.2d, v23.2d, v19.2d // tmp243, tmp241, tmp229
fmulx v22.2d, v22.2d, v0.2d // tmp247, tmp245, tmp248
fmulx v18.2d, v18.2d, v0.2d // tmp252, tmp250, tmp248
fmulx v17.2d, v17.2d, v0.2d // tmp257, tmp255, tmp248
fmulx v16.2d, v16.2d, v0.2d // tmp262, tmp260, tmp248
fmulx v7.2d, v7.2d, v0.2d // tmp267, tmp265, tmp248
fmulx v6.2d, v6.2d, v0.2d // tmp272, tmp270, tmp248
fmulx v5.2d, v5.2d, v0.2d // tmp277, tmp275, tmp248
fmulx v4.2d, v4.2d, v0.2d // tmp282, tmp280, tmp248
fmulx v3.2d, v3.2d, v0.2d // tmp287, tmp285, tmp248
fmulx v2.2d, v2.2d, v0.2d // tmp292, tmp290, tmp248
fadd v26.2d, v26.2d, v20.2d // tmp226, _106, tmp227
fadd v25.2d, v25.2d, v20.2d // tmp231, _105, tmp227
fadd v24.2d, v24.2d, v20.2d // tmp236, _104, tmp227
fadd v23.2d, v23.2d, v20.2d // tmp241, _103, tmp227
fadd v22.2d, v22.2d, v1.2d // tmp245, _102, tmp246
fadd v18.2d, v18.2d, v1.2d // tmp250, _101, tmp246
fadd v17.2d, v17.2d, v1.2d // tmp255, _100, tmp246
fadd v16.2d, v16.2d, v1.2d // tmp260, _99, tmp246
fadd v7.2d, v7.2d, v1.2d // tmp265, _98, tmp246
fadd v6.2d, v6.2d, v1.2d // tmp270, _97, tmp246
fadd v5.2d, v5.2d, v1.2d // tmp275, _96, tmp246
fadd v4.2d, v4.2d, v1.2d // tmp280, _95, tmp246
fadd v3.2d, v3.2d, v1.2d // tmp285, _94, tmp246
fadd v2.2d, v2.2d, v1.2d // tmp290, _93, tmp246
str q28, [x3] // tmp221,* ivtmp.55
str q27, [x3, 16] // tmp224,
str q26, [x4] // tmp228,* ivtmp.57
str q25, [x4, 16] // tmp233,
str q24, [x4, 32] // tmp238,
str q23, [x4, 48] // tmp243,
str q22, [x5] // tmp247,* ivtmp.61
str q18, [x5, 16] // tmp252,
str q17, [x5, 32] // tmp257,
str q16, [x5, 48] // tmp262,
str q7, [x5, 64] // tmp267,
str q6, [x5, 80] // tmp272,
str q5, [x5, 96] // tmp277,
str q4, [x5, 112] // tmp282,
str q3, [x5, 128] // tmp287,
str q2, [x5, 144] // tmp292,
add x6, x6, 32 // ivtmp.39, ivtmp.39,
add x1, x1, 64 // ivtmp.41, ivtmp.41,
add x2, x2, 160 // ivtmp.45, ivtmp.45,
add x3, x3, 32 // ivtmp.55, ivtmp.55,
add x4, x4, 64 // ivtmp.57, ivtmp.57,
add x5, x5, 160 // ivtmp.61, ivtmp.61,
cmp x6, x0 // ivtmp.39, _214
bne .L2 //,
// dequantize.c:475: }
ret
.cfi_endproc
.LFE3836:
.size retinaface_dequantize_v3, .-retinaface_dequantize_v3
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC0:
.word 0
.word 1064304640
.word 0
.word 1064304640
.LC1:
.word 0
.word -1067417600
.word 0
.word -1067417600
.LC2:
.word 536870912
.word 1068027667
.word 536870912
.word 1068027667
.LC3:
.word 0
.word -1067515904
.word 0
.word -1067515904
.LC4:
.word 3758096384
.word 1069039660
.word 3758096384
.word 1069039660
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",@progbits
C code: C 代码:
void retinaface_dequantize_v0(uint8_t *in_cls, uint8_t *in_bbox, uint8_t *in_ldm, double *out_cls, double *out_bbox, double *out_ldm, uint64_t length)
{
double const dequan_cls = 0.00390625;
double const dequan_bbox = 0.048454854637384415;
double const dequan_ldm = 0.0947292372584343;
const uint8_t bbox_minus = 132;
const uint8_t ldm_minus = 124;
for (int64_t i = 16799;i>=0;i--)
{
//cls
out_cls[i*2] = dequan_cls * (uint8_t)in_cls[i*2];
out_cls[i*2+1] = dequan_cls * (uint8_t)in_cls[i*2+1];
//bbox
out_bbox[i*4] = dequan_bbox * ((uint8_t)in_bbox[i*4] - bbox_minus);
out_bbox[i*4+1] = dequan_bbox * ((uint8_t)in_bbox[i*4+1] - bbox_minus);
out_bbox[i*4+2] = dequan_bbox * ((uint8_t)in_bbox[i*4+2] - bbox_minus);
out_bbox[i*4+3] = dequan_bbox * ((uint8_t)in_bbox[i*4+3] - bbox_minus);
//ldm
out_ldm[i*10] = dequan_ldm * ((uint8_t)in_ldm[i*10] - ldm_minus);
out_ldm[i*10+1] = dequan_ldm * ((uint8_t)in_ldm[i*10+1] - ldm_minus);
out_ldm[i*10+2] = dequan_ldm * ((uint8_t)in_ldm[i*10+2] - ldm_minus);
out_ldm[i*10+3] = dequan_ldm * ((uint8_t)in_ldm[i*10+3] - ldm_minus);
out_ldm[i*10+4] = dequan_ldm * ((uint8_t)in_ldm[i*10+4] - ldm_minus);
out_ldm[i*10+5] = dequan_ldm * ((uint8_t)in_ldm[i*10+5] - ldm_minus);
out_ldm[i*10+6] = dequan_ldm * ((uint8_t)in_ldm[i*10+6] - ldm_minus);
out_ldm[i*10+7] = dequan_ldm * ((uint8_t)in_ldm[i*10+7] - ldm_minus);
out_ldm[i*10+8] = dequan_ldm * ((uint8_t)in_ldm[i*10+8] - ldm_minus);
out_ldm[i*10+9] = dequan_ldm * ((uint8_t)in_ldm[i*10+9] - ldm_minus)
}
}
The code looks like code that a good compiler will vectorize completely.该代码看起来像一个好的编译器将完全矢量化的代码。 Disassemble the code to see what the compiler has generated.
反汇编代码以查看编译器生成了什么。
If it has not, use NEON compiler intrinsics to generate fully vectorized code, instead of assembler.如果没有,请使用 NEON 编译器内在函数来生成完全矢量化的代码,而不是汇编程序。 The advantage of using compiler intrinsics is that the compiler can reorder ARM, and ARM NEON instructions in order to prevent pipeline stalls, and maximize concurrent execution.
使用编译器内在函数的优点是编译器可以重新排序 ARM 和 ARM NEON 指令,以防止流水线停顿,并最大化并发执行。 And it does a very good job.
它做得很好。
The most obvious problem with your code: you need to interleave arithmetic operations and ARM assembler with vld1/vst1 load/store instructions in order to prevent the pipeline from stalling while waiting for consecutive loads/stores to complete.您的代码最明显的问题:您需要将算术运算和 ARM 汇编器与 vld1/vst1 加载/存储指令交错,以防止管道在等待连续加载/存储完成时停止。 The ARM instructions at the head of the loop need to be pushed down between the initial load instructions as much as possible, where they will execute in parallel, and the NEON math instructions need to be scattered between the initial load instructions so that they execute concurrently with stalled memory reads (and writes).
循环头部的 ARM 指令需要在初始加载指令之间尽可能下推,并行执行,NEON 数学指令需要分散在初始加载指令之间,以便它们同时执行停止 memory 读取(和写入)。
Compilers LOVE doing that sort of instruction scheduling, and do a better job than even the most expert of assembler coders can do by hand.编译器喜欢做这种指令调度,并且比最专业的汇编编码器手工做的还要好。 An expert would start with the compiler-generated code, and use the (very expensive and difficult to use) ARM profiling tools to look for places where the compiler has mis-scheduled operations.
专家将从编译器生成的代码开始,并使用(非常昂贵且难以使用)ARM 分析工具来查找编译器错误调度操作的位置。 And would be happy to get a 5 or 10% improvement over well-written C/C++ code.
并且很高兴能比编写良好的 C/C++ 代码提高 5% 或 10%。
An added bonus when using intrinsics, is that you can tell the compiler which specific ARM processor you're using, and the compiler will optimize scheduling for that particular processor.使用内在函数时的另一个好处是,您可以告诉编译器您正在使用哪个特定的 ARM 处理器,并且编译器将为该特定处理器优化调度。 So you won't end up rewriting the code from scratch when you migrate from an a52 to an a76, or to an Arm9 processor.
因此,当您从 a52 迁移到 a76 或 Arm9 处理器时,您最终不会从头开始重写代码。
fwiw, all the vld1 data reads will be cache misses. fwiw,所有 vld1 数据读取都将是缓存未命中。 Reasonably optimized code should be spending 99% of its time waiting for the memory reads, with pretty much all of the rest of the code executing concurrently while waiting for the reads to complete.
合理优化的代码应该花费 99% 的时间等待 memory 读取,几乎所有代码的 rest 在等待读取完成时同时执行。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.