
Accelerating matrix vector multiplication with ARM Neon Intrinsics on Raspberry Pi 4

I need to optimize a matrix-vector multiplication. The data looks like the following:

  • The vector has 81 columns
  • The matrix has 90,000 rows and 81 columns and is already transposed, so a row-wise dot product can be used
  • The output is hence a vector with 90,000 rows
  • Everything lies in 1D float arrays

This routine also has to meet some non-functional requirements:

  • As few standard library facilities as possible should be used (no std::vector, for example)
  • No third-party library should be used (so no Eigen or BLAS for me, either)
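
For reference, the plain scalar version ("the normal code" that I compare against below) looks roughly like this (a sketch; the real version differs only in details):

#include <cstdint>

// Scalar baseline: one row-wise dot product per output element.
// input  : 90,000 x 81 matrix (row-major)
// kernel : 81-element vector
// output : 90,000-element result
void mat_vec_mult_scalar(float *output, const float *input, const float *kernel,
                         uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        float sum = 0.0f;
        for (uint32_t x = 0; x < input_width; x++)
            sum += input[y * input_width + x] * kernel[x];
        output[y] = sum;
    }
}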

This is my code (simplified; for the sake of readability I assume the input is perfectly blocked):

// input_height = 90000
// input_width = 81

for (uint32_t y = 0; y < input_height; y += 4) {
    float32x4_t sum0 = vmovq_n_f32(0);
    float32x4_t sum1 = vmovq_n_f32(0);
    float32x4_t sum2 = vmovq_n_f32(0);
    float32x4_t sum3 = vmovq_n_f32(0);

    for (uint32_t x = 0; x < input_width; x += 16) {
        float32x4x4_t A = load_matrix_transpose(kernel + x);

        float32x4x4_t B0 = load_matrix_transpose(input + y * input_width + x);
        float32x4x4_t B1 = load_matrix_transpose(input + (y + 1) * input_width + x);
        float32x4x4_t B2 = load_matrix_transpose(input + (y + 2) * input_width + x);
        float32x4x4_t B3 = load_matrix_transpose(input + (y + 3) * input_width + x);

        matrix_element_wise_multiplication(A, B0, sum0);
        matrix_element_wise_multiplication(A, B1, sum1);
        matrix_element_wise_multiplication(A, B2, sum2);
        matrix_element_wise_multiplication(A, B3, sum3);
    }

    output[y] = vaddvq_f32(sum0);
    output[y + 1] = vaddvq_f32(sum1);
    output[y + 2] = vaddvq_f32(sum2);
    output[y + 3] = vaddvq_f32(sum3);
}

Here load_matrix_transpose and matrix_element_wise_multiplication are the following functions:

inline float32x4x4_t load_matrix_transpose(float *a) {
    float32x4x4_t ret;
    ret.val[0] = simd_load(a);
    ret.val[1] = simd_load(a + 4);
    ret.val[2] = simd_load(a + 8);
    ret.val[3] = simd_load(a + 12);
    return ret;
}

inline void matrix_element_wise_multiplication(float32x4x4_t & A, float32x4x4_t & B, float32x4_t & C) {
    C = vmlaq_f32(C, A.val[0], B.val[0]);
    C = vmlaq_f32(C, A.val[1], B.val[1]);
    C = vmlaq_f32(C, A.val[2], B.val[2]);
    C = vmlaq_f32(C, A.val[3], B.val[3]);
}
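
(simd_load is not shown here; it is essentially a thin wrapper around the unaligned NEON load, something like:)

// Assumed definition of simd_load (not part of the original post): a plain unaligned NEON load.
inline float32x4_t simd_load(const float *a) {
    return vld1q_f32(a);
}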

On my Raspberry Pi 4 (ARMv8, 8GB RAM, 4 cores) the code takes about 60ms at optimization level -O3.

Over a long run (many loops), the Neon register version is exactly twice as fast as the normal code.

My question is: is there any way to optimize the code further? I have tried many things but cannot make any improvement over the normal code.

Data locality is the highest priority when it comes to optimizations, and you should be aware of the register capacity, since registers are BY FAR the fastest and most scarce resource.

aarch64: 32 x 128-bit Neon registers (512 bytes)
aarch32: 16 x 128-bit Neon registers (256 bytes)

An 81x90000 matrix, when transposed, requires holding 90,000 intermediate values to do the multiplication, and since 360,000 bytes don't fit into a register bank of 512 bytes, there will be TONS of memory swapping, which translates into HUGE performance hits.
On the other hand, the 4*81 bytes of the vector fit nicely into those 512 bytes.
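
To put rough numbers on that: the 81 floats of the vector are 324 bytes, which occupy 21 of the 32 q-registers (20 full registers plus one lane of a 21st). That still leaves enough registers for the per-row matrix loads and the accumulator, which is exactly how the routine below allocates them.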

void matVecMult81x90000(float *pDst, float *pMat, float *pVec)
{
    register float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    register float32x4_t mat0, mat1, mat2, mat3, mat4, rslt;
    register float32x2_t drslt;
    register uint32_t nRows = 90000;

    vec80 = vdupq_n_f32(0.0f);
    mat4 = vdupq_n_f32(0.0f);
    vec0_3 = vld1q_f32(pVec); pVec += 4;
    vec4_7 = vld1q_f32(pVec); pVec += 4;
    vec8_11 = vld1q_f32(pVec); pVec += 4;
    vec12_15 = vld1q_f32(pVec); pVec += 4;
    vec16_19 = vld1q_f32(pVec); pVec += 4;
    vec20_23 = vld1q_f32(pVec); pVec += 4;
    vec24_27 = vld1q_f32(pVec); pVec += 4;
    vec28_31 = vld1q_f32(pVec); pVec += 4;
    vec32_35 = vld1q_f32(pVec); pVec += 4;
    vec36_39 = vld1q_f32(pVec); pVec += 4;
    vec40_43 = vld1q_f32(pVec); pVec += 4;
    vec44_47 = vld1q_f32(pVec); pVec += 4;
    vec48_51 = vld1q_f32(pVec); pVec += 4;
    vec52_55 = vld1q_f32(pVec); pVec += 4;
    vec56_59 = vld1q_f32(pVec); pVec += 4;
    vec60_63 = vld1q_f32(pVec); pVec += 4;
    vec64_67 = vld1q_f32(pVec); pVec += 4;
    vec68_71 = vld1q_f32(pVec); pVec += 4;
    vec72_75 = vld1q_f32(pVec); pVec += 4;
    vec76_79 = vld1q_f32(pVec); pVec += 4;
    vec80 = vld1q_lane_f32(pVec, vec80, 0);

    do {
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt = vmulq_f32(mat0, vec0_3);
        rslt += vmulq_f32(mat1, vec4_7);
        rslt += vmulq_f32(mat2, vec8_11);
        rslt += vmulq_f32(mat3, vec12_15);

        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec16_19);
        rslt += vmulq_f32(mat1, vec20_23);
        rslt += vmulq_f32(mat2, vec24_27);
        rslt += vmulq_f32(mat3, vec28_31);

        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec32_35);
        rslt += vmulq_f32(mat1, vec36_39);
        rslt += vmulq_f32(mat2, vec40_43);
        rslt += vmulq_f32(mat3, vec44_47);

        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec48_51);
        rslt += vmulq_f32(mat1, vec52_55);
        rslt += vmulq_f32(mat2, vec56_59);
        rslt += vmulq_f32(mat3, vec60_63);

        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        mat4 = vld1q_lane_f32(pMat, mat4, 0); pMat += 1;
        rslt += vmulq_f32(mat0, vec64_67);
        rslt += vmulq_f32(mat1, vec68_71);
        rslt += vmulq_f32(mat2, vec72_75);
        rslt += vmulq_f32(mat3, vec76_79);
        rslt += vmulq_f32(mat4, vec80);

        *pDst++ = vaddvq_f32(rslt);
    } while (--nRows);
}

Unfortunately, compilers don't play along nicely (both GCC and Clang).
The generated code shows some stack spilling of the vector inside the loop. Below is the same function in hand-written assembly, without any stack spilling:

    .arch   armv8-a
    .global     matVecMult81x90000_asm
    .text

.balign 64
.func
matVecMult81x90000_asm:
// init loop counter
    mov     w3, #90000 & 0xffff
    movk    w3, #90000>>16, lsl #16

// preserve registers
    stp     d8, d9, [sp, #-48]!
    stp     d10, d11, [sp, #1*16]
    stp     d12, d13, [sp, #2*16]

// load vectors
    ldp     q0, q1, [x2, #0*32]
    ldp     q2, q3, [x2, #1*32]
    ldp     q4, q5, [x2, #2*32]
    ldp     q6, q7, [x2, #3*32]
    ldp     q8, q9, [x2, #4*32]
    ldp     q10, q11, [x2, #5*32]
    ldp     q12, q13, [x2, #6*32]
    ldp     q16, q17, [x2, #7*32]
    ldp     q18, q19, [x2, #8*32]
    ldp     q20, q21, [x2, #9*32]
    ldr     s22, [x2, #10*32]

// loop
.balign 64
1:
    ldp     q24, q25, [x1, #0*32]
    ldp     q26, q27, [x1, #1*32]
    ldp     q28, q29, [x1, #2*32]
    ldp     q30, q31, [x1, #3*32]
    subs    w3, w3, #1

    fmul    v23.4s, v24.4s, v0.4s
    fmla    v23.4s, v25.4s, v1.4s
    fmla    v23.4s, v26.4s, v2.4s
    fmla    v23.4s, v27.4s, v3.4s
    fmla    v23.4s, v28.4s, v4.4s
    fmla    v23.4s, v29.4s, v5.4s
    fmla    v23.4s, v30.4s, v6.4s
    fmla    v23.4s, v31.4s, v7.4s

    ldp     q24, q25, [x1, #4*32]
    ldp     q26, q27, [x1, #5*32]
    ldp     q28, q29, [x1, #6*32]
    ldp     q30, q31, [x1, #7*32]

    fmla    v23.4s, v24.4s, v8.4s
    fmla    v23.4s, v25.4s, v9.4s
    fmla    v23.4s, v26.4s, v10.4s
    fmla    v23.4s, v27.4s, v11.4s
    fmla    v23.4s, v28.4s, v12.4s
    fmla    v23.4s, v29.4s, v13.4s
    fmla    v23.4s, v30.4s, v16.4s
    fmla    v23.4s, v31.4s, v17.4s

    ldp     q24, q25, [x1, #8*32]
    ldp     q26, q27, [x1, #9*32]
    ldr     s28, [x1, #10*32]

    fmla    v23.4s, v24.4s, v18.4s
    fmla    v23.4s, v25.4s, v19.4s
    fmla    v23.4s, v26.4s, v20.4s
    fmla    v23.4s, v27.4s, v21.4s
    fmla    v23.4s, v28.4s, v22.4s

    add     x1, x1, #81*4

    faddp   v23.4s, v23.4s, v23.4s
    faddp   v23.2s, v23.2s, v23.2s

    str     s23, [x0], #4
    b.ne    1b

.balign 8
//restore registers

    ldp     d10, d11, [sp, #1*16]
    ldp     d12, d13, [sp, #2*16]
    ldp     d8, d9, [sp], #48

// return
    ret

.endfunc
.end

Test results on RK3368:
Clang intrinsics: 10.41ms
Assembly: 9.59ms

The compilers didn't perform that badly in this case, but more often than not they are unbelievably stupid. I strongly recommend learning assembly.

Here's an optimization of Jake's answer.

Using 4 accumulators instead of a single one helps because FMA instructions have latency much higher than throughput. According to the Cortex-A72 optimization guide, the latency of the FMLA instruction is 7 cycles for the complete result, or 3 cycles when the dependency is on the accumulator (if you wonder what Q-form and D-form are: Q is for 16-byte vectors, D is for 8-byte vectors). The throughput is much higher, at 1 cycle: the CPU can start one FMA every cycle.
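
As a rough back-of-envelope estimate (ignoring loads and the final horizontal reduction): each output row takes 21 vector multiply-accumulates (81 floats in chunks of 4). With a single accumulator they form one dependency chain, so at 3 cycles of accumulator-forwarding latency per FMLA that is on the order of 21 × 3 ≈ 63 cycles per row. With 4 independent accumulators the chains overlap, and the same 21 operations can issue close to one per cycle, i.e. roughly 21 cycles plus a short tail to combine the accumulators.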

The following version uses 4 independent accumulators instead of a single one, which should improve throughput even though we need 3 extra instructions at the end of the loop to sum the accumulators.

I've also used a few macros to help with the repetitive code. Untested.

void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
    // 30 vector registers in total; ARM64 has 32 of them, so we're good.
    float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    float32x4_t mat0, mat1, mat2, mat3, mat4;
    float32x4_t res0, res1, res2, res3;

    vec80 = mat4 = vdupq_n_f32( 0.0f );
    // Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 )      \
    v0 = vld1q_f32( pVec ); pVec += 4;     \
    v1 = vld1q_f32( pVec ); pVec += 4;     \
    v2 = vld1q_f32( pVec ); pVec += 4;     \
    v3 = vld1q_f32( pVec ); pVec += 4

    // Load the complete vector into registers using the above macro
    LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
    LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
    LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
    LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
    LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
    // Load the final scalar of the vector
    vec80 = vld1q_lane_f32( pVec, vec80, 0 );

#undef LOAD_VEC_16

    // Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16()                         \
        mat0 = vld1q_f32( pMat ); pMat += 4;     \
        mat1 = vld1q_f32( pMat ); pMat += 4;     \
        mat2 = vld1q_f32( pMat ); pMat += 4;     \
        mat3 = vld1q_f32( pMat ); pMat += 4

    // Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
    // Multiple accumulators are critical for performance; the 4 instructions produced by this macro have no data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 )        \
        res0 = vfmaq_f32( res0, mat0, v0 );      \
        res1 = vfmaq_f32( res1, mat1, v1 );      \
        res2 = vfmaq_f32( res2, mat2, v2 );      \
        res3 = vfmaq_f32( res3, mat3, v3 )

    const float* const pMatEnd = pMat + nRows * 81;
    while( pMat < pMatEnd )
    {
        // Initial 16 elements only need multiplication.
        LOAD_MATRIX_16();
        res0 = vmulq_f32( mat0, vec0_3 );
        res1 = vmulq_f32( mat1, vec4_7 );
        res2 = vmulq_f32( mat2, vec8_11 );
        res3 = vmulq_f32( mat3, vec12_15 );

        // Handle the rest of the row using FMA instructions.
        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );

        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );

        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );

        // The final block of the row has 17 scalars instead of 16
        LOAD_MATRIX_16();
        mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;

        HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
        res0 = vfmaq_f32( res0, mat4, vec80 );

        // Vertically add 4 accumulators into res0
        res1 = vaddq_f32( res1, res2 );
        res0 = vaddq_f32( res3, res0 );
        res0 = vaddq_f32( res1, res0 );

        // Store the horizontal sum of the accumulator
        *pDst = vaddvq_f32( res0 );
        pDst++;
    }

#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}

The assembly generated from that source with GCC 10.1 looks more or less OK.
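
For completeness, a minimal harness you could use to time it on the Pi (a sketch; it assumes the matVecMult81 function from above is in scope and fills the inputs with arbitrary values):

#include <chrono>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t rows = 90000, cols = 81;
    // Plain heap buffers; the NEON loads used above do not require special alignment.
    float *mat = new float[rows * cols];
    float *vec = new float[cols];
    float *dst = new float[rows];
    for (size_t i = 0; i < rows * cols; i++) mat[i] = float(i % 7) * 0.25f;
    for (size_t i = 0; i < cols; i++) vec[i] = float(i) * 0.5f;

    const auto t0 = std::chrono::steady_clock::now();
    matVecMult81(dst, mat, vec, rows);
    const auto t1 = std::chrono::steady_clock::now();

    const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    std::printf("%.3f ms, dst[0] = %f\n", ms, dst[0]);

    delete[] mat; delete[] vec; delete[] dst;
    return 0;
}

Build with optimization enabled, e.g. g++ -O3 (optionally -mcpu=cortex-a72 on the Raspberry Pi 4), since the timings quoted in the question were taken at -O3.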
