兩段 C++ 代碼之間存在顯著的性能差異

Question

我很難理解為什麼以下兩個例程的性能差異與我的預期不符。

// Matrix-vector product, indexed form.
//
// For each row y in [0, input_height), adds to output[y] the sum over
// x in [0, input_width) of matrix[y * input_width + x] * vector[x], i.e. the
// matrix is read as row-major with input_width elements per row.
//
// output[y] is accumulated into (+=), never overwritten, so the result also
// contains whatever output held on entry.
void matmul1(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            // The element index is computed from uint32_t operands, so it is
            // evaluated in 32-bit unsigned arithmetic.
            output[y] += matrix[y * input_width + x] * vector[x];
        }
    }
}

// Matrix-vector product, pointer-walking form.
//
// Same loop nest as the indexed form, but the matrix element is fetched through
// the matrix pointer, which is post-incremented after every read. Starting
// from matrix[0], the pointer therefore visits input_height * input_width
// consecutive floats, one per inner iteration.
//
// matrix is passed by value, so only this function's copy of the pointer is
// advanced. output[y] is accumulated into (+=), never overwritten.
void matmul2(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            // Read the current element, then step the pointer to the next one.
            output[y] += *matrix++ * vector[x];
        }
    }
}

我在同一台機器上使用隨機數據將這兩個函數各重複執行了 100 次。函數 matmul1 的平均運行時間為 21298 μs，函數 matmul2 的平均運行時間為 24034 μs。樣本標準差分別為 198 和 171。

https://godbolt.org/z/of3zM4 上的反匯編結果如下（我對匯編語言不夠熟悉，無法正確解讀這些結果）：

// AArch64 listing for the indexed version.
// Arguments: x0 = matrix, x1 = vector, x2 = output, w3 = input_height, w4 = input_width.
matmul1(float const*, float const*, float*, unsigned int, unsigned int):
  mov w8, 0                  // w8 = running row offset (y * input_width), kept in 32 bits
  mov x7, 0                  // x7 = y
  cbz w3, .L1                // input_height == 0: return immediately
.L2:
  cbz w4, .L5                // input_width == 0: skip the inner loop
  ldr s0, [x2, x7, lsl 2]    // s0 = output[y]
  mov x5, 0                  // x5 = x
.L6:
  add w6, w8, w5             // w6 = row offset + x (32-bit add, one extra instruction per iteration)
  ldr s1, [x1, x5, lsl 2]    // s1 = vector[x]
  add x5, x5, 1              // x++
  cmp w4, w5                 // compare input_width with x
  ldr s2, [x0, x6, lsl 2]    // s2 = matrix[row offset + x]
  fmadd s0, s2, s1, s0       // s0 = s2 * s1 + s0
  str s0, [x2, x7, lsl 2]    // output[y] is stored back on every iteration
  bhi .L6                    // repeat while input_width > x (unsigned)
.L5:
  add x7, x7, 1              // y++
  add w8, w8, w4             // row offset += input_width
  cmp w3, w7                 // compare input_height with y
  bhi .L2                    // repeat while input_height > y (unsigned)
.L1:
  ret
// AArch64 listing for the pointer-walking version.
// Arguments: x0 = matrix, x1 = vector, x2 = output, w3 = input_height, w4 = input_width.
matmul2(float const*, float const*, float*, unsigned int, unsigned int):
  cbz w3, .L10               // input_height == 0: return immediately
  sub w7, w4, #1             // w7 = input_width - 1 (32-bit)
  mov x6, 0                  // x6 = y
  add x7, x7, 1              // x7 = (input_width - 1) + 1, now as a 64-bit value
  lsl x7, x7, 2              // x7 *= 4: size of one matrix row in bytes
.L15:
  cbz w4, .L12               // input_width == 0: skip the inner loop
  ldr s0, [x2, x6, lsl 2]    // s0 = output[y]
  mov x5, 0                  // x5 = x
.L13:
  ldr s2, [x0, x5, lsl 2]    // s2 = element x of the current row (x0 points at the row start)
  ldr s1, [x1, x5, lsl 2]    // s1 = vector[x]
  add x5, x5, 1              // x++
  cmp w4, w5                 // compare input_width with x
  fmadd s0, s2, s1, s0       // s0 = s2 * s1 + s0
  str s0, [x2, x6, lsl 2]    // output[y] is stored back on every iteration
  bhi .L13                   // repeat while input_width > x (unsigned)
  add x0, x0, x7             // advance the matrix pointer by one row
.L12:
  add x6, x6, 1              // y++
  cmp w3, w6                 // compare input_height with y
  bhi .L15                   // repeat while input_height > y (unsigned)
.L10:
  ret

我還在不同的優化級別和不同的輸入大小下運行了代碼。每次第一個函數都比第二個函數快。為什麼呢？我原本預期第一個函數會比第二個慢，因為它在內層循環中多了一次乘法，但事實並非如此。

我使用 g++ 8.3.0 在 Raspberry Pi 4 上運行代碼

Answer 1

更新1：

我做了一些計時測試，實際上我建議的寫法比 'matmul2' 運行得更慢。見下文！

嘗試更換

output[y] += *matrix++ * vector[x];

在第二個循環中

output[y] += *(++matrix) * vector[x];

即用指針的前置遞增替換後置遞增。如果使用後置遞增，則會創建指針的臨時副本並使用其值。由於每次迭代都會發生這種情況，運行時間因此增加。如果使用前置遞增，則不需要此臨時副本。

我不確定編譯器是否可以優化這部分。 因為您使用指針，所以可能無法避免副作用。 因此，它保持不變。

請仔細檢查結果，因為語義略有變化：前置遞增會先移動指針再讀取，因此會跳過矩陣的第一個元素，並多讀取末尾之後的一個元素。

更新1：

我實現了以下函數並做了計時測試。matmul1 是最慢的版本。matmul2 是我機器上最快的版本。我沒想到 matmul3 會更慢。1000 次重複（注意：下方代碼中的循環實際執行 10000 次）且無優化的計時結果如下：

matmul1 - 573.62 毫秒

matmul2 - 512.58 毫秒

matmul3 - 534.63 毫秒

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <vector>
    
    using namespace std;
    using std::chrono::duration;
    using std::chrono::duration_cast;
    using std::chrono::high_resolution_clock;
    using std::chrono::milliseconds;
    
    // Stand-in for a long, heavy operation: adds to every element of vec its
    // own zero-based position, modifying vec in place.
    void
    long_operation(vector<int>& vec) {
      vector<int>::size_type position = 0;
      for (int& element : vec) {
        element += position;
        ++position;
      }
    }
    
    // Matrix-vector product, indexed form.
    //
    // For each row y in [0, input_height), adds to output[y] the sum over
    // x in [0, input_width) of matrix[y * input_width + x] * vector[x], i.e.
    // the matrix is read as row-major with input_width elements per row.
    //
    // output[y] is accumulated into (+=), never overwritten, so the result
    // also contains whatever output held on entry.
    void
    matmul1(const float* matrix,
            const float* vector,
            float*       output,
            uint32_t     input_height,
            uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          // The element index is computed from uint32_t operands, so it is
          // evaluated in 32-bit unsigned arithmetic.
          output[y] += matrix[y * input_width + x] * vector[x];
        }
      }
    }
    
    // Matrix-vector product, pointer-walking form with post-increment.
    //
    // Same loop nest as the indexed form, but the matrix element is fetched
    // through the matrix pointer, which is post-incremented after every read.
    // Starting from matrix[0], the pointer visits input_height * input_width
    // consecutive floats, one per inner iteration.
    //
    // matrix is passed by value, so only this function's copy of the pointer
    // is advanced. output[y] is accumulated into (+=), never overwritten.
    void
    matmul2(const float* matrix,
            const float* vector,
            float*       output,
            uint32_t     input_height,
            uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          // Read the current element, then step the pointer to the next one.
          output[y] += *matrix++ * vector[x];
        }
      }
    }
    
    // Matrix-vector product, pointer-walking form with pre-increment.
    //
    // For each row y in [0, input_height), adds to output[y] the sum over
    // x in [0, input_width) of the next input_width consecutive matrix
    // elements multiplied by vector[x]. The matrix is read as row-major.
    //
    // The element is read first and the pointer is pre-incremented afterwards,
    // so the walk covers matrix[0] .. matrix[input_height * input_width - 1].
    // Folding the increment into the read as *(++matrix) would advance before
    // reading: matrix[0] would be skipped and the final iteration would read
    // one element past the end of the buffer.
    //
    // matrix is passed by value, so only this function's copy of the pointer
    // is advanced. output[y] is accumulated into (+=), never overwritten.
    void
    matmul3(const float* matrix,
            const float* vector,
            float*       output,
            uint32_t     input_height,
            uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          output[y] += *matrix * vector[x];
          ++matrix;  // pre-increment form, applied only after the read
        }
      }
    }
    
    // Benchmark driver: fills a 100 x 200 test matrix and a test vector, then
    // calls matmul1, matmul2 and matmul3 back to back 10000 times, timing each
    // call with high_resolution_clock. Prints the accumulated time per kernel
    // and the average time per call, both in milliseconds.
    //
    // output is not cleared between calls; the kernels keep accumulating into
    // it, which does not change the amount of work per call.
    int
    main() {
      //--- prepare some test data ---//
      const uint32_t height = 100;
      const uint32_t width  = 200;
      const uint32_t size   = height * width;

      // Heap-backed buffers: a runtime-sized local array is not standard C++,
      // and three arrays of this size would take about 240 KB of stack.
      // std::vector value-initializes its elements, so output starts at 0.
      std::vector<float> matrix(size);
      std::vector<float> vec(size);
      std::vector<float> output(size);

      for (uint32_t i = 0; i < size; ++i) {
        matrix[i] = static_cast<float>(i);
        vec[i]    = static_cast<float>(i * i);
      }

      //--- test timings ---//
      using clock = std::chrono::high_resolution_clock;
      using millis = std::chrono::duration<double, std::milli>;

      double    time1  = 0.0;
      double    time2  = 0.0;
      double    time3  = 0.0;
      const int repeat = 10000;
      for (int run = 0; run < repeat; ++run) {
        //--- version 1
        auto t1 = clock::now();
        matmul1(matrix.data(), vec.data(), output.data(), height, width);
        auto t2 = clock::now();
        time1 += millis(t2 - t1).count();

        //--- version 2
        t1 = clock::now();
        matmul2(matrix.data(), vec.data(), output.data(), height, width);
        t2 = clock::now();
        time2 += millis(t2 - t1).count();

        //--- version 3
        t1 = clock::now();
        matmul3(matrix.data(), vec.data(), output.data(), height, width);
        t2 = clock::now();
        time3 += millis(t2 - t1).count();
      }

      // Consume the results through a volatile store. Without an observable
      // use of output, an optimizing build is free to discard the kernel
      // calls, and the timings would then measure nothing.
      float checksum = 0.0f;
      for (uint32_t y = 0; y < height; ++y) {
        checksum += output[y];
      }
      volatile float sink = checksum;
      (void)sink;

      std::cout << "total time 1:   " << time1 << "ms\n";
      std::cout << "total time 2:   " << time2 << "ms\n";
      std::cout << "total time 3:   " << time3 << "ms\n" << std::endl;

      time1 /= repeat;
      time2 /= repeat;
      time3 /= repeat;

      std::cout << "average time 1:   " << time1 << "ms\n";
      std::cout << "average time 2:   " << time2 << "ms\n";
      std::cout << "average time 3:   " << time3 << "ms" << std::endl;
      return 0;
    }

使用完全優化（-O3），時間幾乎相同。

兩段 C++ 代碼之間存在顯着的性能差異

問題描述

1 個解決方案

解決方案1
0 2021-03-22 09:34:03

兩段 C++ 代碼之間存在顯着的性能差異

問題描述

1 個解決方案

解決方案1 0 2021-03-22 09:34:03

解決方案1
0 2021-03-22 09:34:03