I am having trouble understanding why the two following routines do not show the expected difference in performance.
void matmul1(const float *matrix, const float *vector, float *output,
             uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += matrix[y * input_width + x] * vector[x];
        }
    }
}

void matmul2(const float *matrix, const float *vector, float *output,
             uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += *matrix++ * vector[x];
        }
    }
}
I repeated the execution of the two functions on random data 100 times on the same machine. matmul1 has a mean running time of 21298 μs (standard deviation 198) and matmul2 has a mean running time of 24034 μs (standard deviation 171).
The disassembly (https://godbolt.org/z/of3zM4) gives me the following (I am not proficient enough with assembly to interpret the results properly):
matmul1(float const*, float const*, float*, unsigned int, unsigned int):
        mov     w8, 0
        mov     x7, 0
        cbz     w3, .L1
.L2:
        cbz     w4, .L5
        ldr     s0, [x2, x7, lsl 2]
        mov     x5, 0
.L6:
        add     w6, w8, w5
        ldr     s1, [x1, x5, lsl 2]
        add     x5, x5, 1
        cmp     w4, w5
        ldr     s2, [x0, x6, lsl 2]
        fmadd   s0, s2, s1, s0
        str     s0, [x2, x7, lsl 2]
        bhi     .L6
.L5:
        add     x7, x7, 1
        add     w8, w8, w4
        cmp     w3, w7
        bhi     .L2
.L1:
        ret

matmul2(float const*, float const*, float*, unsigned int, unsigned int):
        cbz     w3, .L10
        sub     w7, w4, #1
        mov     x6, 0
        add     x7, x7, 1
        lsl     x7, x7, 2
.L15:
        cbz     w4, .L12
        ldr     s0, [x2, x6, lsl 2]
        mov     x5, 0
.L13:
        ldr     s2, [x0, x5, lsl 2]
        ldr     s1, [x1, x5, lsl 2]
        add     x5, x5, 1
        cmp     w4, w5
        fmadd   s0, s2, s1, s0
        str     s0, [x2, x6, lsl 2]
        bhi     .L13
        add     x0, x0, x7
.L12:
        add     x6, x6, 1
        cmp     w3, w6
        bhi     .L15
.L10:
        ret
I also ran the code at different optimization levels and with different input sizes; every time, the first function beat the second. Why is that? I expected the first function to be slower than the second, since it has one more multiplication in the inner loop, but that is not the case.
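For what it's worth, my (uncertain) reading of the disassembly above is that matmul1's inner loop contains no multiply at all: the compiler appears to strength-reduce `y * input_width` into a running row offset (the `add w6, w8, w5` / `add w8, w8, w4` pair). A hypothetical C++ sketch of what the generated code effectively does, just to illustrate the idea:

```cpp
#include <cstdint>

// Hypothetical sketch (my guess based on the disassembly above): the multiply
// y * input_width in matmul1 is strength-reduced to a running row offset.
void matmul1_reduced(const float *matrix, const float *vector, float *output,
                     uint32_t input_height, uint32_t input_width) {
    uint32_t row_offset = 0;                  // replaces y * input_width
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += matrix[row_offset + x] * vector[x];
        }
        row_offset += input_width;            // one add per row, no multiply
    }
}
```

If that reading is right, both versions do the same amount of arithmetic per inner iteration, and the difference would come from other details of the generated code rather than from an extra multiplication.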
I ran the code on a Raspberry Pi 4 with g++ 8.3.0.
Update 1:
I took some timings, and my suggestion actually runs slower than matmul2. See below!
Try replacing
output[y] += *matrix++ * vector[x];
in the second loop with
output[y] += *(++matrix) * vector[x];
That is, replace the post-increment of the pointer with a pre-increment. With a post-increment, a temporary copy of the pointer is created and its old value is used; since this happens on every iteration, the run time increases. With a pre-increment, that temporary copy is not needed.
I am not sure whether the compiler can optimize this part away; because a pointer is involved, it may not be able to rule out side effects, so the copy might stay.
Double-check the results, because the semantics change slightly.
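A minimal sketch of that semantic difference (a hypothetical helper, not part of the original code): post-increment reads the element the pointer referenced before moving it, pre-increment moves the pointer first.

```cpp
#include <utility>

// Post-increment yields the element the pointer referenced *before* moving;
// pre-increment advances the pointer first and then reads.
std::pair<float, float> first_reads(const float *data) {
    const float *post = data;
    const float *pre = data;
    float a = *post++;   // reads data[0], then advances
    float b = *(++pre);  // advances, then reads data[1]
    return {a, b};
}
```

Applied to the loop above, the pre-increment version therefore never reads the first matrix element and dereferences one element past the end of the matrix on the last iteration.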
Update 1:
I implemented the following functions and took some timings. matmul1 is the slowest version and matmul2 is the fastest on my machine; I did not expect matmul3 to be slower. Timings for 10000 repeats and no optimization are as follows:
matmul1 - 573.62 ms
matmul2 - 512.58 ms
matmul3 - 534.63 ms
#include <chrono>
#include <iostream>
#include <vector>
using namespace std;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
void
long_operation(vector<int>& vec) {
    /* Simulating a long, heavy operation. (Note: unused in the benchmark below.) */
    for (size_t i = 0; i < vec.size(); ++i)
        vec[i] += i;
}
void
matmul1(const float* matrix,
        const float* vector,
        float* output,
        uint32_t input_height,
        uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += matrix[y * input_width + x] * vector[x];
        }
    }
}

void
matmul2(const float* matrix,
        const float* vector,
        float* output,
        uint32_t input_height,
        uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += *matrix++ * vector[x];
        }
    }
}
void
matmul3(const float* matrix,
        const float* vector,
        float* output,
        uint32_t input_height,
        uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            // Pre-increment skips matrix[0] and reads one element past the
            // end on the final iteration, so results differ from matmul1/2.
            output[y] += *(++matrix) * vector[x];
        }
    }
}
int
main() {
    //--- prepare some test data ---//
    constexpr uint32_t height = 100;
    constexpr uint32_t width = 200;
    constexpr uint32_t size = height * width;  // constexpr, so the arrays below are not VLAs
    float matrix[size];
    float vector[size];
    float output[size];
    for (uint32_t i = 0; i < size; ++i) {
        matrix[i] = i;
        vector[i] = i * i;
        output[i] = 0.0;
    }
    //--- test timings ---//
    double time1 = 0.0;
    double time2 = 0.0;
    double time3 = 0.0;
    int repeat = 0;
    for (repeat = 0; repeat < 10000; ++repeat) {
        //--- version 1
        auto t1 = high_resolution_clock::now();
        matmul1(matrix, vector, output, height, width);
        auto t2 = high_resolution_clock::now();
        duration<double, std::milli> ms_double = t2 - t1;
        time1 += ms_double.count();
        //--- version 2
        t1 = high_resolution_clock::now();
        matmul2(matrix, vector, output, height, width);
        t2 = high_resolution_clock::now();
        ms_double = t2 - t1;
        time2 += ms_double.count();
        //--- version 3
        t1 = high_resolution_clock::now();
        matmul3(matrix, vector, output, height, width);
        t2 = high_resolution_clock::now();
        ms_double = t2 - t1;
        time3 += ms_double.count();
    }
    std::cout << "total time 1: " << time1 << "ms\n";
    std::cout << "total time 2: " << time2 << "ms\n";
    std::cout << "total time 3: " << time3 << "ms\n" << endl;
    time1 /= repeat;
    time2 /= repeat;
    time3 /= repeat;
    cout << "average time 1: " << time1 << "ms\n";
    cout << "average time 2: " << time2 << "ms\n";
    cout << "average time 3: " << time3 << "ms" << endl;
    return 0;
}
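One caveat about this benchmark (my observation, not part of the original code): output is never reset between repeats, so all three functions accumulate into the same buffer across all 10000 iterations. If the numerical results of each call are to be compared, the buffer should be cleared before every timed call; a small helper sketch (hypothetical name):

```cpp
#include <algorithm>
#include <cstdint>

// Zero the first `height` entries of the output buffer so every timed
// matmul call starts from the same state.
void reset_output(float *output, uint32_t height) {
    std::fill(output, output + height, 0.0f);
}
```

Calling this before each matmul invocation (outside the timed region) keeps the timings comparable without letting the sums grow across repeats.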
With full optimization (-O3) the timings are almost the same.