如何优化我的 C++ OpenMp 矩阵乘法代码

Question

我编写了一个 C++ OpenMp 矩阵乘法代码，用于将两个 1000x1000 矩阵相乘。 到目前为止，我使用 OpenMp 获得了 0.700 秒的执行时间，但我想看看是否有其他方法可以使用 OpenMp 使其更快？

我很感激你能给我的任何建议或提示。

这是我的代码：

#include <iostream>
#include <time.h>
#include <omp.h>

using namespace std;

void Multiply()
 {
    //initialize matrices with random numbers
    int aMatrix[1000][1000], i, j;
  for( i = 0; i < 1000; ++i)
  {for( j = 0;  j < 1000; ++j)
     {aMatrix[i][j] = rand();}
  }
    int bMatrix[1000][1000], i1, j2;
  for( i1 = 0; i1 < 1000; ++i1)
  {for( j2 = 0;  j2 < 1000; ++j2)
     {bMatrix[i1][j2] = rand();}
  }
  //Result Matrix
    int product[1000][1000]  = {0};


    for (int row = 0; row < 1000; row++) {
        for (int col = 0; col < 1000; col++) {
            // Multiply the row of A by the column of B to get the row, column of product.
            for (int inner = 0; inner < 1000; inner++) {
                product[row][col] += aMatrix[row][inner] * bMatrix[inner][col];
            }

        }
    
    }
}

int main() {

   time_t begin, end;
    time(&begin);

    Multiply();

  time(&end);
  time_t elapsed = end - begin;

  cout << ("Time measured: %ld seconds.\n", elapsed);

return 0;
}

Answer 1

这是直接的 c++ 代码，它在 .08s 中运行整数，在 .14s 中运行浮点数或双精度数。 我的系统已有 10 年历史，速度相对较慢 memory。当时很好，但现在是现在。

我同意@VictorEijkhout 的观点，最好的结果是使用经过调优的代码。 已经有大量的工作来优化这些。

#include <vector>
#include <array>
#include <random>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <thread>
#include <future>
#include <chrono>

struct Timer {
    std::chrono::system_clock::time_point snapTime;
    Timer() { snapTime = std::chrono::system_clock::now(); }
    operator double() { return std::chrono::duration<double>(std::chrono::system_clock::now() - snapTime).count(); }
};

using DataType = int;
using std::array, std::vector;
constexpr int N = 1000, THREADS = 12;
static auto launchType = std::launch::async;
using Matrix = vector<array<DataType, N>>;
Matrix create_matrix() { return Matrix(N); };

Matrix product(Matrix const& v0, Matrix const& v1, double& time)
{
    Matrix ret = create_matrix();
    Matrix v2 = create_matrix();
    Timer timer;
    for (size_t r = 0; r < N; r++)      // transpose first
        for (size_t c = 0; c < N; c++)
            v2[c][r] = v1[r][c];
    // lambda to process sets of rows in separate threads
    auto do_row_set = [&v0, &v2, &ret](size_t start, size_t last) {
        for (size_t row = start; row < last; row++)
            for (size_t col = 0; col < N; col++)
               {
                DataType tmp{}; // separate tmp variable significantly improves optimization
                for (size_t col_t = 0; col_t < N; col_t++)
                    tmp += v0[row][col_t] * v2[col][col_t];
                ret[row][col] = tmp;
                }

    };

    vector<size_t> seq;
    const size_t NN = N / THREADS;
    // make a sequence of NN+1 rows from start to end
    for (size_t thread_n = 0; thread_n < N; thread_n += NN)
        seq.push_back(thread_n);
    seq.push_back(N);
    vector<std::future<void>> results; results.reserve(THREADS);
    for (size_t i = 0; i < THREADS; i++)
        results.emplace_back(std::async(launchType, do_row_set, seq[i], seq[i + 1]));
    for (auto& x : results)
        x.get();
    time = timer;
    return ret;
}

bool operator==(Matrix const& v0, Matrix const& v1)
{
    for (size_t r = 0; r < N; r++)
        for (size_t c = 0; c < N; c++)
            if (v0[r][c] != v1[r][c])
                return false;
    return true;
}

int main()
{
    auto fill = [](Matrix& v) {
        std::mt19937_64 r(1);
        std::normal_distribution dist(1.);
        for (size_t row = 0; row < N; row++)
            for (size_t col = 0; col < N; col++)
                v[row][col] = DataType(dist(r));
    };
    Matrix m1 = create_matrix(), m2 = create_matrix(), m3 = create_matrix();
    fill(m1); fill(m2);

    auto process_test = [&m1, &m2](Matrix& out) {
        const int rpt_count = 4;
        double sum = 0;
        for (int i = 0; i < rpt_count; i++)
        {
            double time;
            out = product(m1, m2, time);
            sum += time / rpt_count;
        }
        return sum;
    };

    std::cout << std::fixed << std::setprecision(4);
    double time{};
    time = process_test(m3);
    std::cout << "Threads w transpose   " << time << " secs.\n";
}

Answer 2

可以做以下事情来加速：

使用 OpenMP 并行化外部循环，就像您所做的那样（就像我在以下代码中所做的那样）。 或者使用std::async进行多线程，就像在另一个答案中使用的那样。
转置 B 矩阵，这将有助于增加 L1 缓存命中，因为您将从顺序 memory 中读取每个 B 列（或转置变体中的行）。
使用向量化的SIMD指令，这将允许在一个 CPU 周期内进行多次乘法（和加法）。 编译器通常会在没有您帮助的情况下通过 SIMD 指令很好地自动向量化您的循环，但我在我的代码中明确地做到了。
在循环内运行多个独立的 SIMD 指令。 这将有助于占用 SIMD 的整个 CPU 流水线。 我在我的代码中使用四个 SIMD 寄存器r0, r1, r2, r3这样做了。 在编译器中，这通常称为循环展开。
在 64 字节边界上对齐矩阵起始地址。 这将有助于 SIMD 指令进行快速对齐的读/写。
在 64 字节边界上对齐每个矩阵行的起始地址。 我在我的代码中通过用零填充每一行直到 64 字节的倍数来做到这一点。 这也有助于 SIMD 指令进行快速对齐的读/写。

在我的以下代码中，我完成了上述所有1. - 6.步骤。 Memory 64 字节 alignment 我通过实现在std::vector中使用的AlignmentAllocator来完成。 我还对float / double / int进行了时间测量。

在我的旧 4 核笔记本电脑上，我对 1000x1000 矩阵乘以 1000x1000 的情况进行了以下时间测量：

 float: time 0.1569 sec
double: time 0.3168 sec
   int: time 0.1565 sec

为了比较我的硬件功能，我针对int的情况测量了@doug 的另一个答案：

Threads w transpose   0.2164 secs.

可以看到我的解决方案比另一个答案快1.4倍，我猜是由于 memory 64 字节 alignment 并且可能是由于使用了显式 SIMD（而不是依赖循环的编译器自动矢量化）。

要编译我的程序，如果您正在编译，不要忘记添加-fopenmp -lgomp选项（用于 OpenMP 支持）和-march=native -O3 -std=c++20选项（用于 SIMD 支持、优化和标准）在 GCC/CLang 下，而我猜想 MSVC 会自动添加 OpenMP，并且不需要任何特殊选项（使用/O2 /GL /std:c++latest进行优化和 MSVC 中的标准）。

在我的代码中，我只为 SIMD 实现了 SSE2/SSE4/AVX/AVX2 指令，如果你有更强大的机器，你可以告诉我，我也实现了 FMA/AVX-512，它们将提供两倍的速度提升。

我的Mul() function 非常通用，它是模板化的，您只需将指针传递给矩阵和行/列计数，因此您的矩阵可以以任何方式存储在调用端（通过 std::vector 或 std::array 或普通二维数组）。

在Run() function 开始时，如果您需要更大的测试，您可以更改行数和列数。 请注意，我的所有函数都支持任何行和列，您甚至可以将大小为 1234x2345 的矩阵乘以 2345x3456。

在线试用！

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <iomanip>
#include <vector>
#include <memory>
#include <string>

#include <immintrin.h>

#define USE_OPENMP 1
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#if defined(_MSC_VER)
    #define IS_MSVC 1
#else
    #define IS_MSVC 0
#endif

#if USE_OPENMP
    #include <omp.h>
#endif

template <typename T, std::size_t N>
class AlignmentAllocator {
  public:
    typedef T value_type;
    typedef std::size_t size_type;
    typedef std::ptrdiff_t difference_type;
    typedef T * pointer;
    typedef const T * const_pointer;
    typedef T & reference;
    typedef const T & const_reference;

  public:
    inline AlignmentAllocator() throw() {}
    template <typename T2> inline AlignmentAllocator(const AlignmentAllocator<T2, N> &) throw() {}
    inline ~AlignmentAllocator() throw() {}
    inline pointer adress(reference r) { return &r; }
    inline const_pointer adress(const_reference r) const { return &r; }
    inline pointer allocate(size_type n);
    inline void deallocate(pointer p, size_type);
    inline void construct(pointer p, const value_type & wert);
    inline void destroy(pointer p) { p->~value_type(); }
    inline size_type max_size() const throw() { return size_type(-1) / sizeof(value_type); }
    template <typename T2> struct rebind { typedef AlignmentAllocator<T2, N> other; };
    bool operator!=(const AlignmentAllocator<T, N> & other) const { return !(*this == other); }
    bool operator==(const AlignmentAllocator<T, N> & other) const { return true; }
};

template <typename T, std::size_t N>
inline typename AlignmentAllocator<T, N>::pointer AlignmentAllocator<T, N>::allocate(size_type n) {
    #if IS_MSVC
        auto p = (pointer)_aligned_malloc(n * sizeof(value_type), N);
    #else
        auto p = (pointer)std::aligned_alloc(N, n * sizeof(value_type));
    #endif
    ASSERT(p);
    return p;
}
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::deallocate(pointer p, size_type) {
    #if IS_MSVC
        _aligned_free(p);
    #else
        std::free(p);
    #endif
}
template <typename T, std::size_t N>
inline void AlignmentAllocator<T, N>::construct(pointer p, const value_type & wert) {
    new (p) value_type(wert);
}

template <typename T>
using AlignedVector = std::vector<T, AlignmentAllocator<T, 64>>;

template <typename T>
struct RegT;

#ifdef __AVX__
    template <> struct RegT<float> { static size_t constexpr bisize = 256; using type = __m256; static type zero() { return _mm256_setzero_ps(); } };
    template <> struct RegT<double> { static size_t constexpr bisize = 256; using type = __m256d; static type zero() { return _mm256_setzero_pd(); } };
    
    inline void MulAddReg(float const * a, float const * b, __m256 & c) {
        c = _mm256_add_ps(c, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
    }
    inline void MulAddReg(double const * a, double const * b, __m256d & c) {
        c = _mm256_add_pd(c, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
    }
    
    inline void StoreReg(float * dst, __m256 const & src) { _mm256_store_ps(dst, src); }
    inline void StoreReg(double * dst, __m256d const & src) { _mm256_store_pd(dst, src); }
#else // SSE2
    template <> struct RegT<float> { static size_t constexpr bisize = 128; using type = __m128; static type zero() { return _mm_setzero_ps(); } };
    template <> struct RegT<double> { static size_t constexpr bisize = 128; using type = __m128d; static type zero() { return _mm_setzero_pd(); } };

    inline void MulAddReg(float const * a, float const * b, __m128 & c) {
        c = _mm_add_ps(c, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
    }
    inline void MulAddReg(double const * a, double const * b, __m128d & c) {
        c = _mm_add_pd(c, _mm_mul_pd(_mm_load_pd(a), _mm_load_pd(b)));
    }
    
    inline void StoreReg(float * dst, __m128 const & src) { _mm_store_ps(dst, src); }
    inline void StoreReg(double * dst, __m128d const & src) { _mm_store_pd(dst, src); }
#endif

#ifdef __AVX2__
    template <> struct RegT<int32_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };
    //template <> struct RegT<int64_t> { static size_t constexpr bisize = 256; using type = __m256i; static type zero() { return _mm256_setzero_si256(); } };

    inline void MulAddReg(int32_t const * a, int32_t const * b, __m256i & c) {
        c = _mm256_add_epi32(c, _mm256_mullo_epi32(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
    }
    //inline void MulAddReg(int64_t const * a, int64_t const * b, __m256i & c) {
    //    c = _mm256_add_epi64(c, _mm256_mullo_epi64(_mm256_load_si256((__m256i*)a), _mm256_load_si256((__m256i*)b)));
    //}
    
    inline void StoreReg(int32_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
    //inline void StoreReg(int64_t * dst, __m256i const & src) { _mm256_store_si256((__m256i*)dst, src); }
#else // SSE2
    template <> struct RegT<int32_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };
    //template <> struct RegT<int64_t> { static size_t constexpr bisize = 128; using type = __m128i; static type zero() { return _mm_setzero_si128(); } };

    inline void MulAddReg(int32_t const * a, int32_t const * b, __m128i & c) {
        c = _mm_add_epi32(c, _mm_mullo_epi32(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
    }
    //inline void MulAddReg(int64_t const * a, int64_t const * b, __m128i & c) {
    //    c = _mm_add_epi64(c, _mm_mullo_epi64(_mm_load_si128((__m128i*)a), _mm_load_si128((__m128i*)b)));
    //}
    
    inline void StoreReg(int32_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
    //inline void StoreReg(int64_t * dst, __m128i const & src) { _mm_store_si128((__m128i*)dst, src); }
#endif    

template <typename T>
void Mul(T const * A0, size_t A_rows, size_t A_cols, T const * B0, size_t B_rows, size_t B_cols, T * C) {
    size_t constexpr reg_cnt = RegT<T>::bisize / 8 / sizeof(T), block = 4 * reg_cnt;
    ASSERT(A_cols == B_rows);
    size_t const A_cols_aligned = (A_cols + block - 1) / block * block, B_rows_aligned = (B_rows + block - 1) / block * block;
    
    // Copy aligned A
    AlignedVector<T> Av(A_rows * A_cols_aligned);
    for (size_t i = 0; i < A_rows; ++i)
        std::memcpy(&Av[i * A_cols_aligned], &A0[i * A_cols], sizeof(Av[0]) * A_cols);
    T const * A = Av.data();
    // Transpose B
    AlignedVector<T> Bv(B_cols * B_rows_aligned);
    for (size_t j = 0; j < B_cols; ++j)
        for (size_t i = 0; i < B_rows; ++i)
            Bv[j * B_rows_aligned + i] = B0[i * B_cols + j];
    T const * Bt = Bv.data();
    ASSERT(uintptr_t(A) % 64 == 0 && uintptr_t(Bt) % 64 == 0);
    ASSERT(uintptr_t(&A[A_cols_aligned]) % 64 == 0 && uintptr_t(&Bt[B_rows_aligned]) % 64 == 0);
    
    // Multiply
    #pragma omp parallel for
    for (size_t i = 0; i < A_rows; ++i) {
        // Aligned Reg storage
        AlignedVector<T> Regs(block);
        
        for (size_t j = 0; j < B_cols; ++j) {
            T const * Arow = &A[i * A_cols_aligned + 0], * Btrow = &Bt[j * B_rows_aligned + 0];
            
            using Reg = typename RegT<T>::type;
            Reg r0 = RegT<T>::zero(), r1 = RegT<T>::zero(), r2 = RegT<T>::zero(), r3 = RegT<T>::zero();
            
            size_t const k_hi = A_cols - A_cols % block;
            
            for (size_t k = 0; k < k_hi; k += block) {
                MulAddReg(&Arow[k + reg_cnt * 0], &Btrow[k + reg_cnt * 0], r0);
                MulAddReg(&Arow[k + reg_cnt * 1], &Btrow[k + reg_cnt * 1], r1);
                MulAddReg(&Arow[k + reg_cnt * 2], &Btrow[k + reg_cnt * 2], r2);
                MulAddReg(&Arow[k + reg_cnt * 3], &Btrow[k + reg_cnt * 3], r3);
            }
            
            StoreReg(&Regs[reg_cnt * 0], r0);
            StoreReg(&Regs[reg_cnt * 1], r1);
            StoreReg(&Regs[reg_cnt * 2], r2);
            StoreReg(&Regs[reg_cnt * 3], r3);
            
            T sum1 = 0, sum2 = 0, sum3 = 0;
            for (size_t k = 0; k < Regs.size(); ++k)
                sum1 += Regs[k];
            
            //for (size_t k = 0; k < A_cols - A_cols % block; ++k) sum3 += Arow[k] * Btrow[k];
            
            for (size_t k = k_hi; k < A_cols; ++k)
                sum2 += Arow[k] * Btrow[k];
            
            C[i * A_rows + j] = sum2 + sum1;
        }
    }
}

#include <random>
#include <thread>
#include <chrono>
#include <type_traits>

template <typename T>
void Test(T const * A, size_t A_rows, size_t A_cols, T const * B, size_t B_rows, size_t B_cols, T const * C, T eps) {
    for (size_t i = 0; i < A_rows / 16; ++i)
        for (size_t j = 0; j < B_cols / 16; ++j) {
            T sum = 0;
            for (size_t k = 0; k < A_cols; ++k)
                sum += A[i * A_cols + k] * B[k * B_cols + j];
            ASSERT_MSG(std::abs(C[i * A_rows + j] - sum) <= eps * A_cols, "i " + std::to_string(i) + " j " + std::to_string(j) +
                " C " + std::to_string(C[i * A_rows + j]) + " ref " + std::to_string(sum));
        }
}

double Time() {
    static auto const gtb = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::duration<double>>(
        std::chrono::high_resolution_clock::now() - gtb).count();
}

template <typename T>
void Run() {
    size_t constexpr A_rows = 1000, A_cols = 1000, B_rows = 1000, B_cols = 1000;
    
    std::string const tname = std::is_same_v<T, float> ? "float" : std::is_same_v<T, double> ?
        "double" : std::is_same_v<T, int32_t> ? "int" : "<unknown>";
    bool const is_int = tname == "int";
    std::mt19937_64 rng{123};
    std::vector<T> A(A_rows * A_cols), B(B_rows * B_cols), C(A_rows * B_cols);
    for (size_t i = 0; i < A.size(); ++i)
        A[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
    for (size_t i = 0; i < B.size(); ++i)
        B[i] = is_int ? (int64_t(rng() % (1 << 11)) - (1 << 10)) : (T(int64_t(rng() % (1 << 28)) - (1 << 27)) / T(1 << 27));
    auto tim = Time();
    Mul(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0]);
    tim = Time() - tim;
    std::cout << std::setw(6) << tname << ": time " << std::fixed << std::setprecision(4) << tim << " sec" << std::endl;
    Test(&A[0], A_rows, A_cols, &B[0], B_rows, B_cols, &C[0], tname == "float" ? T(1e-7) : tname == "double" ? T(1e-15) : T(0));
}

int main() {
    try {
        #if USE_OPENMP
            omp_set_num_threads(std::thread::hardware_concurrency());
        #endif
        Run<float>();
        Run<double>();
        Run<int32_t>();
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}

Output：

 float: time 0.1569 sec
double: time 0.3168 sec
   int: time 0.1565 sec

如何优化我的 C++ OpenMp 矩阵乘法代码

问题描述

2 个解决方案

解决方案1
2 2022-05-02 00:52:43

解决方案2
2 已采纳 2022-05-02 19:38:05

如何优化我的 C++ OpenMp 矩阵乘法代码

问题描述

2 个解决方案

解决方案1 2 2022-05-02 00:52:43

解决方案2 2 已采纳 2022-05-02 19:38:05

解决方案1
2 2022-05-02 00:52:43

解决方案2
2 已采纳 2022-05-02 19:38:05