Armadillo C++ bad performance ifft

Question

I have the following test code:

#include <iostream>
#define ARMA_DONT_USE_WRAPPER
#include <armadillo>

using namespace std::complex_literals;

int main()
{
    arma::cx_mat testMat { };
    testMat.set_size(40, 19586);
    auto nPositions = static_cast<arma::sword>(floor(19586/2));
    arma::cx_rowvec a_vec {19586, arma::fill::randu};
    arma::cx_rowvec b_vec {19586, arma::fill::randu};
    arma::cx_rowvec c_vec {19586, arma::fill::randu};

    for (size_t nCo=0; nCo < 3; nCo++) {
        arma::rowvec d {19586, arma::fill::randu};
        for(size_t iDop = 0; iDop < 40; ++iDop)
        {
            
                arma::cx_rowvec signalFi = (b_vec % arma::exp(-1i*M_PI*a_vec));
                testMat.row(iDop) += arma::ifft(arma::shift(arma::fft(signalFi), nPositions).eval() % c_vec).eval();
            
        }
    }
return 0;
}

I am trying to perform some computation. According to my StopWatch measurements, each iteration takes around 300 ms, which is too slow for my needs.

Can someone explain what I am doing wrong, or suggest some tricks to increase the performance?

I used .eval() to perform 'eager' evaluation. 
gcc 11.2
armadillo 10.8.2
Release Mode -O3

Updated version: is it possible to redesign the ifft function? Test code:

#include <iostream>
#include <limits>
#include <stdexcept>
#include <fftw3.h>
#include <armadillo>
#include "StopWatch.h"

using namespace std;

/// Circularly shifts every column of `axx` up by half the number of rows
/// (rounded down), i.e. along dimension 0.
///
/// @param axx  Input matrix; left unmodified.
/// @return     A shifted copy of `axx`.
inline arma::cx_mat ifftshift(arma::cx_mat const &axx)
{
    // n_rows is an unsigned integer, so the division rounds down by itself.
    const auto half_rows = static_cast<arma::sword>(axx.n_rows / 2);

    // A negative count moves elements towards the start of each column.
    return arma::shift(axx, -half_rows, 0);
}

/// Column-wise normalised inverse DFT using FFTW.
///
/// Every column of `inMat` is transformed independently with a backward
/// (inverse) complex 1-D DFT; the result is divided by the column length so
/// that it matches the usual 1/N ifft convention.
///
/// @param inMat   Input spectrum, one signal per column. Taken by non-const
///                reference because the FFTW planner API takes a non-const
///                input pointer.
/// @param outMat  Receives the transformed columns; resized to the shape of
///                `inMat` if it does not already have it.
/// @throws std::length_error   if a dimension does not fit FFTW's `int` sizes.
/// @throws std::runtime_error  if FFTW cannot create a plan.
void ifft(arma::cx_mat &inMat, arma::cx_mat &outMat)
{
    const size_t N = inMat.n_rows;
    const size_t n_cols = inMat.n_cols;

    // Guarantee the destination has room for every column we are about to
    // write (no-op when the caller already sized it correctly).
    outMat.set_size(inMat.n_rows, inMat.n_cols);

    // Nothing to transform; also avoids asking FFTW for a zero-length plan.
    if (N == 0 || n_cols == 0)
    {
        return;
    }

    // FFTW's planner interface describes sizes with plain `int`.
    constexpr size_t kMaxInt = static_cast<size_t>(std::numeric_limits<int>::max());
    if (N > kMaxInt || n_cols > kMaxInt)
    {
        throw std::length_error("ifft: matrix dimensions exceed FFTW's int range");
    }

    const int n = static_cast<int>(N);
    fftw_complex *in  = reinterpret_cast<fftw_complex *>(inMat.memptr());
    fftw_complex *out = reinterpret_cast<fftw_complex *>(outMat.memptr());

    // One batched plan for all columns instead of one (leaked) plan per
    // column: Armadillo stores matrices column-major, so each column is a
    // contiguous run (stride 1) and consecutive columns are n elements apart.
    fftw_plan plan = fftw_plan_many_dft(1, &n, static_cast<int>(n_cols),
                                        in, nullptr, 1, n,
                                        out, nullptr, 1, n,
                                        FFTW_BACKWARD, FFTW_ESTIMATE);
    if (plan == nullptr)
    {
        throw std::runtime_error("ifft: fftw_plan_many_dft failed");
    }

    fftw_execute(plan);
    fftw_destroy_plan(plan);  // release the plan; the original never did

    // FFTW's backward transform is unnormalised.
    outMat /= static_cast<double>(N);
}

int main()
{
    arma::cx_mat B;

    B << std::complex<double>(+1.225e-01,+8.247e-01) << std::complex<double>(+4.078e-01,+5.632e-01) << std::complex<double>(+8.866e-01,+8.386e-01) << arma::endr
      << std::complex<double>(+5.958e-01,+1.015e-01) << std::complex<double>(+7.857e-01,+4.267e-01) << std::complex<double>(+7.997e-01,+9.176e-01) << arma::endr
      << std::complex<double>(+1.877e-01,+3.378e-01) << std::complex<double>(+2.921e-01,+9.651e-01) << std::complex<double>(+1.056e-01,+6.901e-01) << arma::endr
      << std::complex<double>(+2.322e-01,+6.990e-01) << std::complex<double>(+1.547e-01,+4.256e-01) << std::complex<double>(+9.094e-01,+1.194e-01) << arma::endr
      << std::complex<double>(+3.917e-01,+3.886e-01) << std::complex<double>(+2.166e-01,+4.962e-01) << std::complex<double>(+9.777e-01,+4.464e-01) << arma::endr;

    arma::cx_mat output(5,3);
    arma::cx_mat shifted = ifftshift(B);
    arma::cx_mat arma_result = arma::ifft(shifted);
    B.print("B");
    arma_result.print("arma_result");
    ifft(shifted, output);
    output.print("output");

    return 0;
}

Answer 1

I just tried a similar operation with my own library and, according to my measurements, you are correct that each iteration of the loop shouldn't take more than 1 millisecond (instead of 300 ms).

This is the equivalent code. Sorry that this is not an Armadillo answer; I am just pointing out the concrete goals for minimizing operations and allocations.

#include<multi/adaptors/fftw.hpp>
#include<multi/array.hpp>

namespace fftw = multi::fftw;

int main() {
    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);
    multi::array<std::complex<double>, 1>       res(arr.extensions());  // output allocated only once                  

    fftw::plan fdft{arr, res, fftw::forward};  // fftw plan and internal buffers allocated only once

    auto const N = 40;
    for(int i = 0; i != N; ++i) {  // each iteration takes ~1ms in an intel-i7
        fdft(arr.base(), res.base());  // fft operation with precalculated plan
        std::rotate(res.begin(), res.begin() + res.size()/2, res.end());  // rotation (shift on size/2) done in place, no allocation either
    }
}

The full code and library is here: https://gitlab.com/correaa/boost-multi/-/blob/master/adaptors/fftw/test/shift.cpp#L45-58 (the extra code is for the timing measurement).

What is also telling is that I tried to make every possible mistake to pessimize the code, to mimic what I think Armadillo is doing "wrong": allocating inside the loop and making copies all the time. Even then, each iteration takes only 1.5 milliseconds.

My conclusion is that something is terribly wrong in your Armadillo usage or in the library itself.

    // Deliberately pessimised variant of the timing loop: everything that
    // could be set up once is instead redone on every iteration, to see how
    // much repeated allocation, planning and copying costs.
    // Only the input is created once; the check confirms it has 19586 elements.
    multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);  BOOST_REQUIRE(arr.size() == 19586);

    auto const N = 40;
    for(int i = 0; i != N; ++i) {
        // Fresh output array every iteration (presumably filled with 0. — confirm against multi::array's constructor).
        multi::array<std::complex<double>, 1>       res(arr.extensions(), 0.);
        // The FFTW plan is rebuilt every iteration instead of being reused.
        fftw::plan fdft{arr, res, fftw::forward};
        fdft(arr.base(), res.base());
        // A second fresh array receives the rotated result, so the shift is a copy rather than in place.
        multi::array<std::complex<double>, 1>       res_copy(arr.extensions(), 0.);
        std::rotate_copy(res.begin(), res.begin() + res.size()/2, res.end(), res_copy.begin());
    }

Armadillo C++ bad performance ifft

Question

1 answers

solution1
2 2022-03-04 18:57:20

Armadillo C++ bad performance ifft

Question

1 answers

solution1 2 2022-03-04 18:57:20

solution1
2 2022-03-04 18:57:20