简体   繁体   中英

Performance penalty of using functor to provide a function or an operator as a C++ template parameter?

I have a family of complex functions performing very similar tasks except for one operator right in the middle of the function. A simplified version of my code could be something like that:

#include <assert.h>

// Writes buffer1[i] XOR buffer2[i] into res[i] for the first n bytes.
static void memopXor(char * buffer1, char * buffer2, char * res, unsigned n){
    unsigned i = 0;
    while (i < n) {
        res[i] = buffer1[i] ^ buffer2[i];
        ++i;
    }
}

// Writes buffer1[i] + buffer2[i] into res[i] for the first n bytes.
static void memopPlus(char * buffer1, char * buffer2, char * res, unsigned n){
    for (unsigned i = 0; i != n; ++i)
        res[i] = buffer1[i] + buffer2[i];
}

// Writes buffer1[i] * buffer2[i] into res[i] for the first n bytes.
static void memopMul(char * buffer1, char * buffer2, char * res, unsigned n){
    unsigned i = 0;
    for (; i < n; ++i) {
        res[i] = buffer1[i] * buffer2[i];
    }
}


// Sanity-checks each hand-written memop* variant against precomputed
// expected values.  b1 == b2, so XOR of matching elements is always
// zero, Plus doubles each value, and Mul squares it.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    char res1[5] = {};
    memopXor(b1, b2, res1, 5);

    assert(res1[0] == 0);
    assert(res1[1] == 0);
    assert(res1[2] == 0);
    assert(res1[3] == 0);
    assert(res1[4] == 0);  // was `== 1`: 4 ^ 4 is 0, so the original assert fires

    char res2[5] = {};
    memopPlus(b1, b2, res2, 5);

    assert(res2[0] == 0);
    assert(res2[1] == 2);
    assert(res2[2] == 4);
    assert(res2[3] == 6);
    assert(res2[4] == 8);

    char res3[5] = {};
    memopMul(b1, b2, res3, 5);

    assert(res3[0] == 0);
    assert(res3[1] == 1);
    assert(res3[2] == 4);
    assert(res3[3] == 9);
    assert(res3[4] == 16);
}

It looks like a good case to use C++ templates to avoid duplicating code, hence I was looking for a way to change my code to something like below (pseudo code):

#include <assert.h>

// NB: pseudo-code, not valid C++ — `FUNCTION` stands for "some
// compile-time binary operation" supplied as a template parameter,
// which is the API this question is asking how to express.
template <FUNCTION>
void memop<FUNCTION>(char * buffer1, char * buffer2, char * res, size_t n){
    for (size_t x = 0 ; x < n ; x++){
        res[x] = FUNCTION(buffer1[x], buffer2[x]);
    }
}

// Pseudo-code driver: `memop<operator^>` and friends are not valid C++ —
// built-in operators on built-in types are not addressable functions.
// This illustrates the desired call sites; the functor version further
// down is the compilable equivalent.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    char res1[5] = {};
    memop<operator^>(b1, b2, res1, 5);

    assert(res1[0] == 0);
    assert(res1[1] == 0);
    assert(res1[2] == 0);
    assert(res1[3] == 0);
    assert(res1[4] == 0);

    char res2[5] = {};
    memop<operator+>(b1, b2, res2, 5);

    assert(res2[0] == 0);
    assert(res2[1] == 2);
    assert(res2[2] == 4);
    assert(res2[3] == 6);
    assert(res2[4] == 8);

    char res3[5] = {};
    memop<operator*>(b1, b2, res3, 5);

    assert(res3[0] == 0);
    assert(res3[1] == 1);
    assert(res3[2] == 4);
    assert(res3[3] == 9);
    assert(res3[4] == 16);
}

The hard point is that I'm not willing to accept any slowdown of the resulting code. It means solutions implying indirect calls (either through vtable or function pointers) are not OK.

The common C++ solution to this problem seems to be wrapping the operator to call inside the operator() method of a functor class. Typically to get something like the code below:

#include <assert.h>

// Applies a default-constructed Op element-wise over two byte buffers,
// writing Op{}(buffer1[i], buffer2[i]) into res[i] for i in [0, n).
// Op is resolved at compile time, so the call can be fully inlined —
// no vtable or function-pointer indirection.
template <typename Op>
void memop(char * buffer1, char * buffer2, char * res, unsigned n){
    Op op;
    unsigned i = 0;
    while (i < n) {
        res[i] = op(buffer1[i], buffer2[i]);
        ++i;
    }
}


// Functor wrapping bitwise XOR so the operation can be passed as a
// template argument to memop<> and inlined at the call site.
struct Xor
{
    // const: the call reads no state, so const/temporary instances work too.
    char operator()(char a, char b) const {
        return a ^ b;
    }
};

// Functor wrapping addition for use as a memop<> template argument.
struct Plus
{
    // const: the call reads no state, so const/temporary instances work too.
    char operator()(char a, char b) const {
        return a + b;
    }
};

// Functor wrapping multiplication for use as a memop<> template argument.
struct Mul
{
    // const: the call reads no state, so const/temporary instances work too.
    char operator()(char a, char b) const {
        return a * b;
    }
};

// Exercises the functor-based memop<> with each operation and checks
// every element against hand-computed expected values (b1 == b2, so
// XOR yields zeros, Plus doubles each value, Mul squares it).
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    char res1[5] = {};
    memop<Xor>(b1, b2, res1, 5);
    const char expectedXor[5] = {0, 0, 0, 0, 0};
    for (unsigned i = 0; i < 5; ++i)
        assert(res1[i] == expectedXor[i]);

    char res2[5] = {};
    memop<Plus>(b1, b2, res2, 5);
    const char expectedPlus[5] = {0, 2, 4, 6, 8};
    for (unsigned i = 0; i < 5; ++i)
        assert(res2[i] == expectedPlus[i]);

    char res3[5] = {};
    memop<Mul>(b1, b2, res3, 5);
    const char expectedMul[5] = {0, 1, 4, 9, 16};
    for (unsigned i = 0; i < 5; ++i)
        assert(res3[i] == expectedMul[i]);
}

Is there any performance penalty to doing that ?

The code you show is just about useless as far as benchmarks go: every input is a compile-time constant, so the compiler can evaluate the whole thing ahead of time.

// Benchmark candidate for the hand-written XOR: returns one byte of the
// result so the call is not trivially dead code.
char cversion() {
    char lhs[5] = {0, 1, 2, 3, 4};
    char rhs[5] = {0, 1, 2, 3, 4};
    char out[5] = {};
    memopXor(lhs, rhs, out, 5);
    return out[4];
}

// Template counterpart of cversion(): identical buffers, but the XOR
// loop goes through memop<Xor> so the two codegens can be compared.
char cppversion() {
    char lhs[5] = {0, 1, 2, 3, 4};
    char rhs[5] = {0, 1, 2, 3, 4};
    char out[5] = {};
    memop<Xor>(lhs, rhs, out, 5);
    return out[4];
}

Is compiled to such LLVM IR:

define signext i8 @cversion()() nounwind uwtable readnone {
  ret i8 0
}

define signext i8 @cppversion()() nounwind uwtable readnone {
  ret i8 0
}

That is, the compiler makes the whole computation during the compilation.

So I took the liberty of defining a new function:

// External (non-static) wrapper that pins down an instantiation of
// memop<Xor>, so the generated IR survives and can be inspected.
void cppmemopXor(char * buffer1, char * buffer2, char * res, unsigned n)
{
    memop<Xor>(buffer1, buffer2, res, n);
}

and removed the static qualifier on memopXor and then repeated the experience:

define void @memopXor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
  %1 = icmp eq i32 %n, 0
  br i1 %1, label %._crit_edge, label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph, %0
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv
  %3 = load i8* %2, align 1, !tbaa !0
  %4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv
  %5 = load i8* %4, align 1, !tbaa !0
  %6 = xor i8 %5, %3
  %7 = getelementptr inbounds i8* %res, i64 %indvars.iv
  store i8 %6, i8* %7, align 1, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret void
}

And the C++ version with templates:

define void @cppmemopXor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
  %1 = icmp eq i32 %n, 0
  br i1 %1, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

.lr.ph.i:                                         ; preds = %.lr.ph.i, %0
  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %.lr.ph.i ], [ 0, %0 ]
  %2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv.i
  %3 = load i8* %2, align 1, !tbaa !0
  %4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv.i
  %5 = load i8* %4, align 1, !tbaa !0
  %6 = xor i8 %5, %3
  %7 = getelementptr inbounds i8* %res, i64 %indvars.iv.i
  store i8 %6, i8* %7, align 1, !tbaa !0
  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

_ZL5memopI3XorEvPcS1_S1_j.exit:                   ; preds = %.lr.ph.i, %0
  ret void
}

As expected, they are structurally identical, as the functor code has been completely inlined (which is visible even without understanding the IR).

Note that this is not a result in isolation. For example, std::sort performs twice to thrice as fast as qsort because it uses a functor instead of an indirect function call. Of course, using a templated function and a functor means that each different instantiation will generate new code, as if you coded the function manually, but it is exactly what you were doing manually anyway.

Only you can tell if something is fast enough for your needs. But I went ahead and ran your code on my own box to see what happens.

// Micro-benchmark driver: runs each memop variant 100M times.
// Swap each call with the commented line below it to time the
// hand-written functions against the template instantiations.
int main(int argc, char ** argv){
  char b1[5] = {0, 1, 2, 3, 4};
  char b2[5] = {0, 1, 2, 3, 4};

  int ans = 0;

  for (int i = 0; i < 100000000; i++) {
    char res1[5] = {};
    memopXor(b1, b2, res1, 5);
//    memop<Xor>(b1, b2, res1, 5);

    char res2[5] = {};
    memopPlus(b1, b2, res2, 5);
//    memop<Plus>(b1, b2, res2, 5);

    char res3[5] = {};
    memopMul(b1, b2, res3, 5);
//    memop<Mul>(b1, b2, res3, 5);

    // Consume one byte of every result so the optimizer cannot
    // eliminate the loops entirely (see the LLVM IR discussion above).
    ans += res1[0] + res2[1] + res3[2];  // prevents optimization
  }

  std::cout << ans << std::endl;

  return 0;
}

I compiled both versions with -O3 on g++. time returns 2.40s for the hand-coded version, and 2.58s for the template version.

(By the way, I had to correct your memopMul() to actually perform a multiplication.)

The only problem I can see with the above code is that the compiler may have trouble proving the buffers do not alias when calling memop, which can inhibit optimization; see: C++ aliasing rules.

Remember also that in the template version, the compiler will generate a different object for each unique template argument passed in, which means for three calls to memop, with three different operations, you will get three implementations in the binary. This should result in code that is almost identical to your original code.

I agree with the other commenters that the functions should get inlined. If performance is critical, though, you should benchmark your code before and after the proposed changes, just to be on the safe side — all it takes is one wrong compiler flag to blow things up.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM