优化循环和if

Question

I have a procedure that looks like this: 我有一个程序看起来像这样：

// Placeholder for the first per-byte processing step; intentionally a no-op.
void Process1(unsigned char* data)
{
    (void)data;
}
// Placeholder for the second per-byte processing step; intentionally a no-op.
void Process2(unsigned char* data)
{
    (void)data;
}
// Placeholder for the third per-byte processing step; intentionally a no-op.
void Process3(unsigned char* data)
{
    (void)data;
}

// Bit masks tested by ProcessData; each one enables the ProcessN step with
// the same number. The masks start at bit 1, so bit 0 is not used by any flag.
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

/**
 * Applies the enabled processing steps to every byte of a buffer.
 *
 * @param data  start of the buffer; each step receives a pointer to one byte
 * @param bytes number of bytes to visit
 * @param flags bit mask; FLAG1/FLAG2/FLAG3 enable Process1/Process2/Process3
 */
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    // The mask is decoded once here, but the resulting booleans are still
    // tested on every iteration of the loop below.
    const bool run1 = (flags & FLAG1) != 0;
    const bool run2 = (flags & FLAG2) != 0;
    const bool run3 = (flags & FLAG3) != 0;

    unsigned int offset = 0;
    while (offset < bytes)
    {
        unsigned char* const cur = data + offset;
        if (run1) { Process1(cur); }
        if (run2) { Process2(cur); }
        if (run3) { Process3(cur); }
        ++offset;
    }
}

As shown, flags & FLAG1 (a.k.a. b1) does not change across loop iterations. 看起来， flags & FLAG1 AKA b1在所有循环中都不会改变。 But we still have to branch on it in every iteration. 但是我们仍然需要在每个循环中进行分支。 I'm just wondering if there's a way to avoid this unnecessary branch dynamically. 我只是想知道是否有办法动态避免这种不必要的分支。

here is a demo of Lundin's solution. 这是Lundin解决方案的演示。

#include <windows.h>
#include <stdio.h>
#include <time.h>
// Timing state shared by the macros below:
//   ls - counter value captured by START_CLOCK()
//   le - counter value captured by END_CLOCK()
//   ll - counter frequency, filled in by QueryPerformanceFrequency() in main()
LARGE_INTEGER ls, le, ll;

// Records the start timestamp.
#define START_CLOCK() QueryPerformanceCounter(&ls)

// Records the end timestamp and prints the elapsed time.
// (ticks / ticks-per-second) * 1,000,000 is a duration in MICROseconds, so the
// label is "us"; the previous "ns" label understated the unit by a factor of
// 1000. The macro no longer ends in ';' - the call site supplies it.
#define END_CLOCK() printf ("%.0lf us\n", (QueryPerformanceCounter(&le), ((double)le.QuadPart - ls.QuadPart) / ll.QuadPart * 1000000))


// Demo step 1: adds one to the byte at `data` (unsigned wrap-around at 255).
void Process1(unsigned char* data)
{
    ++*data;
}
// Demo step 2: subtracts one from the byte at `data` (unsigned wrap-around at 0).
void Process2(unsigned char* data)
{
    --*data;
}
// Demo step 3: squares the byte at `data`, keeping only the low 8 bits.
void Process3(unsigned char* data)
{
    const unsigned char value = *data;
    // The product is computed in int and narrowed back to one byte.
    *data = (unsigned char)(value * value);
}

// Bit masks tested by ProcessData / ProcessData_x; each one enables the
// ProcessN step with the same number. The masks occupy bits 1..3 (values 2,
// 4 and 8); bit 0 is not used by any flag.
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

/**
 * Baseline version: runs the enabled steps on every byte of the buffer and
 * re-tests the three enable bits on each iteration.
 *
 * @param data  start of the buffer
 * @param bytes number of bytes to visit
 * @param flags bit mask; FLAG1/FLAG2/FLAG3 enable Process1/Process2/Process3
 */
void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    const bool want1 = (flags & FLAG1) != 0;
    const bool want2 = (flags & FLAG2) != 0;
    const bool want3 = (flags & FLAG3) != 0;

    // Walk the buffer with a moving pointer and a countdown of bytes left.
    unsigned char* cur = data;
    for (unsigned int remaining = bytes; remaining != 0; --remaining, ++cur)
    {
        if (want1)
            Process1(cur);
        if (want2)
            Process2(cur);
        if (want3)
            Process3(cur);
    }
}


// Pointer to a per-byte processing step.
typedef void (*proc_t)(unsigned char*);

// Stand-in step used when a flag is disabled: leaves the byte untouched.
static inline void do_nothing (unsigned char* ptr)
{
    static_cast<void>(ptr);  // parameter intentionally unused
}

void ProcessData_x(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0;  // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;

    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;

    for (unsigned int i = 0; i<bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}

int main()
{
    if (!QueryPerformanceFrequency(&ll)) return 1;

    const unsigned int bytes = 0xffff;
    srand((unsigned int)time(NULL));
    unsigned int flags = rand() & 0x7;
    unsigned char* data = new unsigned char[bytes];
    for (unsigned int i = 0; i < bytes; i++)
    {
        data[i] = (unsigned char)(rand() & 0xff);
    }

    START_CLOCK();

    ProcessData(data, bytes, flags);

    END_CLOCK();

    START_CLOCK();

    ProcessData_x(data, bytes, flags);

    END_CLOCK();
}

here is the output: 这是输出：

134 ns
272 ns

I've run it several times but, unexpectedly, it takes even more time :( It was also compiled with 'vs2010 Release x86'. 我已经运行了好几次，但是出乎意料的是，它花费了更多时间：( ..它也被编译为“ vs2010 Release x86”

Answer 1

First of all, it doesn't make any sense to speak about optimizations without a particular system in mind... 首先，在没有特定系统的情况下谈论优化没有任何意义......

That being said, I'd optimize away the branches in the following way: 话虽这么说，我会通过以下方式优化分支：

// Signature shared by every per-byte processing step.
typedef void (*proc_t)(unsigned char*);

// No-op step substituted for a disabled flag so the loop needs no branch.
static inline void do_nothing (unsigned char* ptr)
{
    static_cast<void>(ptr);  // silence the unused-parameter warning
}

...

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags)
{
    bool b1 = (flags & FLAG1) != 0;  // de-obfuscate the boolean logic
    bool b2 = (flags & FLAG2) != 0;
    bool b3 = (flags & FLAG3) != 0;

    proc_t p1 = b1 ? Process1 : do_nothing;
    proc_t p2 = b2 ? Process2 : do_nothing;
    proc_t p3 = b3 ? Process3 : do_nothing;

    for (unsigned int i = 0; i<bytes; i++)
    {
        p1(data + i);
        p2(data + i);
        p3(data + i);
    }
}

Answer 2

A C++ solution. 一个c ++解决方案。 Similar to Lundin's answer, but without calls to an empty function. 类似于Lundin的答案，但没有调用空函数。 I'm not sure if that makes any difference in performance; the main advantage is that you don't need to manually list all the process calls in the loop. 我不确定这对性能是否有影响，主要优点是您无需手动列出循环中的所有流程调用。 If you want to micro-optimize or want C, you could use an array on the stack, but you'll have to manage some counters yourself. 如果要进行微优化或使用c，则可以在堆栈上使用数组，但是必须自己管理一些计数器。

// Collect only the enabled steps, in Process1..Process3 order.
typedef void (*proc_t)(unsigned char*);
std::vector<proc_t> processes;
if (b1) { processes.push_back(Process1); }
if (b2) { processes.push_back(Process2); }
if (b3) { processes.push_back(Process3); }

// Note the nesting: each enabled step sweeps the whole buffer before the next
// step starts, rather than all steps running on one byte at a time.
for (proc_t step : processes)
{
    for (unsigned int offset = 0; offset != bytes; ++offset)
    {
        step(data + offset);
    }
}

Answer 3

    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);


    int caseNow=SelectCaseAtOnce(b1,b2,b3);

    if(caseNow==0)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);

        }
     else if(caseNow==1)
        for (unsigned int i = 0; i < bytes; i ++)
        {

           Process2(data + i);

        }
     else if(caseNow==2)
        for (unsigned int i = 0; i < bytes; i ++)
        {

             Process3(data + i);
        }
    else if(caseNow==3)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);
            Process2(data + i);

        }
    if(caseNow==4)
        for (unsigned int i = 0; i < bytes; i ++)
        {
             Process1(data + i);

             Process3(data + i);
        }
    else if(caseNow==5)
        for (unsigned int i = 0; i < bytes; i ++)
        {

            Process2(data + i);
            Process3(data + i);
        }
    else if(caseNow==6)
        for (unsigned int i = 0; i < bytes; i ++)
        {
            Process1(data + i);
            Process2(data + i);
            Process3(data + i);
        }
    else {}

Answer 4

Here's another solution using templates - this way you'll get an optimized version of the inner loop for each variant. 这是使用模板的另一种解决方案 - 这样您就可以获得每个变体的内部循环的优化版本。 If the ProcessN functions are short / simple enough to be worth inlining then this could be a worthwhile optimization. 如果ProcessN函数足够短/很简单，值得进行内联，那么这可能是值得进行的优化。

#include <tuple>
#include <map>
#include <utility>

using namespace std;

// Placeholder for step 1; empty so the example compiles on its own.
inline void Process1(unsigned char* data)
{
    (void)data;
}
// Placeholder for step 2; empty so the example compiles on its own.
inline void Process2(unsigned char* data)
{
    (void)data;
}
// Placeholder for step 3; empty so the example compiles on its own.
inline void Process3(unsigned char* data)
{
    (void)data;
}

// Bit masks tested by the run-time ProcessData overload; each one selects the
// template argument that enables the ProcessN step with the same number.
// The masks start at bit 1, so bit 0 is not used by any flag.
#define FLAG1 (1 << 1)
#define FLAG2 (1 << 2)
#define FLAG3 (1 << 3)

/**
 * Loop specialised at compile time for one combination of enabled steps.
 * b1/b2/b3 are template constants, so each instantiation tests only
 * compile-time values inside the loop.
 *
 * @tparam b1,b2,b3 whether Process1/Process2/Process3 run on each byte
 * @param data  start of the buffer
 * @param bytes number of bytes to visit
 */
template <bool b1, bool b2, bool b3>
void ProcessData(unsigned char* data, unsigned int bytes) {
    unsigned int offset = 0;
    while (offset < bytes) {
        unsigned char* const cur = data + offset;
        if (b1) { Process1(cur); }
        if (b2) { Process2(cur); }
        if (b3) { Process3(cur); }
        ++offset;
    }
}

void ProcessData(unsigned char* data, unsigned int bytes, unsigned int flags) {
    typedef void (*ProcessFunc)(unsigned char*, unsigned int bytes);
    static map<tuple<bool, bool, bool>, ProcessFunc> funcs{
        {make_tuple(false, false, false), ProcessData<false, false, false>},
        {make_tuple(false, false, true), ProcessData<false, false, true>},
        {make_tuple(false, true, false), ProcessData<false, true, false>},
        {make_tuple(false, true, true), ProcessData<false, true, true>},
        {make_tuple(true, false, false), ProcessData<true, false, false>},
        {make_tuple(true, false, true), ProcessData<true, false, true>},
        {make_tuple(true, true, false), ProcessData<true, true, false>},
        {make_tuple(true, true, true), ProcessData<true, true, true>}};

    bool b1 = !!(flags & FLAG1);
    bool b2 = !!(flags & FLAG2);
    bool b3 = !!(flags & FLAG3);
    funcs[make_tuple(b1, b2, b3)](data, bytes);
}

优化循环和if

问题描述

4 个解决方案

解决方案1
4 已采纳 2014-09-16 06:49:59

解决方案2
3 2014-09-16 07:01:56

解决方案3
0 2014-09-16 15:47:13

解决方案4
0 2014-09-16 20:05:52

优化循环和if

问题描述

4 个解决方案

解决方案1 4 已采纳 2014-09-16 06:49:59

解决方案2 3 2014-09-16 07:01:56

解决方案3 0 2014-09-16 15:47:13

解决方案4 0 2014-09-16 20:05:52

解决方案1
4 已采纳 2014-09-16 06:49:59

解决方案2
3 2014-09-16 07:01:56

解决方案3
0 2014-09-16 15:47:13

解决方案4
0 2014-09-16 20:05:52