bit hack vs conditional statement inside loop

Question

I have a CRC calculation function that has the following in its inner loop:

if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;

Profiling has revealed that a lot of time is spent on these statements. And I was wondering if I could get some gain by replacing the conditionals with 'bit hacks'. I tried the following, but got no speed improvement:

crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x08) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x04) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x02) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x01) - 1);

Should this be faster on a recent x86 CPU or is there a better way to implement these 'bit hacks'?

Answer 1

I can't say for sure which is FASTER, but they are definitely different - which is faster depends a lot on exactly which processor make and model is being used, since they behave differently on [presumably unpredictable] branches. And to further complicate things, different processors have different behaviour for "dependent calculations".

I converted the posted code into this (which makes the generated code about half as long, but otherwise identical at a conceptual level):

int func1(int uMsgByte, char* pChkTableOffset)
{
    int crc = 0;
    if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;

    return crc;
}


int func2(int uMsgByte, char* pChkTableOffset)
{
    int crc = 0;

    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);

    return crc;
}

And compiled with clang++ -S -O2 :

func1:

_Z5func1jPh:                            # @_Z5func1jPh
        xorl    %eax, %eax
        testb   %dil, %dil
        jns     .LBB0_2
        movzbl  (%rsi), %eax
.LBB0_2:                                # %if.end
        testb   $64, %dil
        je      .LBB0_4
        movzbl  1(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_4:                                # %if.end.6
        testb   $32, %dil
        je      .LBB0_6
        movzbl  2(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_6:                                # %if.end.13
        testb   $16, %dil
        je      .LBB0_8
        movzbl  3(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_8:                                # %if.end.20
        retq

func2:

_Z5func2jPh:                            # @_Z5func2jPh
        movzbl  (%rsi), %eax
        movl    %edi, %ecx
        shll    $24, %ecx
        sarl    $31, %ecx
        andl    %eax, %ecx
        movzbl  1(%rsi), %eax
        movl    %edi, %edx
        shll    $25, %edx
        sarl    $31, %edx
        andl    %edx, %eax
        xorl    %ecx, %eax
        movzbl  2(%rsi), %ecx
        movl    %edi, %edx
        shll    $26, %edx
        sarl    $31, %edx
        andl    %ecx, %edx
        movzbl  3(%rsi), %ecx
        shll    $27, %edi
        sarl    $31, %edi
        andl    %ecx, %edi
        xorl    %edx, %edi
        xorl    %edi, %eax
        retq

As you can see, the compiler generates branches for the first version, and uses logical operations on the second version - a few more per case.

I could write some code to benchmark each of the loop, but I guarantee that the result will vary greatly between different versions of x86 processors.

I'm not sure if this is a common CRC calculation, but most CRC calculations have optimised versions that perform the right calculation in a faster way than this, using tables and other "clever stuff".

Answer 2

Interested to see if a human could beat an optimising compiler, I wrote your algorithm in two ways:

Here you express intent as if you were writing machine code

std::uint32_t foo1(std::uint8_t uMsgByte, 
                   std::uint32_t crc, 
                   const std::uint32_t* pChkTableOffset)
{
    if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
    if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;

    return crc;
}

Here I express intent in a more algorithmic way...

std::uint32_t foo2(std::uint8_t uMsgByte, 
                   std::uint32_t crc, 
                   const std::uint32_t* pChkTableOffset)
{
    for (int i = 0 ; i < 7 ; ++i) {
        if (uMsgByte & (0x01 << (7-i)))
            crc ^= pChkTableOffset[i];

    }
    return crc;
}

Then I compiled using g++ -O3 and the result was...

exactly the same object code in both functions

Moral of the story: select the correct algorithm, avoid repetition, write elegant code and let the optimiser do its thing.

here's the proof:

__Z4foo1hjPKj:                          ## @_Z4foo1hjPKj
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    testb   $-128, %dil
    je  LBB0_2
## BB#1:
    xorl    (%rdx), %esi
LBB0_2:
    testb   $64, %dil
    je  LBB0_4
## BB#3:
    xorl    4(%rdx), %esi
LBB0_4:
    testb   $32, %dil
    je  LBB0_6
## BB#5:
    xorl    8(%rdx), %esi
LBB0_6:
    testb   $16, %dil
    je  LBB0_8
## BB#7:
    xorl    12(%rdx), %esi
LBB0_8:
    testb   $8, %dil
    je  LBB0_10
## BB#9:
    xorl    16(%rdx), %esi
LBB0_10:
    testb   $4, %dil
    je  LBB0_12
## BB#11:
    xorl    20(%rdx), %esi
LBB0_12:
    testb   $2, %dil
    je  LBB0_14
## BB#13:
    xorl    24(%rdx), %esi
LBB0_14:
    testb   $1, %dil
    je  LBB0_16
## BB#15:
    xorl    28(%rdx), %esi
LBB0_16:
    movl    %esi, %eax
    popq    %rbp
    retq
    .cfi_endproc

    .globl  __Z4foo2hjPKj
    .align  4, 0x90
__Z4foo2hjPKj:                          ## @_Z4foo2hjPKj
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    testb   $-128, %dil
    je  LBB1_2
## BB#1:
    xorl    (%rdx), %esi
LBB1_2:
    testb   $64, %dil
    je  LBB1_4
## BB#3:
    xorl    4(%rdx), %esi
LBB1_4:
    testb   $32, %dil
    je  LBB1_6
## BB#5:
    xorl    8(%rdx), %esi
LBB1_6:
    testb   $16, %dil
    je  LBB1_8
## BB#7:
    xorl    12(%rdx), %esi
LBB1_8:
    testb   $8, %dil
    je  LBB1_10
## BB#9:
    xorl    16(%rdx), %esi
LBB1_10:
    testb   $4, %dil
    je  LBB1_12
## BB#11:
    xorl    20(%rdx), %esi
LBB1_12:
    testb   $2, %dil
    je  LBB1_14
## BB#13:
    xorl    24(%rdx), %esi
LBB1_14:
    movl    %esi, %eax
    popq    %rbp
    retq
    .cfi_endproc

It would be interesting to see if the compiler also performs so well with the version of the code that uses logical operations rather than conditional statements.

given:

std::uint32_t logical1(std::uint8_t uMsgByte, 
                       std::uint32_t crc, 
                       const std::uint32_t* pChkTableOffset)
{
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x8) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x4) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x2) - 1);
    crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x1) - 1);

    return crc;
}

the resulting machine code is:

8 lots of:

    movl    %edi, %eax     ; get uMsgByte into eax
    shll    $24, %eax      ; shift it left 24 bits so that bit 7 is in the sign bit
    sarl    $31, %eax      ; arithmetic shift right to copy the sign bit into all other bits
    andl    (%rdx), %eax   ; and the result with the value from the table
    xorl    %esi, %eax     ; exclusive-or into crc

so the short answer is yes - it performs very well (eliding the redundant increments of pChkTableOffset)

Is it faster? who knows. Probably not measurably - the number of memory fetches is the same in both cases. The compiler can work out whether it's better to avoid branches or not much better than you can (depending on the architecture the compiler is optimising for).

Is it more elegant and readable? For myself, no. It's the kind of code I used to write when:

c was still a young language
processors were simple enough that I could do a better job of optimising
processors were so slow that I had to

None of these apply any more.

Answer 3

If this checksum is indeed a CRC, there is a much more efficient way to implement it.

Assuming it's a CRC16:

Header:

class CRC16
{
public:
    CRC16(const unsigned short poly);
    unsigned short CalcCRC(unsigned char * pbuf, int len);

protected:
    unsigned short CRCTab[256];
    unsigned long SwapBits(unsigned long swap, int bits);
};

Implementation:

CRC16::CRC16(const unsigned short poly)
{
    for(int i = 0; i < 256; i++) {
        CRCTab[i] = SwapBits(i, 8) << 8;
        for(int j = 0; j < 8; j++)
            CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
        CRCTab[i] = SwapBits(CRCTab[i], 16);
    }
}

unsigned long CRC16::SwapBits(unsigned long swap, int bits)
{
    unsigned long r = 0;
    for(int i = 0; i < bits; i++) {
        if(swap & 1) r |= 1 << (bits - i - 1);
        swap >>= 1;
    }
    return r;
}

unsigned short CRC16::CalcCRC(unsigned char * pbuf, int len)
{
    unsigned short r = 0;
    while(len--) r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ *(pbuf++)];
    return r;
}

As you can see, each byte of the message is used only once, instead of 8 times.

There is a similar implementation for CRC8.

Answer 4

Out of interest, extending alain's excellent suggestion of precomputing the CRC table, it occurs to me that this class can be modified to take advantage of c++14's constexpr :

#include <iostream>
#include <utility>
#include <string>

class CRC16
{
private:

    // the storage for the CRC table, to be computed at compile time
    unsigned short CRCTab[256];

    // private template-expanded constructor allows folded calls to SwapBits at compile time
    template<std::size_t...Is>
    constexpr CRC16(const unsigned short poly, std::integer_sequence<std::size_t, Is...>)
    : CRCTab { SwapBits(Is, 8) << 8 ... }
    {}

    // swap bits at compile time
    static constexpr unsigned long SwapBits(unsigned long swap, int bits)
    {
        unsigned long r = 0;
        for(int i = 0; i < bits; i++) {
            if(swap & 1) r |= 1 << (bits - i - 1);
            swap >>= 1;
        }
        return r;
    }


public:

    // public constexpr defers to private template expansion...
    constexpr CRC16(const unsigned short poly)
    : CRC16(poly, std::make_index_sequence<256>())
    {
        //... and then modifies the table - at compile time
        for(int i = 0; i < 256; i++) {
            for(int j = 0; j < 8; j++)
                CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
            CRCTab[i] = SwapBits(CRCTab[i], 16);
        }
    }

    // made const so that we can instantiate constexpr CRC16 objects
    unsigned short CalcCRC(const unsigned char * pbuf, int len) const
    {
        unsigned short r = 0;
        while(len--) r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ *(pbuf++)];
        return r;
    }

};



int main()
{
    // create my constexpr CRC16 object at compile time
    constexpr CRC16 crctab(1234);

    // caclulate the CRC of something...
    using namespace std;
    auto s = "hello world"s;

    auto crc = crctab.CalcCRC(reinterpret_cast<const unsigned char*>(s.data()), s.size());

    cout << crc << endl;

    return 0;
}

Then the constructor of CRC16(1234) pleasingly boils down to this:

__ZZ4mainE6crctab:
    .short  0                       ## 0x0
    .short  9478                    ## 0x2506
    .short  18956                   ## 0x4a0c
    .short  28426                   ## 0x6f0a
    .short  601                     ## 0x259
    .short  10079                   ## 0x275f
    .short  18517                   ## 0x4855
    .short  27987                   ## 0x6d53
... etc.

and the calculation of the CRC of the entire string becomes this:

        leaq    __ZZ4mainE6crctab(%rip), %rdi ; <- referencing const data :)
        movzwl  (%rdi,%rdx,2), %edx
        jmp     LBB0_8
LBB0_4:
        xorl    %edx, %edx
        jmp     LBB0_11
LBB0_6:
        xorl    %edx, %edx
LBB0_8:                                 ## %.lr.ph.i.preheader.split
        testl   %esi, %esi
        je      LBB0_11
## BB#9:
        leaq    __ZZ4mainE6crctab(%rip), %rsi
        .align  4, 0x90
LBB0_10:                                ## %.lr.ph.i
                                        ## =>This Inner Loop Header: Depth=1
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  (%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  1(%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        addq    $2, %rcx
        addl    $-2, %eax
        jne     LBB0_10
LBB0_11:

bit hack vs conditional statement inside loop

Question

4 answers

solution1
3 2015-12-14 10:30:14

solution2
3 ACCPTED 2015-12-14 10:59:18

solution3
1 2015-12-14 11:34:38

solution4
1 2015-12-14 12:34:51

bit hack vs conditional statement inside loop

Question

4 answers

solution1 3 2015-12-14 10:30:14

solution2 3 ACCPTED 2015-12-14 10:59:18

solution3 1 2015-12-14 11:34:38

solution4 1 2015-12-14 12:34:51

solution1
3 2015-12-14 10:30:14

solution2
3 ACCPTED 2015-12-14 10:59:18

solution3
1 2015-12-14 11:34:38

solution4
1 2015-12-14 12:34:51