I have a CRC calculation function that has the following in its inner loop:
if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;
Profiling has revealed that a lot of time is spent on these statements. And I was wondering if I could get some gain by replacing the conditionals with 'bit hacks'. I tried the following, but got no speed improvement:
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x08) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x04) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x02) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x01) - 1);
Should this be faster on a recent x86 CPU or is there a better way to implement these 'bit hacks'?
I can't say for sure which is FASTER, but they are definitely different - which is faster depends a lot on exactly which processor make and model is being used, since they behave differently on [presumably unpredictable] branches. And to further complicate things, different processors have different behaviour for "dependent calculations".
I converted the posted code into this (which makes the generated code about half as long, but otherwise identical at a conceptual level):
// Branch-based variant, reduced to the top four message bits to halve the
// generated code while keeping the structure of the original.
// For each set bit of uMsgByte (MSB first) the matching table byte is
// XORed into crc.
// Note: each `pChkTableOffset++` is a separate statement after the `if`,
// so it executes unconditionally, keeping the pointer in step with the bit.
int func1(int uMsgByte, char* pChkTableOffset)
{
int crc = 0;
if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
return crc;
}
// Branch-free variant of func1 using the "bit hack" form.
// `!(uMsgByte & mask)` is 0 when the bit is set and 1 when it is clear;
// subtracting 1 yields an all-ones mask (bit set) or zero (bit clear),
// which gates the table byte before it is XORed into crc.
int func2(int uMsgByte, char* pChkTableOffset)
{
int crc = 0;
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
return crc;
}
And compiled with clang++ -S -O2:
func1:
_Z5func1jPh: # @_Z5func1jPh
xorl %eax, %eax
testb %dil, %dil
jns .LBB0_2
movzbl (%rsi), %eax
.LBB0_2: # %if.end
testb $64, %dil
je .LBB0_4
movzbl 1(%rsi), %ecx
xorl %ecx, %eax
.LBB0_4: # %if.end.6
testb $32, %dil
je .LBB0_6
movzbl 2(%rsi), %ecx
xorl %ecx, %eax
.LBB0_6: # %if.end.13
testb $16, %dil
je .LBB0_8
movzbl 3(%rsi), %ecx
xorl %ecx, %eax
.LBB0_8: # %if.end.20
retq
func2:
_Z5func2jPh: # @_Z5func2jPh
movzbl (%rsi), %eax
movl %edi, %ecx
shll $24, %ecx
sarl $31, %ecx
andl %eax, %ecx
movzbl 1(%rsi), %eax
movl %edi, %edx
shll $25, %edx
sarl $31, %edx
andl %edx, %eax
xorl %ecx, %eax
movzbl 2(%rsi), %ecx
movl %edi, %edx
shll $26, %edx
sarl $31, %edx
andl %ecx, %edx
movzbl 3(%rsi), %ecx
shll $27, %edi
sarl $31, %edi
andl %ecx, %edi
xorl %edx, %edi
xorl %edi, %eax
retq
As you can see, the compiler generates branches for the first version, and uses logical operations on the second version - a few more per case.
I could write some code to benchmark each of the loops, but I guarantee that the results will vary greatly between different versions of x86 processors.
I'm not sure if this is a common CRC calculation, but most CRC calculations have optimised versions that perform the right calculation in a faster way than this, using tables and other "clever stuff".
Interested to see if a human could beat an optimising compiler, I wrote your algorithm in two ways:
Here you express intent as if you were writing machine code
// Unrolled, branch-based formulation: "as if writing machine code".
// For each set bit of uMsgByte (MSB first) the corresponding 32-bit table
// entry is XORed into crc.
// Each `pChkTableOffset++` is a separate statement after the `if` and so
// runs unconditionally, advancing the pointer once per bit.
std::uint32_t foo1(std::uint8_t uMsgByte,
std::uint32_t crc,
const std::uint32_t* pChkTableOffset)
{
if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;
return crc;
}
Here I express intent in a more algorithmic way...
// Loop-based expression of the same algorithm: for each set bit of
// uMsgByte (MSB first), XOR the corresponding table entry into crc.
// BUG FIX: the original loop bound was `i < 7`, which silently dropped
// the 0x01 bit (pChkTableOffset[7]) -- visible in the generated assembly,
// which tests only seven bits where foo1's tests eight. `i < 8` restores
// parity with the unrolled version.
std::uint32_t foo2(std::uint8_t uMsgByte,
                   std::uint32_t crc,
                   const std::uint32_t* pChkTableOffset)
{
    for (int i = 0 ; i < 8 ; ++i) {
        if (uMsgByte & (0x01 << (7-i)))
            crc ^= pChkTableOffset[i];
    }
    return crc;
}
Then I compiled using g++ -O3 and the result was...
almost exactly the same object code in both functions — the one difference being that foo2's loop bound `i < 7` drops the final 0x01 bit, so its assembly below tests only seven bits; with `i < 8` the two compile to identical code
Moral of the story: select the correct algorithm, avoid repetition, write elegant code and let the optimiser do its thing.
here's the proof:
__Z4foo1hjPKj: ## @_Z4foo1hjPKj
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
testb $-128, %dil
je LBB0_2
## BB#1:
xorl (%rdx), %esi
LBB0_2:
testb $64, %dil
je LBB0_4
## BB#3:
xorl 4(%rdx), %esi
LBB0_4:
testb $32, %dil
je LBB0_6
## BB#5:
xorl 8(%rdx), %esi
LBB0_6:
testb $16, %dil
je LBB0_8
## BB#7:
xorl 12(%rdx), %esi
LBB0_8:
testb $8, %dil
je LBB0_10
## BB#9:
xorl 16(%rdx), %esi
LBB0_10:
testb $4, %dil
je LBB0_12
## BB#11:
xorl 20(%rdx), %esi
LBB0_12:
testb $2, %dil
je LBB0_14
## BB#13:
xorl 24(%rdx), %esi
LBB0_14:
testb $1, %dil
je LBB0_16
## BB#15:
xorl 28(%rdx), %esi
LBB0_16:
movl %esi, %eax
popq %rbp
retq
.cfi_endproc
.globl __Z4foo2hjPKj
.align 4, 0x90
__Z4foo2hjPKj: ## @_Z4foo2hjPKj
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
testb $-128, %dil
je LBB1_2
## BB#1:
xorl (%rdx), %esi
LBB1_2:
testb $64, %dil
je LBB1_4
## BB#3:
xorl 4(%rdx), %esi
LBB1_4:
testb $32, %dil
je LBB1_6
## BB#5:
xorl 8(%rdx), %esi
LBB1_6:
testb $16, %dil
je LBB1_8
## BB#7:
xorl 12(%rdx), %esi
LBB1_8:
testb $8, %dil
je LBB1_10
## BB#9:
xorl 16(%rdx), %esi
LBB1_10:
testb $4, %dil
je LBB1_12
## BB#11:
xorl 20(%rdx), %esi
LBB1_12:
testb $2, %dil
je LBB1_14
## BB#13:
xorl 24(%rdx), %esi
LBB1_14:
movl %esi, %eax
popq %rbp
retq
.cfi_endproc
It would be interesting to see if the compiler also performs as well with the version of the code that uses logical operations rather than conditional statements.
given:
// Branch-free ("bit hack") formulation of the same computation.
// `!(uMsgByte & mask)` evaluates to 0 when the bit is set, 1 when it is
// clear; subtracting 1 produces an all-ones mask (bit set) or zero
// (bit clear), which gates the table entry before the XOR into crc.
// The pointer increments are unconditional, matching foo1.
std::uint32_t logical1(std::uint8_t uMsgByte,
std::uint32_t crc,
const std::uint32_t* pChkTableOffset)
{
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x8) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x4) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x2) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x1) - 1);
return crc;
}
the resulting machine code is:
8 lots of:
movl %edi, %eax ; get uMsgByte into eax
shll $24, %eax ; shift it left 24 bits so that bit 7 is in the sign bit
sarl $31, %eax ; arithmetic shift right to copy the sign bit into all other bits
andl (%rdx), %eax ; and the result with the value from the table
xorl %esi, %eax ; exclusive-or into crc
so the short answer is yes - it performs very well (eliding the redundant increments of pChkTableOffset)
Is it faster? who knows. Probably not measurably - the number of memory fetches is the same in both cases. The compiler can work out whether it's better to avoid branches or not much better than you can (depending on the architecture the compiler is optimising for).
Is it more elegant and readable? For myself, no. It's the kind of code I used to write when compilers were poor at optimising, cycles were scarce, and branches were far more expensive than arithmetic. [The original bullet list appears to have been lost in extraction.] None of these apply any more.
If this checksum is indeed a CRC, there is a much more efficient way to implement it.
Assuming it's a CRC16:
Header:
// Table-driven CRC-16: the constructor precomputes a 256-entry lookup
// table for the given polynomial, so CalcCRC touches each message byte
// exactly once instead of iterating its eight bits.
class CRC16
{
public:
// Builds the 256-entry lookup table for `poly` (reflected/LSB-first form).
CRC16(const unsigned short poly);
// Returns the CRC of `len` bytes starting at `pbuf`.
unsigned short CalcCRC(unsigned char * pbuf, int len);
protected:
// One precomputed CRC contribution per possible byte value.
unsigned short CRCTab[256];
// Returns `swap` with its low `bits` bits reversed.
unsigned long SwapBits(unsigned long swap, int bits);
};
Implementation:
// Precompute the table: entry i holds the CRC contribution of byte value i.
// The byte is bit-reflected on the way in and the 16-bit result reflected
// on the way out, so CalcCRC can use the right-shifting (LSB-first) update.
CRC16::CRC16(const unsigned short poly)
{
for(int i = 0; i < 256; i++) {
// Place the reflected byte in the top 8 bits of the CRC register.
CRCTab[i] = SwapBits(i, 8) << 8;
// Eight steps of polynomial division (MSB-first): shift left and
// XOR in `poly` whenever the top bit falls out.
for(int j = 0; j < 8; j++)
CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
// Reflect the finished 16-bit remainder back.
CRCTab[i] = SwapBits(CRCTab[i], 16);
}
}
// Return `swap` with its low `bits` bits reversed (bit 0 <-> bit bits-1).
unsigned long CRC16::SwapBits(unsigned long swap, int bits)
{
    unsigned long reversed = 0;
    // Pop bits off the bottom of `swap` and push them onto the bottom of
    // `reversed`; after `bits` iterations their order is exactly reversed.
    for(int bit = 0; bit < bits; ++bit) {
        reversed = (reversed << 1) | (swap & 1);
        swap >>= 1;
    }
    return reversed;
}
// Compute the CRC of `len` bytes at `pbuf`: one table lookup per message
// byte (standard reflected table-driven CRC-16 update).
unsigned short CRC16::CalcCRC(unsigned char * pbuf, int len)
{
    unsigned short crc = 0;
    while(len--) {
        // Index the table with the current byte folded into the low half
        // of the register, then shift the register down one byte.
        const unsigned char index = (crc & 0xFF) ^ *pbuf++;
        crc = (crc >> 8) ^ CRCTab[index];
    }
    return crc;
}
As you can see, each byte of the message is used only once, instead of 8 times.
There is a similar implementation for CRC8.
Out of interest, extending alain's excellent suggestion of precomputing the CRC table, it occurs to me that this class can be modified to take advantage of C++14's constexpr:
#include <iostream>
#include <utility>
#include <string>
// constexpr version of the CRC16 class: the entire 256-entry table is
// computed at compile time (C++14 relaxed constexpr + integer_sequence),
// so a `constexpr CRC16` object becomes read-only static data.
class CRC16
{
private:
// the storage for the CRC table, to be computed at compile time
unsigned short CRCTab[256];
// private template-expanded constructor allows folded calls to SwapBits at compile time
// (the pack expansion initialises CRCTab[i] = SwapBits(i, 8) << 8 for i = 0..255,
// i.e. the first step of the runtime constructor, done for every entry at once)
template<std::size_t...Is>
constexpr CRC16(const unsigned short poly, std::integer_sequence<std::size_t, Is...>)
: CRCTab { SwapBits(Is, 8) << 8 ... }
{}
// swap bits at compile time
// returns `swap` with its low `bits` bits reversed
static constexpr unsigned long SwapBits(unsigned long swap, int bits)
{
unsigned long r = 0;
for(int i = 0; i < bits; i++) {
if(swap & 1) r |= 1 << (bits - i - 1);
swap >>= 1;
}
return r;
}
public:
// public constexpr defers to private template expansion...
constexpr CRC16(const unsigned short poly)
: CRC16(poly, std::make_index_sequence<256>())
{
//... and then modifies the table - at compile time
// (the same eight polynomial-division steps and final reflection as the
// runtime table-building constructor)
for(int i = 0; i < 256; i++) {
for(int j = 0; j < 8; j++)
CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
CRCTab[i] = SwapBits(CRCTab[i], 16);
}
}
// made const so that we can instantiate constexpr CRC16 objects
// Returns the CRC of `len` bytes at `pbuf`: one table lookup per byte.
unsigned short CalcCRC(const unsigned char * pbuf, int len) const
{
unsigned short r = 0;
while(len--) r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ *(pbuf++)];
return r;
}
};
int main()
{
    // the lookup table is built entirely at compile time
    constexpr CRC16 crctab(1234);
    using namespace std;
    // compute the CRC of a sample message
    const string message = "hello world"s;
    const auto checksum =
        crctab.CalcCRC(reinterpret_cast<const unsigned char*>(message.data()),
                       message.size());
    cout << checksum << endl;
    return 0;
}
Then the constructor of CRC16(1234) pleasingly boils down to this:
__ZZ4mainE6crctab:
.short 0 ## 0x0
.short 9478 ## 0x2506
.short 18956 ## 0x4a0c
.short 28426 ## 0x6f0a
.short 601 ## 0x259
.short 10079 ## 0x275f
.short 18517 ## 0x4855
.short 27987 ## 0x6d53
... etc.
and the calculation of the CRC of the entire string becomes this:
leaq __ZZ4mainE6crctab(%rip), %rdi ; <- referencing const data :)
movzwl (%rdi,%rdx,2), %edx
jmp LBB0_8
LBB0_4:
xorl %edx, %edx
jmp LBB0_11
LBB0_6:
xorl %edx, %edx
LBB0_8: ## %.lr.ph.i.preheader.split
testl %esi, %esi
je LBB0_11
## BB#9:
leaq __ZZ4mainE6crctab(%rip), %rsi
.align 4, 0x90
LBB0_10: ## %.lr.ph.i
## =>This Inner Loop Header: Depth=1
movzwl %dx, %edi
movzbl %dh, %edx # NOREX
movzbl %dil, %edi
movzbl (%rcx), %ebx
xorq %rdi, %rbx
xorw (%rsi,%rbx,2), %dx
movzwl %dx, %edi
movzbl %dh, %edx # NOREX
movzbl %dil, %edi
movzbl 1(%rcx), %ebx
xorq %rdi, %rbx
xorw (%rsi,%rbx,2), %dx
addq $2, %rcx
addl $-2, %eax
jne LBB0_10
LBB0_11:
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.