[英]Is there a more efficient way of expanding a char to an uint64_t?
我想通過重復每個位8次將unsigned char
擴展到uint64_t
。 例如
char -> uint64_t
0x00 -> 0x00
0x01 -> 0xFF
0x02 -> 0xFF00
0x03 -> 0xFFFF
0xAA -> 0xFF00FF00FF00FF00
我目前有以下實現,使用位移來測試是否設置了一個位,以實現此目的:
#include <stdint.h>
#include <inttypes.h>
#define BIT_SET(var, pos) ((var) & (1 << (pos)))

/*
 * Expand every bit of `a` into a whole byte of the result: bit i of the
 * input selects byte i of the output, which is 0xFF when the bit is set and
 * 0x00 otherwise (for example 0x03 -> 0xFFFF).
 */
static uint64_t inflate(unsigned char a)
{
    const uint64_t byte_ones = 0xFF;
    uint64_t expanded = 0;
    int bit = 0;

    while (bit < 8) {
        if (BIT_SET(a, bit)) {
            expanded |= byte_ones << (bit * 8);
        }
        bit++;
    }
    return expanded;
}
但是,我還是個C語言新手,這種擺弄單個位的做法讓我有點不放心,或許會有更好的(即更高效的)方法。
編輯添加
好的,在嘗試了查找表方案之后,結果如下。 但請記住,我沒有單獨測試這個例程,而是把它作為更大函數的一部分來測試(確切地說是二進制矩陣的乘法),因此這可能會影響計時結果。 在我的計算機上,對一百萬個8x8矩陣做乘法,並使用以下命令編譯:
gcc -O2 -Wall -std=c99 foo.c
我有
./a.out original
real 0m0.127s
user 0m0.124s
sys 0m0.000s
./a.out table_lookup
real 0m0.012s
user 0m0.012s
sys 0m0.000s
所以至少在我的機器上(虛擬機64位Linux Mint我應該提到),表查找方法似乎提供了大約10倍的加速,所以我接受這個作為答案。
如果您正在尋找效率,請使用查找表:256個條目的靜態數組,每個條目都已保存所需的結果。 您可以使用上面的代碼生成它。
在選定的體系結構(SSE,Neon)中,存在快速向量操作,可以加速此任務或旨在實現此目的。 如果沒有特別說明,建議的查找表方法既快又便攜。
如果2k大小是個問題,可以模擬並行向量算術運算:
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result,
 * using only 64-bit multiply, mask, add and shift operations.
 */
static uint64_t inflate_parallel(unsigned char a) {
    const uint64_t every_byte_lsb = 0x0101010101010101ULL;
    const uint64_t diagonal = 0x8040201008040201;
    const uint64_t push_to_msb = 0x00406070787c7e7f;

    /* Copy the input into all eight bytes: A5 -> A5 A5 A5 A5 A5 A5 A5 A5. */
    uint64_t lanes = a * every_byte_lsb;
    /* Keep only bit k inside byte k: -> 80 00 20 00 00 04 00 01. */
    lanes = lanes & diagonal;
    /* The addend carries each surviving bit up to bit 7 of its own byte
     * without spilling into the next one: -> 80 40 80 70 78 80 7e 80. */
    lanes = lanes + push_to_msb;
    /* Move each byte's top bit down to bit 0 and discard everything else. */
    lanes = (lanes >> 7) & every_byte_lsb;
    /* 0x01 * 255 == 0xFF, so each flagged byte becomes all ones. */
    return lanes * 255;
}
編輯 :2 ^ 31次迭代,(四次展開以減輕循環評估)
time ./parallel time ./original time ./lookup
real 0m2.038s real 0m14.161s real 0m1.436s
user 0m2.030s user 0m14.120s user 0m1.430s
sys 0m0.000s sys 0m0.000s sys 0m0.000s
這大約是7倍的加速,而查找表提供了大約10倍的加速
在擔心優化代碼之前,您應該分析代碼的作用。
在我本地的編譯器上,您的代碼會被完全內聯並展開:當值未知時變為8組針對常量的 test + or 指令,當值在編譯時已知時則直接變為一個常量。 我可以通過去掉一些分支來略微改進它,但編譯器自己已經做得相當不錯了。
然后優化循環有點毫無意義。 表查找可能更有效,但可能會阻止編譯器自行進行優化。
如果你願意花費256 * 8 = 2kB的內存(即在內存方面效率降低,但在所需的CPU周期方面效率更高),最有效的方法是預先計算查找表:
/*
 * Expand every bit of `a` into a whole byte of the result via a 256-entry
 * lookup table (256 * 8 bytes = 2 kB): bit i of the input selects byte i of
 * the output, 0xFF when set and 0x00 otherwise.
 *
 * The table is generated at compile time by the helper macros below so that
 * all 256 entries are present; a hand-written table with elided rows would
 * silently return 0 for every input whose entry was left out.
 */
static uint64_t inflate(unsigned char a) {
/* Byte i of the entry for n: 0xFF when bit i of n is set, otherwise 0. */
#define INFLATE_BYTE(n, i) \
    ((((n) >> (i)) & 1) ? ((uint64_t)0xFF << (8 * (i))) : (uint64_t)0)
/* Complete 64-bit table entry for the constant n. */
#define INFLATE_ENTRY(n) \
    (INFLATE_BYTE(n, 0) | INFLATE_BYTE(n, 1) | INFLATE_BYTE(n, 2) | INFLATE_BYTE(n, 3) | \
     INFLATE_BYTE(n, 4) | INFLATE_BYTE(n, 5) | INFLATE_BYTE(n, 6) | INFLATE_BYTE(n, 7))
/* 4, 16 and 64 consecutive entries starting at n. */
#define INFLATE_ROW4(n) \
    INFLATE_ENTRY(n), INFLATE_ENTRY((n) + 1), INFLATE_ENTRY((n) + 2), INFLATE_ENTRY((n) + 3)
#define INFLATE_ROW16(n) \
    INFLATE_ROW4(n), INFLATE_ROW4((n) + 4), INFLATE_ROW4((n) + 8), INFLATE_ROW4((n) + 12)
#define INFLATE_ROW64(n) \
    INFLATE_ROW16(n), INFLATE_ROW16((n) + 16), INFLATE_ROW16((n) + 32), INFLATE_ROW16((n) + 48)

    static const uint64_t charToUInt64[256] = {
        INFLATE_ROW64(0), INFLATE_ROW64(64), INFLATE_ROW64(128), INFLATE_ROW64(192)
    };

#undef INFLATE_ROW64
#undef INFLATE_ROW16
#undef INFLATE_ROW4
#undef INFLATE_ENTRY
#undef INFLATE_BYTE

    return charToUInt64[a];
}
通過將源的每個位移動到相應目標字節的lsb(0→0,1→8,2→16,......,7→56),然后將每個lsb擴展到覆蓋整個字節,就可以實現所需的功能;后一步可以通過乘以0xff(255)輕松完成。 我們不必用移位把各個位逐一移動到位再組合結果,而是可以使用整數乘法並行地移動多個位。 為了防止自重疊,只能以這種方式移動源的低七位,源的msb則需要通過移位單獨移動。
這導致以下ISO-C99實現:
#include <stdint.h>
/*
 * Expand each bit of the input into one byte of the output: bit i of `a`
 * becomes byte i of the result, 0xff when set and 0x00 otherwise.
 */
uint64_t fast_inflate (uint8_t a)
{
    /* Bits 0, 7, 14, ..., 42 set: multiplying by this shifts source bit i
     * (i < 7) to bit 8*i, all seven at once. */
    const uint64_t spread7 = 0x0000040810204081ULL;
    /* Least significant bit of every byte. */
    const uint64_t byte_lsb = 0x0101010101010101ULL;

    /* Only the low seven bits go through the multiply; the top bit is moved
     * separately by a shift so that it ends up at bit 56. */
    const uint64_t low7_spread = (uint64_t)(a & 0x7f) * spread7;
    const uint64_t msb_moved = (uint64_t)a << 49;

    /* Keep only the lsb of each byte. */
    uint64_t r = (low7_spread + msb_moved) & byte_lsb;

    /* Fill each byte with its lsb. */
    r *= 0xff;
    return r;
}
#define BIT_SET(var, pos) ((var) & (1 << (pos)))

/*
 * Reference implementation: expand every bit of `a` into a whole byte of
 * the result (bit i -> byte i, 0xFF when set, 0x00 otherwise).
 */
static uint64_t inflate(unsigned char a)
{
    uint64_t acc = 0;

    /* Walk from the top bit down, shifting earlier bytes up as we go, so the
     * byte for bit 7 ends in the most significant position. */
    for (int pos = 7; pos >= 0; pos--) {
        acc <<= 8;
        if (BIT_SET(a, pos)) {
            acc |= 0xFF;
        }
    }
    return acc;
}
#include <stdio.h>
#include <stdlib.h>
/*
 * Exhaustive self-test: compares fast_inflate() against the reference
 * inflate() for every 8-bit input. Prints the first mismatch and returns
 * EXIT_FAILURE, or prints "test passed" and returns EXIT_SUCCESS.
 */
int main (void)
{
    uint8_t a = 0;
    do {
        uint64_t res = fast_inflate (a);
        uint64_t ref = inflate (a);
        if (res != ref) {
            /* uint64_t is not necessarily unsigned long long, so the
             * arguments are cast to the exact types %x / %llx expect. */
            printf ("error @ %02x: fast_inflate = %016llx inflate = %016llx\n",
                    (unsigned)a, (unsigned long long)res, (unsigned long long)ref);
            return EXIT_FAILURE;
        }
        a++;
    } while (a); /* a wraps to 0 after 0xff, which ends the loop */
    printf ("test passed\n");
    return EXIT_SUCCESS;
}
大多數x64編譯器都會以簡單的方式編譯fast_inflate()
。 例如,我的英特爾編譯器版本13.1.3.198,當使用/Ox
構建時,會在下面生成11指令序列。 注意,最后乘以0xff
實際上是作為移位和減法序列實現的。
; Compiler-generated x64 code for fast_inflate(); the input byte is read from cl
; and the 64-bit result is left in rax.
fast_inflate PROC
mov rdx, 040810204081H          ; rdx = spread7 (bits 0, 7, 14, ..., 42 set)
movzx r9d, cl                   ; r9 = input byte, zero-extended
and ecx, 127                    ; ecx = input & 0x7f (low seven bits)
mov r8, 0101010101010101H       ; r8 = byte_lsb mask
imul rdx, rcx                   ; rdx = (input & 0x7f) * spread7
shl r9, 49                      ; r9 = input << 49
add r9, rdx                     ; r9 = sum of the two partial results
and r9, r8                      ; keep only the lsb of every byte
mov rax, r9                     ; rax = copy of the masked value r
shl rax, 8                      ; rax = r << 8
sub rax, r9                     ; rax = (r << 8) - r, i.e. r * 0xff
ret
這是另一種僅使用簡單算術的方法:
/*
 * Expand every bit of `value` into a whole byte (0x00 or 0xFF) of the
 * result using only shifts, ORs, one mask and one subtraction.
 */
uint64_t inflate_chqrlie(uint8_t value) {
    const uint64_t byte_lsb = 0x0101010101010101ULL;
    uint64_t bits = value;

    /* Three doubling steps leave copies of the input at offsets
     * 0, 7, 14, ..., 49, so bit i also appears at position 8*i. */
    bits |= bits << 28;
    bits |= bits << 14;
    bits |= bits << 7;

    /* Keep just the lsb of every byte. */
    bits &= byte_lsb;

    /* 256*bits - bits == 255*bits: each 0x01 byte becomes 0xFF. */
    return (bits << 8) - bits;
}
另一個來自phuclv的方法非常高效且簡潔,它使用乘法和掩碼:
/*
 * Expand every bit of `b` into a whole byte of the result: bit i of the
 * input selects byte i of the output, 0xFF when set and 0x00 otherwise.
 *
 * The multiply-and-mask step alone only yields a 0/1 flag per byte, and in
 * reversed byte order (bit i ends up in byte 7 - i), so the flags are
 * byte-swapped and widened to 0xFF before being returned.
 */
static uint64_t inflate_phuclv(uint8_t b) {
    const uint64_t MAGIC = 0x8040201008040201ULL;
    const uint64_t MASK = 0x8080808080808080ULL;

    /* Bit i of b lands on bit 7 of byte (7 - i); shift it down to bit 0. */
    uint64_t flags = ((MAGIC * b) & MASK) >> 7;

    /* Reverse the byte order so that the flag for bit i sits in byte i. */
    flags = ((flags & 0x00FF00FF00FF00FFULL) << 8) | ((flags >> 8) & 0x00FF00FF00FF00FFULL);
    flags = ((flags & 0x0000FFFF0000FFFFULL) << 16) | ((flags >> 16) & 0x0000FFFF0000FFFFULL);
    flags = (flags << 32) | (flags >> 32);

    /* Each byte is 0 or 1, so multiplying by 0xFF fills it without carries. */
    return flags * 0xFF;
}
還有一個小查找表:
/* Expansion of a 4-bit value: bit i of the index selects byte i (0x00 or 0xFF). */
static const uint32_t lut_4_32[16] = {
    0x00000000, 0x000000FF,
    0x0000FF00, 0x0000FFFF,
    0x00FF0000, 0x00FF00FF,
    0x00FFFF00, 0x00FFFFFF,
    0xFF000000, 0xFF0000FF,
    0xFF00FF00, 0xFF00FFFF,
    0xFFFF0000, 0xFFFF00FF,
    0xFFFFFF00, 0xFFFFFFFF,
};

/*
 * Expand every bit of `b` into a whole byte of the result by looking up the
 * low and the high nibble separately in the 16-entry table.
 */
static uint64_t inflate_lut32(uint8_t b) {
    const uint64_t low_half = lut_4_32[b & 15];
    const uint64_t high_half = lut_4_32[b >> 4];

    return (high_half << 32) | low_half;
}
我編寫了一個基准測試程序來確定我系統上不同方法的相對性能(x86_64-apple-darwin16.7.0,Apple LLVM 9.0.0版(clang-900.0.39.2),clang -O3)。
結果顯示我的函數inflate_chqrlie
比天真的方法更快,但比其他復雜的版本慢,所有這些都被inflate_lut64
在緩存最佳情況下使用2KB的查找表擊敗。
使用更小的查找表(64字節而不是2KB)的函數inflate_lut32
沒有inflate_lut64
那么快,但似乎是32位體系結構的一個很好的折衷方案,因為它仍然比所有其他替代方案快得多。
64位基准:
inflate: 0, 848.316ms
inflate_Curd: 0, 845.424ms
inflate_chqrlie: 0, 371.502ms
fast_inflate_njuffa: 0, 288.669ms
inflate_parallel1: 0, 242.827ms
inflate_parallel2: 0, 315.105ms
inflate_parallel3: 0, 363.379ms
inflate_parallel4: 0, 304.051ms
inflate_parallel5: 0, 301.205ms
inflate_phuclv: 0, 109.130ms
inflate_lut32: 0, 197.178ms
inflate_lut64: 0, 25.160ms
32位基准:
inflate: 0, 1451.464ms
inflate_Curd: 0, 955.509ms
inflate_chqrlie: 0, 385.036ms
fast_inflate_njuffa: 0, 463.212ms
inflate_parallel1: 0, 468.070ms
inflate_parallel2: 0, 570.107ms
inflate_parallel3: 0, 511.741ms
inflate_parallel4: 0, 601.892ms
inflate_parallel5: 0, 506.695ms
inflate_phuclv: 0, 192.431ms
inflate_lut32: 0, 140.968ms
inflate_lut64: 0, 28.776ms
這是代碼:
#include <stdio.h>
#include <stdint.h>
#include <time.h>
/*
 * Baseline implementation: expand every bit of `a` into a whole byte of the
 * result (bit i -> byte i, 0xFF when set, 0x00 otherwise).
 */
static uint64_t inflate(unsigned char a) {
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
    const uint64_t full_byte = 0xFF;
    uint64_t out = 0;

    for (int bit = 0, shift = 0; bit < 8; bit++, shift += 8) {
        out |= BIT_SET(a, bit) ? (full_byte << shift) : 0;
    }
    return out;
}
/*
 * Expand every bit of `a` into a whole byte of the result by consuming the
 * input from its lowest bit upwards while a byte-wide mask walks up the
 * output in step.
 */
static uint64_t inflate_Curd(unsigned char a) {
    uint64_t out = 0;

    /* lane becomes 0 once it has been shifted past the eighth byte, which
     * ends the loop after exactly eight rounds. */
    for (uint64_t lane = 0xFF; lane != 0; lane <<= 8, a >>= 1) {
        if (a & 1) {
            out |= lane;
        }
    }
    return out;
}
/*
 * Expand every bit of `value` into a whole byte (0x00 or 0xFF) of the
 * result using shift/OR doubling followed by a mask.
 */
uint64_t inflate_chqrlie(uint8_t value) {
    uint64_t spread = value;

    /* Copies of the input end up at offsets 0, 7, 14, ..., 49, which puts
     * bit i at position 8*i among them. */
    spread = spread | (spread << 28);
    spread = spread | (spread << 14);
    spread = spread | (spread << 7);

    const uint64_t lsbs = spread & 0x0101010101010101ULL;

    /* (lsbs << 8) - lsbs is lsbs * 255: every 0x01 byte turns into 0xFF. */
    return (lsbs << 8) - lsbs;
}
/*
 * Expand each bit of `a` into one byte of the result (0x00 or 0xff): the low
 * seven bits are spread by a single multiplication, the top bit by a shift.
 */
uint64_t fast_inflate_njuffa(uint8_t a) {
    /* Bits 0, 7, 14, 21, 28, 35 and 42. */
    const uint64_t spread7 = 0x0000040810204081ULL;
    /* Bits 0, 8, 16, ..., 56: the lsb of every byte. */
    const uint64_t byte_lsb = 0x0101010101010101ULL;

    /* Spread the bits to the lsbs of each byte, then drop all other bits. */
    const uint64_t lsbs =
        ((uint64_t)(a & 0x7f) * spread7 + ((uint64_t)a << 49)) & byte_lsb;

    /* Fill each byte with its lsb. */
    return lsbs * 0xff;
}
// Aki Suuihkonen: 1.265
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result
 * by treating the 64-bit word as eight byte lanes.
 */
static uint64_t inflate_parallel1(unsigned char a) {
    const uint64_t lane_lsb = 0x0101010101010101ULL;

    /* Replicate the byte into every lane (A5 -> A5 A5 A5 A5 A5 A5 A5 A5),
     * then keep only bit k in lane k (-> 80 00 20 00 00 04 00 01). */
    uint64_t lanes = (a * lane_lsb) & 0x8040201008040201;

    /* The per-lane addend turns a kept bit into 0x80 and leaves bit 7 clear
     * otherwise (-> 80 40 80 70 78 80 7e 80), so each lane's MSB is correct. */
    lanes += 0x00406070787c7e7f;

    /* Move the MSB of each lane to its LSB and isolate it. */
    lanes = (lanes >> 7) & lane_lsb;

    /* Widen each 0x01 to 0xFF. */
    return lanes * 255;
}
// By seizet and then combine: 1.583
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result.
 * Odd and even source bits are positioned by two independent multiplies
 * and merged afterwards.
 */
static uint64_t inflate_parallel2(unsigned char a) {
    /* Source bits 1, 3, 5, 7 moved to the lsb of bytes 1, 3, 5, 7. */
    const uint64_t odd_bits = (a * 0x0002000800200080ULL) & 0x0100010001000100ULL;
    /* Source bits 0, 2, 4, 6 moved to the lsb of bytes 0, 2, 4, 6. */
    const uint64_t even_bits = (a * 0x0000040010004001ULL) & 0x0001000100010001ULL;

    /* Widen each 0x01 byte to 0xFF. */
    return (odd_bits | even_bits) * 255;
}
// Stay in 32 bits as much as possible: 1.006
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result,
 * handling the low and the high nibble as two independent 32-bit values.
 *
 * All arithmetic uses unsigned operands: with plain int constants the final
 * "* 255" can reach 0xFFFFFFFF, which overflows a 32-bit signed int
 * (undefined behaviour) whenever bit 3 of a nibble is set.
 */
static uint64_t inflate_parallel3(unsigned char a) {
    /* Multiplying a nibble by 0x00204081 puts its bit i at position 8*i. */
    uint32_t vector1 = ((((uint32_t)a & 0x0Fu) * 0x00204081u) & 0x01010101u) * 255u;
    uint32_t vector2 = (((((uint32_t)a & 0xF0u) >> 4) * 0x00204081u) & 0x01010101u) * 255u;
    /* High nibble fills the upper four bytes, low nibble the lower four. */
    return (((uint64_t)vector2) << 32) | vector1;
}
// Do the common computation in 64 bits: 0.915
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result.
 * Each nibble is spread in 32 bits; masking and widening are then done once
 * on the combined 64-bit value.
 */
static uint64_t inflate_parallel4(unsigned char a) {
    const uint64_t byte_lsb = 0x0101010101010101ULL;
    /* Multiplying a nibble by 0x00204081 puts its bit i at position 8*i. */
    const uint32_t low_spread = (a & 0x0F) * 0x00204081;
    const uint32_t high_spread = ((a >> 4) & 0x0F) * 0x00204081;

    uint64_t lsbs = ((uint64_t)high_spread << 32) | low_spread;
    lsbs &= byte_lsb;

    /* Widen each 0x01 byte to 0xFF. */
    return lsbs * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result.
 * The low nibble is spread in 32 bits; the high nibble is spread directly
 * into the upper half of a 64-bit value.
 */
static uint64_t inflate_parallel5(unsigned char a) {
    const uint64_t byte_lsb = 0x0101010101010101ULL;
    /* 0x00204081 shifted left by 28: the high nibble already sits 4 bits up
     * inside `a`, so together that places its spread bits 32 positions up. */
    const uint64_t high_multiplier = (uint64_t)0x00204081 << 28;

    const uint32_t low_spread = (a & 0x0F) * 0x00204081;
    const uint64_t high_spread = (a & 0xF0) * high_multiplier;

    const uint64_t lsbs = (low_spread | high_spread) & byte_lsb;

    /* Widen each 0x01 byte to 0xFF. */
    return lsbs * 255;
}
/*
 * Expand every bit of `b` into a whole byte of the result: bit i of the
 * input selects byte i of the output, 0xFF when set and 0x00 otherwise.
 *
 * The multiply-and-mask step alone only yields a 0/1 flag per byte, and in
 * reversed byte order (bit i ends up in byte 7 - i), so the flags are
 * byte-swapped and widened to 0xFF before being returned.
 */
static uint64_t inflate_phuclv(uint8_t b) {
    const uint64_t MAGIC = 0x8040201008040201ULL;
    const uint64_t MASK = 0x8080808080808080ULL;

    /* Bit i of b lands on bit 7 of byte (7 - i); shift it down to bit 0. */
    uint64_t flags = ((MAGIC * b) & MASK) >> 7;

    /* Reverse the byte order so that the flag for bit i sits in byte i. */
    flags = ((flags & 0x00FF00FF00FF00FFULL) << 8) | ((flags >> 8) & 0x00FF00FF00FF00FFULL);
    flags = ((flags & 0x0000FFFF0000FFFFULL) << 16) | ((flags >> 16) & 0x0000FFFF0000FFFFULL);
    flags = (flags << 32) | (flags >> 32);

    /* Each byte is 0 or 1, so multiplying by 0xFF fills it without carries. */
    return flags * 0xFF;
}
/* Expansion of a 4-bit value: bit i of the index selects byte i (0x00 or 0xFF). */
static const uint32_t lut_4_32[16] = {
    0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
    0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF,
};

/*
 * Expand every bit of `b` into a whole byte of the result using two lookups
 * in the 16-entry nibble table: one for the lower four bytes, one for the
 * upper four.
 */
static uint64_t inflate_lut32(uint8_t b) {
    uint64_t expanded = lut_4_32[b >> 4];

    expanded <<= 32;
    expanded |= lut_4_32[b & 15];
    return expanded;
}
/* One precomputed result per input byte; zero-initialised until it is filled in. */
static uint64_t lut_8_64[256];

/* Return the table entry stored for `b`. */
static uint64_t inflate_lut64(uint8_t b) {
    const uint64_t *entry = lut_8_64 + b;

    return *entry;
}
#define ITER 1000000
/*
 * Benchmark driver: fills lut_8_64 from inflate(), then times every variant
 * over ITER passes across all 256 input bytes and prints one line per
 * variant with its name, the XOR of all returned values and the elapsed
 * CPU time in milliseconds.
 */
int main(void) {
    clock_t t;
    uint64_t x;

    /* Build the 256-entry table used by inflate_lut64(). */
    for (int b = 0; b < 256; b++)
        lut_8_64[b] = inflate((uint8_t)b);

/* Time one variant. Every result is folded into x, which is printed, so the
 * calls have an observable effect. x is cast for printf because uint64_t is
 * not necessarily unsigned long long, the type %llu expects. */
#define TEST(func) do { \
        t = clock(); \
        x = 0; \
        for (int i = 0; i < ITER; i++) { \
            for (int b = 0; b < 256; b++) \
                x ^= func((uint8_t)b); \
        } \
        t = clock() - t; \
        printf("%20s: %llu, %.3fms\n", \
               #func, (unsigned long long)x, t * 1000.0 / CLOCKS_PER_SEC); \
    } while (0)

    TEST(inflate);
    TEST(inflate_Curd);
    TEST(inflate_chqrlie);
    TEST(fast_inflate_njuffa);
    TEST(inflate_parallel1);
    TEST(inflate_parallel2);
    TEST(inflate_parallel3);
    TEST(inflate_parallel4);
    TEST(inflate_parallel5);
    TEST(inflate_phuclv);
    TEST(inflate_lut32);
    TEST(inflate_lut64);
    return 0;
}
兩個小優化:
一個用於測試輸入中的位(a將被銷毀,但這無關緊要)
另一個用於移動掩模。
/*
 * Expand every bit of `a` into a whole byte of the result (bit i -> byte i,
 * 0xFF when set, 0x00 otherwise). The input is consumed from its lowest bit
 * upwards while a byte-wide mask moves up the output in step.
 */
static uint64_t inflate(unsigned char a)
{
    uint64_t byte_mask = 0xFF;
    uint64_t out = 0;
    int remaining = 8;

    while (remaining-- > 0) {
        if ((a & 1) != 0) {
            out |= byte_mask;
        }
        byte_mask <<= 8;
        a >>= 1;
    }
    return out;
}
也許你也可以用'while(a)'循環替換'for(int i = 0; i <8; i ++)'循環。 但是,只有當右移 a >>= 1 按無符號方式執行時才有效(據我所知,C標准允許編譯器按有符號或無符號方式執行)。 否則在某些情況下你會得到一個無限循環。
編輯:
為了查看結果,我使用gcc -std=c99 -S source.c
編譯了兩個變體。 快速瀏覽一下生成的匯編輸出可以看出,上面所示的優化使指令數減少了約1/3,其中大部分在循環內。
與@Aki的回答主題相同的幾種變體。 其中一些在我這里表現更好,但這可能取決於你的編譯器和目標機器(它們應該比Aki的函數更適合超標量處理器,即使它們的工作量更多,因為數據依賴性較小)
// Aki Suuihkonen: 1.265
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result
 * with a single dependent chain of 64-bit operations.
 */
static uint64_t inflate_parallel1(unsigned char a) {
    const uint64_t byte_lsb = 0x0101010101010101ULL;
    /* Copy the input into all eight bytes. */
    const uint64_t replicated = a * byte_lsb;
    /* Keep only bit k inside byte k. */
    const uint64_t one_bit_per_byte = replicated & 0x8040201008040201;
    /* Carry each kept bit up to bit 7 of its own byte. */
    const uint64_t msb_flags = one_bit_per_byte + 0x00406070787c7e7f;
    /* Bring bit 7 of each byte down to bit 0 and isolate it. */
    const uint64_t lsb_flags = (msb_flags >> 7) & byte_lsb;

    /* Widen each 0x01 byte to 0xFF. */
    return lsb_flags * 255;
}
// By seizet and then combine: 1.583
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result,
 * using one multiply for the odd source bits and one for the even ones.
 */
static uint64_t inflate_parallel2(unsigned char a) {
    uint64_t lsbs;

    /* Bits 1, 3, 5, 7 -> lsb of bytes 1, 3, 5, 7. */
    lsbs = (a * 0x0002000800200080ULL) & 0x0100010001000100ULL;
    /* Bits 0, 2, 4, 6 -> lsb of bytes 0, 2, 4, 6. */
    lsbs |= (a * 0x0000040010004001ULL) & 0x0001000100010001ULL;

    /* Widen each 0x01 byte to 0xFF. */
    return lsbs * 255;
}
// Stay in 32 bits as much as possible: 1.006
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result,
 * handling the low and the high nibble as two independent 32-bit values.
 *
 * All arithmetic uses unsigned operands: with plain int constants the final
 * "* 255" can reach 0xFFFFFFFF, which overflows a 32-bit signed int
 * (undefined behaviour) whenever bit 3 of a nibble is set.
 */
static uint64_t inflate_parallel3(unsigned char a) {
    /* Multiplying a nibble by 0x00204081 puts its bit i at position 8*i. */
    uint32_t vector1 = ((((uint32_t)a & 0x0Fu) * 0x00204081u) & 0x01010101u) * 255u;
    uint32_t vector2 = (((((uint32_t)a & 0xF0u) >> 4) * 0x00204081u) & 0x01010101u) * 255u;
    /* High nibble fills the upper four bytes, low nibble the lower four. */
    return (((uint64_t)vector2) << 32) | vector1;
}
// Do the common computation in 64 bits: 0.915
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result:
 * both nibbles are spread in 32 bits, then masked and widened together in
 * 64 bits.
 */
static uint64_t inflate_parallel4(unsigned char a) {
    /* Multiplying a nibble by 0x00204081 puts its bit i at position 8*i. */
    uint32_t low = (a & 0x0F) * 0x00204081;
    uint32_t high = ((a >> 4) & 0x0F) * 0x00204081;
    uint64_t joined = (uint64_t)high << 32;

    joined |= low;
    joined &= 0x0101010101010101ULL;

    /* Widen each 0x01 byte to 0xFF. */
    return joined * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
/*
 * Expand every bit of `a` into a whole byte (0x00 or 0xFF) of the result:
 * the low nibble is spread in 32 bits, the high nibble with one 64-bit
 * multiply that also moves it into the upper four bytes.
 */
static uint64_t inflate_parallel5(unsigned char a) {
    uint32_t low = (a & 0x0F) * 0x00204081;
    /* The multiplier is 0x00204081 << 28; the nibble's own offset of 4 bits
     * inside `a` supplies the rest of the 32-bit displacement. */
    uint64_t high = (a & 0xF0) * ((uint64_t)0x00204081 << 28);
    uint64_t joined = high | low;

    joined &= 0x0101010101010101ULL;

    /* Widen each 0x01 byte to 0xFF. */
    return joined * 255;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.