如何使用Intel内部函数从8位整数数组构建32位整数？

Question

I have an array which consists of 32 bytes. 我有一个由32个字节组成的数组。 I need to build eight 4-byte integers out of this array. 我需要从该数组中构建8个4字节整数。 E.g. the four 8-bit ints 0x00,0x11,0x22,0x33 need to become one 32-bit int 0x00112233. 例如0x00,0x11,0x22,0x33这四个8位整数必须组成一个32位整数0x00112233。 I decided to use AVX instructions because I can load the whole array into a register with one instruction. 我决定使用AVX指令，因为我可以使用一条指令将整个数组加载到寄存器中。

Code I wrote: 我写的代码：

#include <stdio.h>
#include "immintrin.h"

typedef unsigned int        uint32_t;
typedef unsigned char       uint8_t;

/*
 * Demo: pack the 32 bytes of `block` into the eight 32-bit words of `m`
 * through one 256-bit register, then print both arrays.
 *
 * _mm256_set_epi8 takes its arguments from the most significant byte (31)
 * down to the least significant byte (0), so block[0] lands in byte 31 of
 * the register and block[31] in byte 0.  After the store, on little-endian
 * x86, m[7] is built from block[0..3] (0x00112233) and m[0] from
 * block[28..31] (0xccddeeff).
 *
 * Returns 0.
 */
int main(void) {
  const uint8_t block[32] __attribute((aligned(32))) = {
   0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
  ,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
  };
  uint32_t m[8] __attribute((aligned(32)));
  int i;

  __m256i ymm9 = _mm256_set_epi8(
        block[ 0],block[ 1],block[ 2],block[ 3],block[ 4],block[ 5],block[ 6],block[ 7],
        block[ 8],block[ 9],block[10],block[11],block[12],block[13],block[14],block[15],
        block[16],block[17],block[18],block[19],block[20],block[21],block[22],block[23],
        block[24],block[25],block[26],block[27],block[28],block[29],block[30],block[31]);

  /* The store intrinsic takes a __m256i pointer, not uint32_t *, so cast
     explicitly.  m is declared 32-byte aligned, as the aligned store needs. */
  _mm256_store_si256((__m256i *)m, ymm9);

  for (i = 0; i < 32; ++i) {
    printf("i=%d, 0x%02x\n",i,block[i]);
  }
  for (i = 0; i < 8; ++i) {
    printf("i=%d, 0x%08x\n",i,m[i]);
  }
  return 0;
}

Do you think it is optimal in terms of performance ? 您认为它在性能方面是最佳的吗？ Can it be done better and run faster ? 能做得更好并且运行得更快吗？ I use Linux @x86_64 and gcc 4.8.2. 我使用Linux @ x86_64和gcc 4.8.2。

I am a beginner in the world of Intel intrinsics. 我是Intel内部函数领域的初学者。 Thanks for your help. 谢谢你的帮助。

Answer 1

As usual, check the disassembly. 与往常一样，先检查反汇编。 It turns out that, with the compiler I used anyway, it relies on that data being a compile-time constant, and it rearranges it so that it can be loaded easily. 事实证明，至少在我使用的编译器上，它依赖于该数据是编译期常量，并对其进行了重新排列，以便可以轻松地加载。 If that is actually the case in your real code, this is fine (but then why not use an array of uints to begin with?). 如果实际代码中确实是这种情况，那没问题（但那样的话为什么不一开始就使用uint数组呢？）。 But if, as I suspect, this is just an example and the actual array will be variable, this is a disaster, just look at it: 但是，如果像我猜测的那样，这只是一个示例，而实际的数组内容是可变的，那将是一场灾难，请看：

movzx   eax, BYTE PTR [rsp+95]
xor ebx, ebx
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+93]
vmovd   xmm7, DWORD PTR [rsp]
vpinsrb xmm7, xmm7, BYTE PTR [rsp+94], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+91]
vmovd   xmm3, DWORD PTR [rsp]
vpinsrb xmm3, xmm3, BYTE PTR [rsp+92], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+89]
vmovd   xmm1, DWORD PTR [rsp]
vpinsrb xmm1, xmm1, BYTE PTR [rsp+90], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+87]
vmovd   xmm6, DWORD PTR [rsp]
vpunpcklwd  xmm3, xmm7, xmm3
vpinsrb xmm6, xmm6, BYTE PTR [rsp+88], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+85]
vmovd   xmm5, DWORD PTR [rsp]
vpinsrb xmm5, xmm5, BYTE PTR [rsp+86], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+83]
vmovd   xmm2, DWORD PTR [rsp]
vpunpcklwd  xmm1, xmm1, xmm6
vpinsrb xmm2, xmm2, BYTE PTR [rsp+84], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+81]
vmovd   xmm0, DWORD PTR [rsp]
vpunpckldq  xmm1, xmm3, xmm1
vpinsrb xmm0, xmm0, BYTE PTR [rsp+82], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+79]
vmovd   xmm4, DWORD PTR [rsp]
vpunpcklwd  xmm2, xmm5, xmm2
vpinsrb xmm4, xmm4, BYTE PTR [rsp+80], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+77]
vmovd   xmm8, DWORD PTR [rsp]
vpinsrb xmm8, xmm8, BYTE PTR [rsp+78], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+75]
vpunpcklwd  xmm0, xmm0, xmm4
vmovd   xmm4, DWORD PTR [rsp]
vpinsrb xmm4, xmm4, BYTE PTR [rsp+76], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+73]
vpunpckldq  xmm0, xmm2, xmm0
vmovd   xmm2, DWORD PTR [rsp]
vpinsrb xmm2, xmm2, BYTE PTR [rsp+74], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+71]
vmovd   xmm7, DWORD PTR [rsp]
vpunpcklqdq xmm1, xmm1, xmm0
vpunpcklwd  xmm4, xmm8, xmm4
vpinsrb xmm7, xmm7, BYTE PTR [rsp+72], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+69]
vmovd   xmm6, DWORD PTR [rsp]
vpinsrb xmm6, xmm6, BYTE PTR [rsp+70], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+67]
vmovd   xmm0, DWORD PTR [rsp]
vpunpcklwd  xmm2, xmm2, xmm7
vpinsrb xmm0, xmm0, BYTE PTR [rsp+68], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+65]
vmovd   xmm5, DWORD PTR [rsp]
vpunpckldq  xmm2, xmm4, xmm2
vpinsrb xmm5, xmm5, BYTE PTR [rsp+66], 1
mov BYTE PTR [rsp], al
vmovd   xmm3, DWORD PTR [rsp]
vpunpcklwd  xmm0, xmm6, xmm0
vpinsrb xmm3, xmm3, BYTE PTR [rsp+64], 1
vpunpcklwd  xmm3, xmm5, xmm3
vpunpckldq  xmm0, xmm0, xmm3
vpunpcklqdq xmm0, xmm2, xmm0
vinserti128 ymm0, ymm1, xmm0, 0x1
vmovdqa YMMWORD PTR [rsp+32], ymm0

Wow. 哇。 Ok, not so good. 好吧，不是很好。 Indeed worse than if the same thing was done without intrinsics, but not all is lost. 确实比不使用内部函数完成同一件事还要糟糕，但也并非无可救药。 It would be better to load the data as little-endian uints, and then swap the bytes around with a _mm256_shuffle_epi8, sort of like this (but check that shuffle mask, I didn't test it): 最好将数据按小端序的uint加载，然后用_mm256_shuffle_epi8交换字节顺序，就像这样（但是请检查一下shuffle掩码，我没有对其进行测试）：

__m256i ymm9 = _mm256_shuffle_epi8(_mm256_load_si256((__m256i*)block), _mm256_set_epi8(
    0, 1, 2, 3,
    4, 5, 6, 7,
    8, 9, 10, 11,
    12, 13, 14, 15,
    0, 1, 2, 3,
    4, 5, 6, 7,
    8, 9, 10, 11,
    12, 13, 14, 15));
ymm9 = _mm256_permute2x128_si256(ymm9, ymm9, 1);
_mm256_store_si256((__m256i*)m, ymm9);

In general, be very careful with the "set" family of intrinsics, they can compile to very bad instruction sequences. 通常，对内在函数“ set”家族要非常小心，它们会编译成非常糟糕的指令序列。

Answer 2

Thanks all for comments. 谢谢大家的评论。 Especially harold's and Zboson's. 特别是哈罗德（Harold）和兹博森（Zboson）。

This is my second try: 这是我的第二次尝试：

/* Sample input: 32 bytes, 32-byte aligned; the second half repeats the first. */
const uint8_t block[32] __attribute((aligned(32))) = {
  0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
  0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,
  0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
  0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff};
uint32_t m[8] __attribute((aligned(32)));
/* Byte-shuffle control: destination byte j takes source byte 15 - j, so the
   16 bytes of a register are reversed. */
const uint8_t maska[16] __attribute((aligned(16))) = {
  0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
  0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
/* The SSE load/store intrinsics take __m128i pointers, so the byte arrays
   are cast explicitly instead of being passed as uint8_t pointers. */
__m128i mask = _mm_load_si128((const __m128i *)maska);
/*
 * Each 16-byte half is reversed on its own and written to the same half of
 * m: m[3] is built from block[0..3] and m[7] from block[16..19].
 * NOTE(review): the 256-bit _mm256_set_epi8 version puts block[0..3] in
 * m[7] instead; the two outputs coincide only because the sample data
 * repeats.  Confirm which word order is wanted.
 */
__m128i xmm0 = _mm_load_si128((const __m128i *)block);
_mm_store_si128((__m128i *)&m[0], _mm_shuffle_epi8(xmm0, mask));
xmm0 = _mm_load_si128((const __m128i *)(block + 16));
_mm_store_si128((__m128i *)&m[4], _mm_shuffle_epi8(xmm0, mask));

What do you think about that? 您觉得怎么样？ I am pretty sure there is room for improvement. 我很确定还有改进的余地。 I do not know if _mm_load_si128 is the best way to copy data from memory to a register. 我不知道_mm_load_si128是否是从内存复制数据到寄存器的最佳方法。 Assembler for first iteration: 第一次迭代的汇编代码：

/* Create a vector with element 0 as *P and the rest zero.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
return *__P;
mov    0x8(%rsp),%rax
vmovdqa (%rax),%xmm0
    0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
    0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
__m128i mask = _mm_load_si128(maska);
vmovdqa %xmm0,0x30(%rsp)
lea    0xf0(%rsp),%rax
mov    %rax,0x10(%rsp)
mov    0x10(%rsp),%rax
vmovdqa (%rax),%xmm0
__m128i xmm0 = _mm_load_si128(block);
vmovdqa %xmm0,0x40(%rsp)
vmovdqa 0x40(%rsp),%xmm0
vmovdqa %xmm0,0x50(%rsp)
vmovdqa 0x30(%rsp),%xmm0
vmovdqa %xmm0,0x60(%rsp)
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
vmovdqa 0x60(%rsp),%xmm1
vmovdqa 0x50(%rsp),%xmm0
vpshufb %xmm1,%xmm0,%xmm0
lea    0xb0(%rsp),%rax
mov    %rax,0x18(%rsp)
vmovdqa %xmm0,0x70(%rsp)
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
*__P = __B;
mov    0x18(%rsp),%rax
vmovdqa 0x70(%rsp),%xmm0
vmovdqa %xmm0,(%rax)
lea    0xf0(%rsp),%rax
add    $0x10,%rax
mov    %rax,0x20(%rsp)

What do you think ? 你怎么看？

如何使用Intel内部函数从8位整数数组构建32位整数？

问题描述

2 个解决方案

解决方案1
3 2015-05-27 20:47:15

解决方案2
1 2015-05-28 15:30:23

如何使用Intel内部函数从8位整数数组构建32位整数？

问题描述

2 个解决方案

解决方案1 3 2015-05-27 20:47:15

解决方案2 1 2015-05-28 15:30:23

解决方案1
3 2015-05-27 20:47:15

解决方案2
1 2015-05-28 15:30:23