繁体   English   中英

在SSE数据类型(__m128i)中设置单个位的快速方法?

[英]Fast way to set single bit in SSE datatypes (__m128i)?

我用__m128i表示一个位域,需要一种快速的方法来检查某个特定位是否已设置,以及设置某个特定位。 我是否必须另外构造一个__m128i作为掩码,再对两者做按位或(OR)运算,还是存在我没有注意到的更快的指令? 我正在使用英特尔编译器。

你可以尝试下面这样的做法。 我认为没有更快的方法了。 你可能需要把一些常量值和查找表提到代码的性能关键部分之外。

  __m128i v; // todo: load the 128-bit field to operate on into v

  // Testing a bit
  int n; // todo: zero-based index of the bit to test

  // Raise bit (n & 0xF) in every 16-bit lane. After ANDing with v, byte (n >> 3)
  // is non-zero exactly when the requested bit of v is set.
  __m128i probe    = _mm_slli_epi16(_mm_set1_epi16(1), n & 0xF);
  __m128i hit      = _mm_and_si128(probe, v);
  // movemask selects byte (n >> 3) in the 16-bit result of _mm_movemask_epi8.
  int     movemask = (1 << (n >> 3));
  // The compare flags bytes of `hit` that are zero, so the flag is inverted:
  // isSet ends up equal to movemask (non-zero) when the bit is set, 0 otherwise.
  int     isSet    = ~_mm_movemask_epi8(_mm_cmpeq_epi8(hit, _mm_setzero_si128())) & movemask;

  // Setting a bit
  int m; // todo: zero-based index of the bit to set

  // Byte-shuffle control: destination byte i selects source byte (i - (m >> 3)) mod 16,
  // so source byte 0 is routed to destination byte (m >> 3).
  __m128i route  = _mm_and_si128(
                     _mm_add_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                  _mm_set1_epi8(16 - (m >> 3))),
                     _mm_set1_epi8(0x0F));
  // The source holds bit (m & 0x7) in byte 0 and zeros elsewhere; the shuffle
  // moves that byte into position, giving a mask with only bit m raised.
  __m128i oneBit = _mm_shuffle_epi8(_mm_cvtsi32_si128(1 << (m & 0x7)), route);
  v = _mm_or_si128(v, oneBit);

  // Alternative: look-up table for both checking and setting.
  // lut[k] is a 128-bit value with only bit k raised (k = 0..127).
  // _mm_set_epi32 takes its arguments from the highest dword to the lowest.
  __declspec(align(16)) __m128i lut[] = {
    // bits 0..31 (lowest dword)
    _mm_set_epi32(0, 0, 0, 0x00000001),
    _mm_set_epi32(0, 0, 0, 0x00000002),
    _mm_set_epi32(0, 0, 0, 0x00000004),
    _mm_set_epi32(0, 0, 0, 0x00000008),
    _mm_set_epi32(0, 0, 0, 0x00000010),
    _mm_set_epi32(0, 0, 0, 0x00000020),
    _mm_set_epi32(0, 0, 0, 0x00000040),
    _mm_set_epi32(0, 0, 0, 0x00000080),
    _mm_set_epi32(0, 0, 0, 0x00000100),
    _mm_set_epi32(0, 0, 0, 0x00000200),
    _mm_set_epi32(0, 0, 0, 0x00000400),
    _mm_set_epi32(0, 0, 0, 0x00000800),
    _mm_set_epi32(0, 0, 0, 0x00001000),
    _mm_set_epi32(0, 0, 0, 0x00002000),
    _mm_set_epi32(0, 0, 0, 0x00004000),
    _mm_set_epi32(0, 0, 0, 0x00008000),
    _mm_set_epi32(0, 0, 0, 0x00010000),
    _mm_set_epi32(0, 0, 0, 0x00020000),
    _mm_set_epi32(0, 0, 0, 0x00040000),
    _mm_set_epi32(0, 0, 0, 0x00080000),
    _mm_set_epi32(0, 0, 0, 0x00100000),
    _mm_set_epi32(0, 0, 0, 0x00200000),
    _mm_set_epi32(0, 0, 0, 0x00400000),
    _mm_set_epi32(0, 0, 0, 0x00800000),
    _mm_set_epi32(0, 0, 0, 0x01000000),
    _mm_set_epi32(0, 0, 0, 0x02000000),
    _mm_set_epi32(0, 0, 0, 0x04000000),
    _mm_set_epi32(0, 0, 0, 0x08000000),
    _mm_set_epi32(0, 0, 0, 0x10000000),
    _mm_set_epi32(0, 0, 0, 0x20000000),
    _mm_set_epi32(0, 0, 0, 0x40000000),
    _mm_set_epi32(0, 0, 0, 0x80000000),
    // bits 32..63
    _mm_set_epi32(0, 0, 0x00000001, 0),
    _mm_set_epi32(0, 0, 0x00000002, 0),
    _mm_set_epi32(0, 0, 0x00000004, 0),
    _mm_set_epi32(0, 0, 0x00000008, 0),
    _mm_set_epi32(0, 0, 0x00000010, 0),
    _mm_set_epi32(0, 0, 0x00000020, 0),
    _mm_set_epi32(0, 0, 0x00000040, 0),
    _mm_set_epi32(0, 0, 0x00000080, 0),
    _mm_set_epi32(0, 0, 0x00000100, 0),
    _mm_set_epi32(0, 0, 0x00000200, 0),
    _mm_set_epi32(0, 0, 0x00000400, 0),
    _mm_set_epi32(0, 0, 0x00000800, 0),
    _mm_set_epi32(0, 0, 0x00001000, 0),
    _mm_set_epi32(0, 0, 0x00002000, 0),
    _mm_set_epi32(0, 0, 0x00004000, 0),
    _mm_set_epi32(0, 0, 0x00008000, 0),
    _mm_set_epi32(0, 0, 0x00010000, 0),
    _mm_set_epi32(0, 0, 0x00020000, 0),
    _mm_set_epi32(0, 0, 0x00040000, 0),
    _mm_set_epi32(0, 0, 0x00080000, 0),
    _mm_set_epi32(0, 0, 0x00100000, 0),
    _mm_set_epi32(0, 0, 0x00200000, 0),
    _mm_set_epi32(0, 0, 0x00400000, 0),
    _mm_set_epi32(0, 0, 0x00800000, 0),
    _mm_set_epi32(0, 0, 0x01000000, 0),
    _mm_set_epi32(0, 0, 0x02000000, 0),
    _mm_set_epi32(0, 0, 0x04000000, 0),
    _mm_set_epi32(0, 0, 0x08000000, 0),
    _mm_set_epi32(0, 0, 0x10000000, 0),
    _mm_set_epi32(0, 0, 0x20000000, 0),
    _mm_set_epi32(0, 0, 0x40000000, 0),
    _mm_set_epi32(0, 0, 0x80000000, 0),
    // bits 64..95
    _mm_set_epi32(0, 0x00000001, 0, 0),
    _mm_set_epi32(0, 0x00000002, 0, 0),
    _mm_set_epi32(0, 0x00000004, 0, 0),
    _mm_set_epi32(0, 0x00000008, 0, 0),
    _mm_set_epi32(0, 0x00000010, 0, 0),
    _mm_set_epi32(0, 0x00000020, 0, 0),
    _mm_set_epi32(0, 0x00000040, 0, 0),
    _mm_set_epi32(0, 0x00000080, 0, 0),
    _mm_set_epi32(0, 0x00000100, 0, 0),
    _mm_set_epi32(0, 0x00000200, 0, 0),
    _mm_set_epi32(0, 0x00000400, 0, 0),
    _mm_set_epi32(0, 0x00000800, 0, 0),
    _mm_set_epi32(0, 0x00001000, 0, 0),
    _mm_set_epi32(0, 0x00002000, 0, 0),
    _mm_set_epi32(0, 0x00004000, 0, 0),
    _mm_set_epi32(0, 0x00008000, 0, 0),
    _mm_set_epi32(0, 0x00010000, 0, 0),
    _mm_set_epi32(0, 0x00020000, 0, 0),
    _mm_set_epi32(0, 0x00040000, 0, 0),
    _mm_set_epi32(0, 0x00080000, 0, 0),
    _mm_set_epi32(0, 0x00100000, 0, 0),
    _mm_set_epi32(0, 0x00200000, 0, 0),
    _mm_set_epi32(0, 0x00400000, 0, 0),
    _mm_set_epi32(0, 0x00800000, 0, 0),
    _mm_set_epi32(0, 0x01000000, 0, 0),
    _mm_set_epi32(0, 0x02000000, 0, 0),
    _mm_set_epi32(0, 0x04000000, 0, 0),
    _mm_set_epi32(0, 0x08000000, 0, 0),
    _mm_set_epi32(0, 0x10000000, 0, 0),
    _mm_set_epi32(0, 0x20000000, 0, 0),
    _mm_set_epi32(0, 0x40000000, 0, 0),
    _mm_set_epi32(0, 0x80000000, 0, 0),
    // bits 96..127 (highest dword)
    _mm_set_epi32(0x00000001, 0, 0, 0),
    _mm_set_epi32(0x00000002, 0, 0, 0),
    _mm_set_epi32(0x00000004, 0, 0, 0),
    _mm_set_epi32(0x00000008, 0, 0, 0),
    _mm_set_epi32(0x00000010, 0, 0, 0),
    _mm_set_epi32(0x00000020, 0, 0, 0),
    _mm_set_epi32(0x00000040, 0, 0, 0),
    _mm_set_epi32(0x00000080, 0, 0, 0),
    _mm_set_epi32(0x00000100, 0, 0, 0),
    _mm_set_epi32(0x00000200, 0, 0, 0),
    _mm_set_epi32(0x00000400, 0, 0, 0),
    _mm_set_epi32(0x00000800, 0, 0, 0),
    _mm_set_epi32(0x00001000, 0, 0, 0),
    _mm_set_epi32(0x00002000, 0, 0, 0),
    _mm_set_epi32(0x00004000, 0, 0, 0),
    _mm_set_epi32(0x00008000, 0, 0, 0),
    _mm_set_epi32(0x00010000, 0, 0, 0),
    _mm_set_epi32(0x00020000, 0, 0, 0),
    _mm_set_epi32(0x00040000, 0, 0, 0),
    _mm_set_epi32(0x00080000, 0, 0, 0),
    _mm_set_epi32(0x00100000, 0, 0, 0),
    _mm_set_epi32(0x00200000, 0, 0, 0),
    _mm_set_epi32(0x00400000, 0, 0, 0),
    _mm_set_epi32(0x00800000, 0, 0, 0),
    _mm_set_epi32(0x01000000, 0, 0, 0),
    _mm_set_epi32(0x02000000, 0, 0, 0),
    _mm_set_epi32(0x04000000, 0, 0, 0),
    _mm_set_epi32(0x08000000, 0, 0, 0),
    _mm_set_epi32(0x10000000, 0, 0, 0),
    _mm_set_epi32(0x20000000, 0, 0, 0),
    _mm_set_epi32(0x40000000, 0, 0, 0),
    _mm_set_epi32(0x80000000, 0, 0, 0)
  };

   // to check with look-up table
   // The table entry is selected by n (the bit being tested) so that it agrees with
   // movemask, which picks byte (n >> 3) of the compare result. Indexing by m here
   // would test the bit to be set instead of the bit to be checked.
   // isSet is non-zero (equal to movemask) when bit n of v is set, 0 otherwise.
   movemask = (1 << (n >> 3));
   isSet    = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(v, _mm_load_si128(lut + n)), _mm_setzero_si128())) & movemask) ^ movemask;

   // to set with look-up table: OR in the table entry that has only bit m raised
   v = _mm_or_si128(v, _mm_load_si128(lut + m));

仅供参考,我想出了一个用于检查(测试)某一位的变体。 如果掩码和一个寄存器常量可以预先计算,那么只需要三个内在函数(intrinsics)。

至于设置单个位,我认为没有高效的方法。 下面这个问题讨论了如何把movemask的结果还原回SSE寄存器:“如何执行_mm256_movemask_epi8(VPMOVMSKB)的逆操作?”

#include <emmintrin.h>
#include <stdio.h>
int main() {
    // Bit field under test and the single-bit mask selecting the bit to check.
    const __m128i field  = _mm_setr_epi32(0, 0, 0, 1);
    const __m128i select = _mm_setr_epi32(0, 0, 0, 1);

    // Subtracting 1 from every byte of (field & select) turns each zero byte into
    // 0xFF (sign bit set). With a single-bit mask, a non-zero byte becomes at most
    // 0x7F, so its sign bit stays clear.
    const __m128i masked  = _mm_and_si128(field, select);
    const __m128i lowered = _mm_sub_epi8(masked, _mm_set1_epi8(1));

    // All 16 sign bits set means every byte was zero, i.e. the bit is not set.
    const int signBits = _mm_movemask_epi8(lowered);
    int isSet = (signBits != 0xffff);

    printf("%X\n", isSet);
}

编辑:实际上,在SSE4.1中可以使用_mm_testz_si128,以更快的方式进行检查。

#include <smmintrin.h>
#include <stdio.h>

int main() {
    // Bit field under test and the mask selecting the bit to check.
    __m128i x = _mm_setr_epi32(0,0,0,1);
    __m128i mask = _mm_setr_epi32(0,0,0,1);

    // _mm_testz_si128(a, b) returns 1 when (a & b) is all zeros, so the AND is
    // done by the test itself and no separate _mm_and_si128 is needed.
    int isSet = !_mm_testz_si128(x, mask);

    printf("%d\n", isSet);
}

没有指令可以直接设置__m128i中的单个位。

您可以尝试使用通用BTS指令,但它可能比制作一个掩码慢,因为它只能写入内存(或32位寄存器,这没有帮助)。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM