簡體   English   中英

在SSE數據類型(__m128i)中設置單個位的快速方法?

[英]Fast way to set single bit in SSE datatypes (__m128i)?

我用__m128i表示一個位域,需要一種快速方法來檢查特定位是否設置,以及設置特定位的方法。 我是否必須將另一個__m128i設置為掩碼並將它們設置為OR,或者是否存在我丟失的指令更快? 我正在使用英特爾編譯器。

你可以嘗試這樣的事情。 我不相信有更快的方法。 您可能希望從代碼的性能關鍵部分中提取一些常量值和表。

  __m128i v; // todo: set v to something here

  // to check
  int n; // todo: set n to the zero-indexed bit to check

  __m128i chkmask  = _mm_slli_epi16(_mm_set1_epi16(1), n & 0xF);
  int     movemask = (1 << (n >> 3));
  int     isSet  = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(chkmask, v), _mm_setzero_si128())) & movemask) ^ movemask;

  // to set
  int m; // todo: set m to the zero-indexed bit to set

  __m128i shuf    = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
          shuf    = _mm_add_epi8(shuf, _mm_set1_epi8(16 - (m >> 3)));
          shuf    = _mm_and_si128(shuf, _mm_set1_epi8(0x0F));
  __m128i setmask = _mm_shuffle_epi8(_mm_cvtsi32_si128(1 << (m & 0x7)), shuf);
  v = _mm_or_si128(v, setmask);

  // or to try the look-up table approach to check and set
  __declspec(align(16)) __m128i lut[] = {
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000001),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000002),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000004),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000008),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000010),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000020),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000040),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000080),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000100),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000200),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000800),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00001000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00002000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00004000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00008000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00010000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00020000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00040000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00080000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00100000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00200000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00400000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00800000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x01000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x02000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x04000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x08000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x10000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x20000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x40000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x80000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000001, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000002, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000004, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000008, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000010, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000020, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000040, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000080, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000100, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000200, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000400, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000800, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00001000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00002000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00004000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00008000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00010000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00020000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00040000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00080000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00100000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00200000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00400000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00800000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x01000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x02000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x04000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x08000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x10000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x20000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x40000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x80000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000001, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000002, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000004, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000008, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000010, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000020, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000040, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000080, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000100, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000200, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000400, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000800, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00001000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00002000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00004000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00008000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00010000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00020000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00040000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00080000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00100000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00200000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00400000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00800000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x01000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x02000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x04000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x08000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x10000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x20000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x40000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x80000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000001, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000002, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000004, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000010, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000020, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000040, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000080, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000100, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000200, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000400, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000800, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00001000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00002000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00004000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00008000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00010000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00020000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00040000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00080000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00100000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00200000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00400000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00800000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x01000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x02000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x04000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x08000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x10000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x20000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x40000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000)
  };

   // to check with look-up table
   movemask = (1 << (n >> 3));
   isSet    = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(v, _mm_load_si128(lut + m)), _mm_setzero_si128())) & movemask) ^ movemask;

   // to set with look-up table
   v = _mm_or_si128(v, _mm_load_si128(lut + m));

對於它的價值,我想出了一個用於測試的變體。 如果掩碼和一個resister可以預先計算,那么這只需要三個內在函數。

為了設置單個位,我認為沒有一種有效的方法。 以下是關於從movemask返回SSE寄存器的討論如何執行_mm256_movemask_epi8(VPMOVMSKB)的反轉?

#include <emmintrin.h>
#include <stdio.h>
int main() {
    __m128i x = _mm_setr_epi32(0,0,0,1);
    __m128i mask = _mm_setr_epi32(0,0,0,1);
    __m128i one = _mm_set1_epi8(1);
    int isSet = 0xffff != _mm_movemask_epi8(_mm_sub_epi8(_mm_and_si128(x,mask),one));
    printf("%X\n", isSet);  
}

編輯實際上有一種更快的方法來使用_mm_testz_si128檢查SSE4.1。

#include <smmintrin.h>
#include <stdio.h>

int main() {
    __m128i x = _mm_setr_epi32(0,0,0,1);
    __m128i mask = _mm_setr_epi32(0,0,0,1);

    __m128i t = _mm_and_si128(x,mask);
    int isSet = !_mm_testz_si128(t,t);

    printf("%d\n", isSet);  
}

沒有說明在設置單獨的比特__m128i

您可以嘗試使用通用BTS指令,但它可能比制作一個掩碼慢,因為它只能寫入內存(或32位寄存器,這沒有幫助)。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM