[英]Extract scalar value from SSE vector
我有一段代碼,以類似SIMD的方式檢查數組元素是否大於某個值(不大於該值的元素會被清零):
/*
 * Zero, in place, every element of dst[0..N) that is not greater than 2.
 *
 * dst must be 16-byte aligned because whole groups of four elements are
 * accessed with _mm_load_si128 / _mm_store_si128.
 * NOTE(review): assumes `uint` is a 32-bit unsigned int -- confirm against
 * the typedef in use.
 * NOTE: _mm_cmpgt_epi32 is a signed comparison, so elements >= 2^31 are
 * treated as negative and are zeroed; the scalar tail mirrors that.
 */
void sse(uint *dst, size_t N)
{
    const __m128i condition = _mm_set1_epi32(2);
    size_t i = 0; // size_t, so the index cannot wrap before reaching N

    // Vector part: only full groups of four, so nothing past dst[N-1] is touched.
    for (; N - i >= 4; i += 4) {
        __m128i v = _mm_load_si128((__m128i *)&dst[i]);
        __m128i cmp = _mm_cmpgt_epi32(v, condition); // lane = all ones if v > 2, else 0
        v = _mm_and_si128(v, cmp);                   // keep passing lanes, zero the rest
        _mm_store_si128((__m128i *)&dst[i], v);
    }

    // Scalar tail for the remaining N % 4 elements.
    for (; i < N; i++) {
        if ((int)dst[i] <= 2) {
            dst[i] = 0;
        }
    }
}
現在,在比較之后、對元素執行按位與(_mm_and_si128)之前,我想統計通過條件的元素(即比較掩碼被置為全1的那些元素)的個數,並將總數存儲在一個int變量中。我怎樣才能用SIMD做到這一點?例如,如果四個元素中只有兩個通過了條件,則該int變量等於2。
通常,您將在整個循環中保持向量計數,然后在循環終止時僅對向量的元素求和,例如
#include <emmintrin.h>
/*
 * Skeleton: zero, in place, every element of dst[0..N) that is not greater
 * than 2, while accumulating per-lane (negative) pass counts in vcount.
 *
 * The horizontal sum of vcount is intentionally left as a placeholder; until
 * it is filled in, the return value only covers the scalar tail elements.
 *
 * dst must be 16-byte aligned (_mm_load_si128 / _mm_store_si128).
 * NOTE: dst is const-qualified in the signature but the array IS written
 * through; callers must pass a modifiable array.
 * NOTE: _mm_cmpgt_epi32 is a signed comparison, so elements >= 2^31 are
 * treated as negative and are zeroed; the scalar tail mirrors that.
 */
uint32_t sse(const uint32_t *dst, const size_t N)
{
    uint32_t *data = (uint32_t *)dst; // single, explicit place where const is dropped
    const __m128i condition = _mm_set1_epi32(2);
    __m128i vcount = _mm_set1_epi32(0);
    uint32_t count = 0;
    size_t i = 0;

    // Vector part: only full groups of four, so nothing past data[N-1] is touched.
    for (; N - i >= 4; i += 4) {
        __m128i v = _mm_load_si128((__m128i *)&data[i]);
        __m128i vcmp = _mm_cmpgt_epi32(v, condition);
        v = _mm_and_si128(v, vcmp);
        _mm_store_si128((__m128i *)&data[i], v);
        vcount = _mm_add_epi32(vcount, vcmp); // accumulate (negative) counts
    }

    // ... sum vcount here and store in count (see below) ...

    // Scalar tail for the remaining N % 4 elements; adds to count.
    for (; i < N; i++) {
        if ((int32_t)data[i] > 2) {
            count++;
        } else {
            data[i] = 0;
        }
    }
    return count;
}
請注意,我們將每個掩碼元素視為一個int,即0或-1,因此我們正在累加一個和,它是實際和的負數。
最終對vcount求和的效率通常不太重要,因為整個循環只需執行一次這個求和;因此如果N相當大,則無論它需要多少條指令(在合理范圍內)都無關緊要。
有幾種方法可以處理最終求和,例如,您可以使用_mm_movemask_epi8(SSE2)提取16位掩碼並使用它(注意:這只適用於單個比較掩碼,即每個元素為0或-1的情況;對累加了多次迭代的vcount,它無法給出正確的總數),或者您可以使用_mm_hadd_epi32(SSSE3)計算向量上的水平和,然后提取總和作為標量,例如
SSE2:
#include <emmintrin.h>
int16_t mask = _mm_movemask_epi8(vcount); // extract 16 bit mask
count = !!(mask & 0x0001) + // count non-zero 32 bit elements
!!(mask & 0x0010) +
!!(mask & 0x0100) +
!!(mask & 0x1000);
SSSE3:
#include <tmmintrin.h>
vcount = _mm_hadd_epi32(vcount, vcount); // horizontal sum of 4 elements
vcount = _mm_hadd_epi32(vcount, vcount);
count = - ((_mm_extract_epi16(vcount, 1) << 16) // extract (and negate) sum to
| _mm_extract_epi16(vcount, 1)); // get total (positive) count
SSE4.1:
#include <smmintrin.h>
vcount = _mm_hadd_epi32(vcount, vcount); // horizontal sum of 4 elements
vcount = _mm_hadd_epi32(vcount, vcount);
count = - _mm_extract_epi32(vcount, 0); // extract (and negate) sum to
// get total (positive) count
這是一個完整的可運行版本(使用SSE4.1的_mm_extract_epi32),並附帶測試程序:
#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>
/*
 * Zero, in place, every element of dst[0..N) that is not greater than 2 and
 * return how many elements were greater than 2 (i.e. were kept).
 *
 * dst must be 16-byte aligned (_mm_load_si128 / _mm_store_si128). N may be
 * any value: full groups of four are handled with SSE, the remaining
 * N % 4 elements with a scalar loop, so nothing past dst[N-1] is touched.
 *
 * NOTE: dst is const-qualified in the signature but the array IS written
 * through; callers must pass a modifiable array.
 * NOTE: _mm_cmpgt_epi32 is a signed comparison, so elements >= 2^31 are
 * treated as negative and are zeroed; the scalar tail mirrors that.
 *
 * The target attribute lets this function use the SSSE3/SSE4.1 intrinsics
 * even when the translation unit is built without -msse4.
 */
__attribute__ ((target("sse4.1")))
uint32_t sse(const uint32_t *dst, const size_t N)
{
    uint32_t *data = (uint32_t *)dst; // single, explicit place where const is dropped
    const __m128i condition = _mm_set1_epi32(2);
    __m128i vcount = _mm_set1_epi32(0);
    uint32_t count = 0;
    size_t i = 0;

    // Vector part: only full groups of four elements.
    for (; N - i >= 4; i += 4) {
        __m128i v = _mm_load_si128((__m128i *)&data[i]);
        __m128i vcmp = _mm_cmpgt_epi32(v, condition); // lane = -1 if v > 2, else 0
        v = _mm_and_si128(v, vcmp);
        _mm_store_si128((__m128i *)&data[i], v);
        vcount = _mm_add_epi32(vcount, vcmp); // accumulate (negative) counts
    }

    vcount = _mm_hadd_epi32(vcount, vcount); // horizontal sum of 4 elements
    vcount = _mm_hadd_epi32(vcount, vcount);
    // Extract the sum and negate it in unsigned arithmetic (well defined for
    // every value) to get the total (positive) count.
    count = 0u - (uint32_t)_mm_extract_epi32(vcount, 0);

    // Scalar tail for the remaining N % 4 elements.
    for (; i < N; i++) {
        if ((int32_t)data[i] > 2) {
            count++;
        } else {
            data[i] = 0;
        }
    }
    return count;
}
/*
 * Test harness: runs sse() over one 16-byte-aligned group of four values,
 * then prints the filtered array and the number of elements that passed.
 */
int main(void)
{
    uint32_t data[4] __attribute__ ((aligned(16))) = { 1, 2, 3, 4 };
    const uint32_t passed = sse(data, 4);

    printf("a = %u %u %u %u \n", data[0], data[1], data[2], data[3]);
    printf("count = %u\n", passed);

    return 0;
}
$ gcc -Wall -std=c99 -msse4 sse_count.c -o sse_count
$ ./sse_count
a = 0 0 3 4
count = 2
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.