[英]Aarch64 assembly LD2 issue
下面这个循环展开的 popcount 实现产生了错误的结果。我观察到只有元素 b[0] 和 b[2] 被计数,而 b[1] 和 b[3] 没有被计数。
#include <stdio.h>
/*
 * Attempted popcount of the four 64-bit words b[0..3] using AArch64 NEON
 * inline assembly. This is the variant that returns the wrong total; it is
 * kept unchanged to illustrate the problem.
 *
 * b    - pointer to the words to count; 32 bytes are loaded from it.
 * size - never read by this variant.
 *
 * Returns whatever ends up in the low 64 bits of v2 (see notes below).
 */
int count_multiple_bits(unsigned long long *b, int size) {
unsigned long long *d = b;
int c;
/* Two LD2 single-lane loads: lane 0 of v0/v1 from the first 16 bytes
 * (the address is then advanced by 16), lane 1 of v0/v1 from the next 16. */
__asm__("LD2 {v0.D, v1.D}[0], [%1], #16 \n\t"
"LD2 {v0.D, v1.D}[1], [%1] \n\t"
/* Per-byte bit counts of each vector. */
"CNT v0.16b, v0.16b \n\t"
"CNT v1.16b, v1.16b \n\t"
/* Both horizontal sums target h2. UADDLV does not accumulate, so the
 * second sum replaces the first and only v1's total survives. */
"UADDLV h2, v0.16b \n\t"
"UADDLV h2, v1.16b \n\t"
"UMOV %0, v2.d[0] \n\t"
/* NOTE(review): c is passed as "+r" (read-write) although it is never
 * initialized, and %0 is only ever written here. */
: "+r"(c)
/* NOTE(review): %1 is declared input-only but is modified by the
 * post-increment above, and no "memory" clobber is listed even though the
 * asm reads through the pointer - confirm against the GCC constraint rules. */
: "r"(d) : "v0", "v1", "v2");
return c;
}
/* Runs count_multiple_bits over four all-ones words and prints the total. */
int main(int argc, const char *argv[]) {
    unsigned long long words[] = { -1ull, -1ull, -1ull, -1ull };
    const int total = count_multiple_bits(words, 4);

    printf("Test: %i\n", total);
    return 0;
}
下面这个一次只处理 2 个元素的版本工作正常:
/*
 * Counts the set bits in the two 64-bit words b[0] and b[1].
 *
 * b    - pointer to at least two readable 64-bit words.
 * size - not read by this two-word variant.
 *
 * Returns the number of set bits, in the range 0..128.
 *
 * On AArch64 the count is done with NEON (CNT + UADDLV); on any other
 * target a portable bit-clearing loop produces the same result.
 */
int count_multiple_bits(unsigned long long *b, int size) {
    unsigned long long total = 0;

    (void)size;
#if defined(__aarch64__)
    const unsigned long long *cursor = b;

    /*
     * cursor is "+r" because the first LD1 post-increments it.
     * total is a write-only 64-bit output, matching the X-register UMOV.
     * The "memory" clobber tells the compiler the asm reads b[0..1].
     */
    __asm__("LD1 {v0.D}[0], [%1], #8 \n\t"
            "LD1 {v0.D}[1], [%1] \n\t"
            "CNT v0.16b, v0.16b \n\t"
            "UADDLV h1, v0.16b \n\t"
            "UMOV %0, v1.d[0] \n\t"
            : "=r"(total), "+r"(cursor)
            :
            : "v0", "v1", "memory");
#else
    for (int i = 0; i < 2; ++i) {
        unsigned long long word = b[i];

        /* Each iteration clears the lowest set bit. */
        while (word != 0) {
            word &= word - 1;
            ++total;
        }
    }
#endif
    return (int)total;
}
在其他条件都相同的情况下,我猜问题出在加载(load)指令上。以下是我假定的寄存器布局:
v0.D[0] = b[0]
v1.D[0] = b[1]
v0.D[1] = b[2]
v1.D[1] = b[3]
是我弄错了。
问题出在 UADDLV 上:该指令会覆盖(清零后写入)目标寄存器,而不是在其原有值上累加,因此两条写入同一个 h2 的 UADDLV 中,后一条会覆盖前一条的结果。下面是最终版本,它可以对任意长度的输入进行累加:
/*
 * Counts the set bits in the `size` 64-bit words starting at `b`.
 *
 * b    - pointer to `size` readable 64-bit words (not dereferenced when
 *        size is 0).
 * size - number of words to count.
 *
 * Returns the total number of set bits, converted to int.
 *
 * On AArch64 the words are processed four at a time with NEON, followed by
 * a one-word-at-a-time tail for the remaining 0..3 words. On any other
 * target a portable bit-clearing loop produces the same result.
 */
int count_multiple_bits(unsigned long long *b, unsigned int size) {
    unsigned long long total = 0;
    unsigned int i;

#if defined(__aarch64__)
    const unsigned long long *cursor = b;
    const unsigned int vector_words = size & ~3u;
    const unsigned int tail_words = size & 3u;

    for (i = 0; i < vector_words; i += 4) {
        unsigned long long partial;

        /*
         * Load 32 bytes into v0/v1 as whole vectors (no lane index) and
         * advance cursor. Each UADDLV gets its own destination register
         * because it overwrites rather than accumulates; the two sums are
         * added explicitly. "memory" tells the compiler the asm reads
         * through cursor.
         */
        __asm__("LD1 {v0.2D, v1.2D}, [%1], #32 \n\t"
                "CNT v0.16b, v0.16b \n\t"
                "CNT v1.16b, v1.16b \n\t"
                "UADDLV h2, v0.16b \n\t"
                "UADDLV h3, v1.16b \n\t"
                "ADD d2, d3, d2 \n\t"
                "UMOV %0, v2.d[0] \n\t"
                : "=r"(partial), "+r"(cursor)
                :
                : "v0", "v1", "v2", "v3", "memory");
        total += partial;
    }

    for (i = 0; i < tail_words; ++i) {
        unsigned long long partial;

        /* Same scheme for a single 64-bit word held in the low half of v0. */
        __asm__("LD1 {v0.D}[0], [%1], #8 \n\t"
                "CNT v0.8b, v0.8b \n\t"
                "UADDLV h1, v0.8b \n\t"
                "UMOV %0, v1.d[0] \n\t"
                : "=r"(partial), "+r"(cursor)
                :
                : "v0", "v1", "memory");
        total += partial;
    }
#else
    for (i = 0; i < size; ++i) {
        unsigned long long word = b[i];

        /* Each iteration clears the lowest set bit. */
        while (word != 0) {
            word &= word - 1;
            ++total;
        }
    }
#endif
    return (int)total;
}
效果非常好 :)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.