[英]OpenACC - Complex loop carried dependence of a->,c->,b-> prevents parallelization
我正在使用 OpenACC 比较使用 PGI 社区版 19.10(在 Windows 上)在 CPU 上并行化和非并行化矩阵乘法运算的执行时间。 我正在使用的代码是:
#include <time.h>
#include <stdlib.h>
// Benchmark program: allocates three SIZE_XY x SIZE_XY int matrices, fills
// a and b with pseudo-random values, multiplies them into c inside OpenACC
// "kernels" regions, and prints the first three results plus the elapsed
// clock() time.
// NOTE(review): printf is used below, but only <time.h> and <stdlib.h> are
// included in the lines shown; <stdio.h> appears to be missing.
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
// The timer starts before allocation, so the reported time covers
// allocation, initialization and the multiplication.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// Each matrix is an array of SIZE_XY row pointers. The array length is a
// const int variable rather than a macro or enum constant.
int *a[SIZE_XY];
int *b[SIZE_XY];
int *c[SIZE_XY];
// Every row is a separate malloc, so the rows of one matrix are not
// guaranteed to be adjacent in memory.
// NOTE(review): the malloc results are used without a NULL check.
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
// Initialization: a and b get values in [MIN_VAL, MIN_VAL + MAX_VAL - 1],
// c is zeroed.
// NOTE(review): rand() is called inside the kernels region; it relies on
// shared internal state, so confirm this loop is safe to parallelize.
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
// The row pointers are plain int * with no restrict qualifier and no
// "loop independent" directive, so nothing here tells the compiler that
// a, b and c do not alias each other.
#pragma acc kernels //HERE
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] accumulates the sum over k of a[i][k] * b[k][j].
// NOTE(review): operands can be about 7e6 each, so one product is about
// 4.9e13, which does not fit in a 32-bit int; overflow is likely.
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// Convert clock ticks to seconds.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
// NOTE(review): the row buffers are never freed and main has no explicit
// return statement.
}
当我在 PGI CMD 中运行以下命令时: pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c
我收到以下信息:
59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization
我能否获得一些帮助以了解为什么会发生这种情况以及如何并行化计算矩阵乘法的循环。
谢谢
编译器无法确定您的 3 个指针变量(a、b、c)是否会相互别名。如果它们以某种方式相互别名,就无法确定各个 c[i][j] 的计算彼此独立,编译器也就无法正确地并行化其中任何一个循环。
解决此问题的一种可能方法是告知编译器:您作为程序员保证(例如)第一个循环的各次迭代是彼此独立的。您可以在第一个 for 循环语句之前加上 #pragma acc loop independent 来做到这一点。对于您在此处选择的矩阵大小(和多核目标),这将提供足够多的可利用并行性。(编译器仍会通过 Minfo 消息报告其他循环未被并行化,但这多半没有问题:对于多核目标,拥有 1000 个并行工作项应该足以获得良好的性能。)
请注意,在您选择的初始化取值范围下,计算结果很容易超出 int 的表示范围(单个乘积就可达约 7×10^6 × 7×10^6 ≈ 4.9×10^13,而 32 位 int 的上限约为 2.1×10^9),您会得到无意义的结果。
以下代码解决了上述问题:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
// Benchmark program: same matrix multiplication as the question's version,
// but the row pointers are restrict-qualified and the outer multiply loop is
// marked "loop independent". a and b are filled with 1 instead of random
// values, so every element of c ends up equal to SIZE_XY (1000).
int main()
{
// seed the random number generator
// (rand() is no longer called in this function; its uses are commented
// out below, so the seed has no visible effect here.)
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
// MIN_VAL and MAX_VAL are only mentioned in commented-out code now.
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
// The timer starts before allocation, so the reported time covers
// allocation, initialization and the multiplication.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// restrict on each row pointer is the programmer's promise that the row it
// points to is not accessed through any other pointer.
int * restrict a[SIZE_XY];
int * restrict b[SIZE_XY];
int * restrict c[SIZE_XY];
// Every row is still a separate malloc, so rows are not guaranteed to be
// adjacent in memory.
// NOTE(review): the malloc results are used without a NULL check.
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
// Initialization: constant 1 in a and b keeps every product and sum small
// and removes the rand() calls from the kernels region.
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
// The messages below still say "random numbers" although a and b hold 1.
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
// Asserts to the compiler that iterations of the i loop do not depend on
// each other. Each i writes only row c[i], and the rows are distinct
// allocations. The directive applies to the i loop only; the j and k loops
// carry no directive.
#pragma acc loop independent
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] accumulates the sum over k of a[i][k] * b[k][j].
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// Convert clock ticks to seconds.
// NOTE(review): clock() reports processor time; on a multi-threaded run this
// presumably adds up time from all threads and can exceed wall-clock time.
// Confirm before comparing it against a serial build.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
// NOTE(review): the row buffers are never freed and main has no explicit
// return statement.
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict a[SIZE_XY];
^
"t1.c", line 22: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict b[SIZE_XY];
^
"t1.c", line 23: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict c[SIZE_XY];
^
"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
const int MIN_VAL = 5000;
^
"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
const int MAX_VAL = 7000000;
^
main:
33, Loop is parallelizable
Generating Multicore code
33, #pragma acc loop gang
35, Loop is parallelizable
52, Loop is parallelizable
Generating Multicore code
52, #pragma acc loop gang
54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real 0m9.079s
user 0m9.019s
sys 0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real 0m0.563s
user 0m20.053s
sys 0m0.132s
$
在我的机器上,使用 gcc 编译的代码大约需要 9 秒,而使用 PGI OpenACC 编译器编译的代码大约需要 0.5 秒。
顺便说一句,我个人通常会避免您选择的这种数组分配方式,因为多次独立的 malloc 调用并不保证得到相邻/连续的内存。不过,对于 multicore 目标,这段代码可以正常工作。
为了解决这个问题,我建议对您的代码进行一些更改,如下所示:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
// Element type of the matrices.
typedef int mt;
// Matrix dimension (rows and columns). Defined as a macro so it can be used
// as an array length in the typedef below.
#define SIZE_XY 1000
// One matrix row: SIZE_XY elements of type mt. A "mat *" pointing at
// SIZE_XY such rows can be indexed as [row][column].
typedef mt mat[SIZE_XY];
// Benchmark program: multiplies two SIZE_XY x SIZE_XY matrices of ones into
// c inside OpenACC kernels regions and prints the first three results plus
// the elapsed clock() time. Each matrix is one contiguous allocation reached
// through a restrict-qualified pointer, and no loop directive is used.
int main()
{
// seed the random number generator
// (rand() is no longer called in this function; its uses are commented
// out below, so the seed has no visible effect here.)
srand(42);
// Pick some arbitrary constraints to make the problem harder
int i, j, k; // iterators
double time_spent = 0.0;
// The timer starts before allocation, so the reported time covers
// allocation, initialization and the multiplication.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// restrict is the programmer's promise that each matrix is only accessed
// through its own pointer, i.e. a, b and c do not alias.
mat * restrict a;
mat * restrict b;
mat * restrict c;
// One malloc of SIZE_XY * SIZE_XY elements per matrix, so all rows of a
// matrix are contiguous.
// NOTE(review): the malloc results are used without a NULL check.
a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
// Initialization: a and b are set to 1, c to 0.
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
// The messages below still say "random numbers" although a and b hold 1.
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] accumulates the sum over k of a[i][k] * b[k][j]. With all-ones
// inputs each element finishes at SIZE_XY (1000).
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// Convert clock ticks to seconds.
// NOTE(review): clock() reports processor time; on a multi-threaded run this
// presumably adds up time from all threads and can exceed wall-clock time.
// Confirm before comparing it against a serial build.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
// NOTE(review): a, b and c are never freed and main has no explicit return
// statement.
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
32, Loop is parallelizable
Generating Multicore code
32, #pragma acc loop gang
34, Loop is parallelizable
51, Loop is parallelizable
Generating Multicore code
51, #pragma acc loop gang
53, Loop is parallelizable
55, Complex loop carried dependence of c-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real 0m0.708s
user 0m0.663s
sys 0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real 0m0.499s
user 0m17.466s
sys 0m0.093s
$
(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,总共 40 个内核)
这样做有几个优点:
1. 我们可以用 restrict 关键字把意图传达给编译器,而无需使用额外的编译指示。
2. a、b 和 c 各自得到一块连续的内存;如果您决定从 multicore 切换到加速器目标,其行为会更简单。
3. 即使是不使用 OpenACC 的编译器(此处为 gcc,使用 -O3)也能很好地优化这段代码,它生成的代码运行速度几乎与 OpenACC 代码一样快。(~0.7s 对 ~0.5s)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.