[英]Debug Assertion Failed For MPI program for matrix multiplication with some threads
再会。 我在运行乘以矩阵的 MPI 程序时遇到了一些问题。 这是代码(它不是我的代码)我从http://dkl.cs.arizona.edu/teaching/csc522-fall16/examples/hybrid-openmp-mm.c得到它
如果你能帮助我,我将不胜感激我也在寻找类似的问题和解决方案,但没有解决我的问题
#include <omp.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define TAG 13
int main(int argc, char* argv[]) {
double** A, ** B, ** C, * tmp;
double startTime, endTime;
int numElements, offset, stripSize, myrank, numnodes, N, i, j, k;
int numThreads, chunkSize = 10;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &numnodes);
N = atoi(argv[1]);
numThreads = atoi(argv[2]); // difference from MPI: how many threads/rank?
omp_set_num_threads(numThreads); // OpenMP call to set threads per rank
// allocate A, B, and C --- note that you want these to be
// contiguously allocated. Workers need less memory allocated.
if (myrank == 0) {
tmp = (double*)malloc(sizeof(double) * N * N);
A = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
A[i] = &tmp[i * N];
}
else {
tmp = (double*)malloc(sizeof(double) * N * N / numnodes);
A = (double**)malloc(sizeof(double*) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
A[i] = &tmp[i * N];
}
tmp = (double*)malloc(sizeof(double) * N * N);
B = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
B[i] = &tmp[i * N];
if (myrank == 0) {
tmp = (double*)malloc(sizeof(double) * N * N);
C = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
C[i] = &tmp[i * N];
}
else {
tmp = (double*)malloc(sizeof(double) * N * N / numnodes);
C = (double**)malloc(sizeof(double*) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
C[i] = &tmp[i * N];
}
if (myrank == 0) {
// initialize A and B
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
A[i][j] = 1.0;
B[i][j] = 1.0;
}
}
}
// start timer
if (myrank == 0) {
startTime = MPI_Wtime();
}
stripSize = N / numnodes;
// send each node its piece of A -- note could be done via MPI_Scatter
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i = 1; i < numnodes; i++) {
MPI_Send(A[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD);
offset += stripSize;
}
}
else { // receive my part of A
MPI_Recv(A[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
// everyone gets B
MPI_Bcast(B[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Let each process initialize C to zero
for (i = 0; i < stripSize; i++) {
for (j = 0; j < N; j++) {
C[i][j] = 0.0;
}
}
// do the work---this is the primary difference from the pure MPI program
#pragma omp parallel for shared(A,B,C,numThreads) private(i,j,k) schedule (static, chunkSize)
for (i = 0; i < stripSize; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
// master receives from workers -- note could be done via MPI_Gather
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i = 1; i < numnodes; i++) {
MPI_Recv(C[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
offset += stripSize;
}
}
else { // send my contribution to C
MPI_Send(C[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD);
}
// stop timer
if (myrank == 0) {
endTime = MPI_Wtime();
printf("Time is %f\n", endTime - startTime);
}
// print out matrix here, if I'm the master
if (myrank == 0 && N < 10) {
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf("%f ", C[i][j]);
}
printf("\n");
}
}
MPI_Finalize();
return 0;
}
您正在B
上执行MPI_Bcast
,就好像它是N*N
元素的连续块。 然而,它不是:它是一个指针数组,以N
单独的阵列用于长度N
。 所以要么你需要连续分配B
,要么你需要做N
广播。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.