[英]Multiple processes sharing / reading from one large block of CUDA device memory
I have a multi-process application with a single GPU using CUDA multi process service (MPS).我有一个多进程应用程序,其中一个 GPU 使用 CUDA 多进程服务 (MPS)。 Each process creates several device arrays, however one of them is large (~5 GB), and its a constant array, so I thought I could allocate the memory once with one process, and instruct other processes to read from that memory block using "inter process communication" (similar to the CUDA API example shown here ).
每个进程创建几个设备 arrays,但是其中一个很大(~5 GB),并且它是一个常量数组,所以我想我可以用一个进程分配 memory 一次,并指示其他进程使用“BF06CDED818D7”块从该 ZCD69B4957F06CDED818D7 读取进程间通信”(类似于此处显示的 CUDA API 示例)。
Following the linked CUDA example, I tried to implement a simple test program, but have been hitting an API error.按照链接的 CUDA 示例,我尝试实现一个简单的测试程序,但一直遇到 API 错误。 It seems when I call
cudaIPCOpenMemHandle
, I have an invalid argument.似乎当我调用
cudaIPCOpenMemHandle
时,我的论点无效。 I post the code below, in hopes that someone might easily identify the reason for the error, or perhaps suggest a better use of the CUDA API to accomplish what Im trying to do.我在下面发布代码,希望有人可以轻松找出错误的原因,或者建议更好地使用 CUDA API 来完成我想要做的事情。
#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <sys/mman.h>
#define blockSize 128
#define N 1000
#define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void kernel(double* out, double* in, double val){
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int thread_stride = blockDim.x * gridDim.x;
for (int i=tid; i < N; i+=thread_stride){
out[i] = in[i]*val;
}
}
static void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static void error_msg(cudaError_t err, int rank){
if (err != cudaSuccess){
printf("RANK %d recvd CUDA error message: %s\n", rank, cudaGetErrorString(err));
exit(err);
}
}
void check_access(){
cudaDeviceProp prop;
gpuErr(cudaGetDeviceProperties(&prop, 0));
if (prop.unifiedAddressing)
printf("> GPU%d = is capable of UVA\n", 0);
// NOTE: only interested in enabling intra-device peer2peer, so I think this test doesnt matter ?
//int can_access=-1;
//int num_dev=2;
//// note, here I was confused, I want the ability to have a process on device 0 access
//for (peer_dev=0; peer_dev <num_dev, peer_dev++){
// int peer_dev=0; // note if peer_dev is 1
// gpuErr(cudaDeviceCanAccessPeer(&can_access, 0,peer_dev));
// if (can_access)
// printf("device 0 has peerdev=%d access\n", peer_dev);
// else
// printf("device 0 has no peerdev=%d access\n", peer_dev);
//}
}
int main(){
MPI_Init(NULL,NULL);
int size,rank;
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank==0)
check_access();
gpuErr(cudaSetDevice(0));
double* out;
double * in;
gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));
cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
mmap(NULL, sizeof(cudaIpcMemHandle_t),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
assert(MAP_FAILED != memHand);
memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0){
gpuErr(cudaMalloc((void **)&in, N*sizeof(double)));
gpuErr(cudaIpcGetMemHandle((cudaIpcMemHandle_t *) &memHand[0], (void *)in));
double * temp = new double[N];
for (int i=0; i < N; i++)
temp[i] = 1;
gpuErr(cudaMemcpy(in, temp, N*sizeof(double), cudaMemcpyHostToDevice));
delete temp;
}
MPI_Barrier(MPI_COMM_WORLD);
// the following is throwing a CUDAerror, invalid
if (rank >0 )
gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
MPI_Barrier(MPI_COMM_WORLD);
int numBlocks = (N + blockSize - 1) / blockSize;
double rank_val=(double) rank;
kernel<<<numBlocks, blockSize>>>(out, in, rank_val);
error_msg(cudaGetLastError(), rank);
gpuErr(cudaDeviceSynchronize());
MPI_Barrier(MPI_COMM_WORLD);
// test the kernel results
double sum = 0;
for (int i=0; i < N; i++)
sum += out[i];
printf("mpirank=%d, comm.size=%d, result=%f\n", rank, size, sum);
assert(sum==N*rank);
// cleanup
if (rank>0)
cudaIpcCloseMemHandle(in);
cudaFree(out);
if (rank==0)
cudaFree(in);
return 0;
}
I compile with我编译
nvcc -I/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/include --compiler-options=-march=skylake-avx512 -L/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib -lmpi ipc_tester.cu
Single process job output: (looks correct)单进程作业 output:(看起来正确)
$ srun -n1 -c2 ./a.out
> GPU0 = is capable of UVA
mpirank=0, comm.size=1, result=0.000000
Multi process job output: (hits error in call cudaIPCOpenMemHandle
)多进程作业 output:(调用
cudaIPCOpenMemHandle
时遇到错误)
$ srun -n2 -c2 ./a.out
GPUassert: invalid argument ipc_tester.cu 92
Compute sanitizer output:计算消毒剂 output:
$ srun -n2 -c2 compute-sanitizer ./a.out
========= COMPUTE-SANITIZER
========= COMPUTE-SANITIZER
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15553e9095a9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90a7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90abfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:ucp_init_version [0x15553e90b7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:uct_cuda_base_query_devices [0x155546040170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:mca_pml_ucx_open [0x15553edc7e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:uct_md_query_tl_resources [0x1555466c54c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15554690a5a9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690b7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690bbfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:ucp_init_version [0x15554690c7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x403f04]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= Host Frame:mca_pml_ucx_open [0x155546dc8e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame: [0x403f04]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15553e9095a9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90a7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90abfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:ucp_init_version [0x15553e90b7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:mca_pml_ucx_open [0x15553edc7e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:uct_cuda_base_query_devices [0x155546040170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame: [0x403f04]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:uct_md_query_tl_resources [0x1555466c54c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15554690a5a9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690b7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690bbfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:ucp_init_version [0x15554690c7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Host Frame:mca_pml_ucx_open [0x155546dc8e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame: [0x403f04]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:cuCtxGetDevice [0x155550d083eb]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:uct_cuda_base_query_devices [0x155546040170]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
========= Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15553e9095a9]
========= Host Frame:uct_md_query_tl_resources [0x1555466c54c6]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
========= Host Frame: [0x15554690a5a9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690b7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15554690bbfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90a7f9]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame: [0x15553e90abfd]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:ucp_init_version [0x15553e90b7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:ucp_init_version [0x15554690c7f3]
========= in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
========= Host Frame:mca_pml_ucx_open [0x155546dc8e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_pml_ucx_open [0x15553edc7e70]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:mca_base_framework_components_open [0x15555299ef2d]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame: [0x155554472ec7]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:mca_base_framework_open [0x1555529a8b31]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:ompi_mpi_init [0x15555447fb5b]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame: [0x403f04]
========= Host Frame:MPI_Init [0x15555442dc01]
========= in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
========= Host Frame: [0x403f04]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
========= Program hit invalid argument (error 1) on CUDA API call to cudaIpcOpenMemHandle.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x155550dde1b3]
========= in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
========= Host Frame: [0x433fac]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame: [0x40412e]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= Host Frame:__libc_start_main [0x1555531173ea]
========= in /lib64/libc.so.6
========= Host Frame: [0x403d1a]
========= in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========
GPUassert: invalid argument ipc_tester.cu 92
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 4 errors
System info:系统信息:
$ lsb_release -a
LSB Version: n/a
Distributor ID: SUSE
Description: SUSE Linux Enterprise Server 15 SP2
Release: 15.2
Codename: n/a
$ nvidia-smi
Tue Sep 27 10:05:48 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04 Driver Version: 450.102.04 CUDA Version: 11.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 34C P0 38W / 300W | 0MiB / 16160MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
As pointed out in the comments , the memHandler was not properly set on ranks>0.正如评论中所指出的,memHandler 没有正确设置为ranks>0。
After learning how to broadcast the memHandler , I arrived at a solution.在学习了如何广播 memHandler 之后,我得出了一个解决方案。 The patch below leads to a working code.
下面的补丁导致了一个工作代码。
@@ -66,12 +66,7 @@ int main(){
double * in;
gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));
- cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
- mmap(NULL, sizeof(cudaIpcMemHandle_t),
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
-
- assert(MAP_FAILED != memHand);
- memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));
+ cudaIpcMemHandle_t memHand[1];
MPI_Barrier(MPI_COMM_WORLD);
@@ -87,6 +82,21 @@ int main(){
}
MPI_Barrier(MPI_COMM_WORLD);
+// Broadcast the MPI handle
+// get size of memHandler container needed for broadcast
+ int hand_size[1];
+ if (rank==0)
+ hand_size[0]= sizeof(memHand[0]);
+ MPI_Bcast(&hand_size[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+ // create the char container for memHandler broadcast
+ char memHand_C[hand_size[0]];
+ if (rank==0)
+ memcpy(&memHand_C, &memHand[0], hand_size[0]);
+ MPI_Bcast(&memHand_C, hand_size[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+ if (rank >0)
+ memcpy(&memHand[0], &memHand_C, hand_size[0]);
+
// the following is throwing a CUDAerror, invalid
if (rank >0 )
gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.