[英]Why is cuMemAddressReserve() failing with CUDA_INVALID_VALUE?
Consider the following program (written in C syntax):考虑以下程序(用 C 语法编写):
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Minimal reproduction: initialize the CUDA driver API, create a context on
 * device 0, then try to reserve a 0x20000-byte virtual address range.
 * Exits with EXIT_FAILURE if any driver call does not return CUDA_SUCCESS;
 * only the reservation failure prints a message.
 */
int main() {
    /* Bring up the driver API with default flags. */
    if (cuInit(0) != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

    /*
     * Create a context on device 0 with default flags.
     * Note: The created context is also made the current context,
     * so we are _in_ a context from now on.
     */
    CUcontext ctx;
    const CUdevice device_id = 0;
    if (cuCtxCreate(&ctx, 0, device_id) != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

    /* Reservation parameters. */
    const size_t size = 0x20000;
    const size_t alignment = 0;                /* default */
    const CUdeviceptr requested = 0;           /* no particular address requested */
    const unsigned long long reserve_flags = 0;
    CUdeviceptr reserved;

    // -----------------------------------
    // ==>> FAILURE on next statement <<==
    // -----------------------------------
    const CUresult status =
        cuMemAddressReserve(&reserved, size, alignment, requested, reserve_flags);
    if (status == CUDA_SUCCESS) {
        return 0;
    }

    /* Report the driver's description of the failure and bail out. */
    const char* error_string;
    cuGetErrorString(status, &error_string);
    fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
    exit(EXIT_FAILURE);
}
This fails when trying to make the reservation:尝试预订时失败:
cuMemAddressReserve() failed: invalid argument
What's wrong with my arguments? Is it the size? The alignment? Requesting an address of 0? If it's the latter - how can I even know what address to request, when I don't really care?
我的参数有什么问题?是大小吗?是对齐吗?是请求地址为 0 吗?如果是后者 - 当我真的不在乎的时候,我怎么知道要请求哪个地址?
If I recall correctly, the sizes for virtual memory management functions must be a multiple of CUDA's allocation granularity.如果我没记错的话,虚拟 memory 管理函数的大小必须是 CUDA 分配粒度的倍数。 See
cuMemGetAllocationGranularity
and this blog post https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/请参阅
cuMemGetAllocationGranularity
和这篇博文 https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
The following works on my machine.以下适用于我的机器。
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Reserve a virtual address range whose size is rounded up to the device's
 * minimum allocation granularity, then release it again.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any driver call fails.
 */
int main() {
    CUresult result;
    unsigned int init_flags = 0;
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }

    CUcontext ctx;
    unsigned int ctx_create_flags = 0;
    CUdevice device_id = 0;
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }

    CUdeviceptr requested = 0;
    CUdeviceptr reserved;
    size_t size = 0x20000;
    size_t alignment = 0; // default
    unsigned long long reserve_flags = 0;

    // Zero the whole property struct first so that fields we do not set
    // explicitly (requested handle types, allocation flags) are not passed
    // to the driver with indeterminate values.
    size_t granularity = 0;
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = (int)device_id; // query the same device the context was created on
    prop.win32HandleMetaData = NULL;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    // %zu is the portable conversion for size_t (%lu is wrong where long is 32-bit).
    printf("minimum granularity %zu\n", granularity);

    // Round the requested size up to the next multiple of the granularity.
    size_t padded_size = ((size + granularity - 1) / granularity) * granularity;
    result = cuMemAddressReserve(&reserved, padded_size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        // cuGetErrorString() may fail and leave the string pointer NULL for an
        // unrecognized code; never hand NULL to %s.
        const char* error_string = NULL;
        if (cuGetErrorString(result, &error_string) != CUDA_SUCCESS || error_string == NULL) {
            error_string = "unknown error";
        }
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }

    // Release the reservation and the context instead of leaking them at exit.
    result = cuMemAddressFree(reserved, padded_size);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    result = cuCtxDestroy(ctx);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    return 0;
}
tl;dr: Your reserved region size is not a multiple of (some device's) allocation granularity. tl; dr:您的保留区域大小不是(某些设备的)分配粒度的倍数。
As @AbatorAbetor suggested, cuMemAddressReserve()
implicitly requires the size of the memory region to be a multiple of some granularity value.正如@AbatorAbetor 建议的那样,
cuMemAddressReserve()
隐式要求 memory 区域的大小是某个粒度值的倍数。 And despite 0x20000 seeming like a generous enough value for that (2^17 bytes... system memory pages are typically 4 KiB = 2^12 bytes) - NVIDIA GPUs are very demanding here.尽管 0x20000 似乎是一个足够大的值(2^17 字节......系统 memory 页面通常为 4 KiB = 2^12 字节) - NVIDIA GPU 在这里非常苛刻。
For example, a Pascal GTX 1050 Ti GPU with ~4GB of memory has a granularity of 0x200000, or 2 MiB - 16 times more than what you were trying to allocate.例如,Pascal GTX 1050 Ti GPU 和 ~4GB memory 的粒度为 0x200000,或 2 MiB - 比您尝试分配的多 16 倍。
Now, what would happen if we had two devices with different granularity values?现在,如果我们有两个具有不同粒度值的设备会发生什么? Would we need to use the least-common-multiple?
我们需要使用最小公倍数吗? Who knows.
谁知道。
Anyway, bottom line: Always check the granularity both before allocating and before reserving.无论如何,底线:始终在分配之前和保留之前检查粒度。
I have filed this as a documentation bug with NVIDIA, bug 3486420 (but you may not be able to follow the link, because NVIDIA hide their bugs from their users).我已将此作为文档错误提交给 NVIDIA,错误 3486420 (但您可能无法访问该链接,因为 NVIDIA 对其用户隐藏了他们的错误)。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.