[英]Calling a "__device__ __host__" function from an external file by a CUDA kernel function
I'm trying to play with mixing CUDA and C++. I encountered the following error:我正在尝试混合使用 CUDA 和 C++。我遇到了以下错误:
main.cpp: define "main()".
main.cpp:定义“main()”。 Call "gpu_main()" and "add_test()"
调用“gpu_main()”和“add_test()”
||
|--> add_func.cu: define "gpu_main()" and "__global__ void add()" as kernel. The "add()" will call "add_test()"|--> add_func.cu: 将“gpu_main()”和“__global__ void add()”定义为 kernel。“add()”将调用“add_test()”
||
|--> basic_add.cu: define "__host__ __device__ int add_test(int a, int b)"|--> basic_add.cu: 定义“__host__ __device__ int add_test(int a, int b)”
I compile the code this way:我这样编译代码:
nvcc basic_add.cu -c
nvcc -rdc=true add_func.cu -c
g++ main.cpp -c
g++ -o main main.o basic_add.o add_func.o -lcudart -L/usr/local/cuda/lib64
At the 2nd step, it gave me this error:在第二步,它给了我这个错误:
add_func.cu(14): error: calling a host function("add_test") from a global function("add") is not allowed
add_func.cu(14):错误:不允许从全局函数(“add”)调用主机函数(“add_test”)
add_func.cu(14): error: identifier "add_test" is undefined in device code
add_func.cu(14):错误:标识符“add_test”在设备代码中未定义
Does anyone have any idea of how to fix this problem?有谁知道如何解决这个问题？ Or shouldn't I call a host & device function from an external file?
或者说，我不应该从外部文件调用一个主机和设备两用的函数吗？ Thanks.
谢谢。
The code is as following (just for reference):代码如下(仅供参考):
#ifndef BASIC_ADD_H_
#define BASIC_ADD_H_
// NOTE(review): declared here WITHOUT __host__ __device__, while the
// definition in basic_add.cu carries both qualifiers. nvcc therefore sees
// add_test as a host-only function inside device code — this mismatch is
// exactly what produces the "calling a host function from a global
// function" error quoted above.
int add_test( int a, int b );
#endif
__host__ __device__ int add_test(int a, int b)
{
    // Callable from both host and device code; returns the sum of the
    // two operands.
    int sum = b + a;
    return sum;
}
#ifndef ADD_FUNC_H_
#define ADD_FUNC_H_
#include <iostream>
#include <math.h>
#include "basic_add.h"
// Host-side entry point: allocates data, launches the add kernel and
// verifies the result (defined in add_func.cu).
int gpu_main(void);
#endif
#include "add_func.h"
// Kernel function to add the elements of two arrays
// Each thread starts at its flat global index and advances by the total
// number of threads in the grid (grid-stride pattern).
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
printf("gridDim %d, blockDim %d, blockIdx %d, threadIdx %d\n", gridDim.x, blockDim.x, blockIdx.x, threadIdx.x);
for (int i = index; i < n; i += stride)
{
// This is the line nvcc rejects: basic_add.h declares add_test as a plain
// host function, so from device code it is "undefined"/host-only.
y[i] = add_test(x[i],y[i]);
printf("blockIdx %d, threadIdx %d, %d\n", blockIdx.x, threadIdx.x, i);
// NOTE(review): this break caps every thread at one element, so the
// grid-stride loop never takes a second iteration; elements beyond the
// grid size would be left unprocessed.
break;
}
}
// Allocates two managed arrays, initializes them on the host, launches the
// add kernel over all N elements, then synchronizes and checks the result.
int gpu_main(void)
{
int N = 1<<10;
float *x, *y;
// Allocate Unified Memory – accessible from CPU or GPU
// NOTE(review): none of the CUDA API calls below check their return codes;
// a failed allocation, launch, or sync would go unnoticed.
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Run kernel on 1M elements on the GPU
// (Comment is stale: N is 1<<10 = 1024 here, not 1M.)
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
return 0;
}
#include <iostream>
#include <math.h>
#include "add_func.h"
#include "basic_add.h"
int main(void)
{
    // Run the GPU demo first, then call add_test directly from host code.
    gpu_main();
    int sum = add_test(1, 2);
    std::cout << sum << std::endl;
    return 0;
}
One of the things you are trying to do here isn't workable. If you want to have a function decorated with `__host__ __device__`, first of all you should decorate it the same way everywhere (i.e. also in your header file where you declare it), and such a function won't be directly callable from a .cpp file unless you compile that .cpp file with nvcc and pass `-x cu` as a compile command line switch — so you may as well just put it in a .cu file, from my perspective.
你在这里尝试做的事情之一是行不通的。如果你想用 `__host__ __device__` 修饰一个函数，首先你应该在所有地方都以相同的方式修饰它（即在声明它的头文件中也要加上这两个限定符）；而且这样的函数无法直接从 .cpp 文件中调用，除非你用 nvcc 编译该 .cpp 文件并传入 `-x cu` 编译命令行开关——因此在我看来，你不如直接把它放进 .cu 文件。
You're also not doing relocatable device code linking properly, but that is fixable.您也没有正确链接可重定位设备代码,但这是可以修复的。
If you want to have a `__host__ __device__` function callable from a .cpp file compiled with e.g. g++, then the only suggestion I have is to provide a wrapper for it.
如果你想在用 g++ 等编译的 .cpp 文件中调用 `__host__ __device__` 函数，那么我唯一的建议就是为它提供一个包装函数。
The following is the closest I could come to what you have:以下是我最接近你所拥有的:
$ cat basic_add.h
#ifndef BASIC_ADD_H_
#define BASIC_ADD_H_
// The declaration carries the same __host__ __device__ qualifiers as the
// definition in basic_add.cu, so device code sees a device-callable
// function (fixing the mismatch in the original question).
__host__ __device__ int add_test( int a, int b );
#endif
$ cat basic_add.cu
__host__ __device__ int add_test(int a, int b)
{
    // Sum of the two operands, usable from both host and device code.
    int sum = a + b;
    return sum;
}

// Plain host wrapper so translation units compiled by g++ (which cannot
// parse CUDA qualifiers) can still reach add_test.
int my_add_test(int a, int b)
{
    return add_test(a, b);
}
$ cat add_func.h
#ifndef ADD_FUNC_H_
#define ADD_FUNC_H_
#include <iostream>
#include <math.h>
// Host-only wrapper around the __host__ __device__ add_test (defined in
// basic_add.cu); the only declaration a g++-compiled .cpp file needs.
int my_add_test(int a, int b);
// Host-side entry point: allocates data, launches the kernel, checks results.
int gpu_main(void);
#endif
$ cat add_func.cu
#include "basic_add.h"
#include <iostream>
// Kernel function to add the elements of two arrays
// Each thread starts at its flat global index and advances by the total
// number of threads in the grid (grid-stride pattern). add_test is now
// visible as __host__ __device__ via basic_add.h, so the device call is legal.
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
printf("gridDim %d, blockDim %d, blockIdx %d, threadIdx %d\n", gridDim.x, blockDim.x, blockIdx.x, threadIdx.x);
for (int i = index; i < n; i += stride)
{
y[i] = add_test(x[i],y[i]);
printf("blockIdx %d, threadIdx %d, %d\n", blockIdx.x, threadIdx.x, i);
// NOTE(review): this break caps every thread at one element, so the
// grid-stride loop never takes a second iteration; elements beyond the
// grid size would be left unprocessed.
break;
}
}
// Allocates two managed arrays, initializes them on the host, launches the
// add kernel over all N elements, then synchronizes and checks the result.
int gpu_main(void)
{
int N = 1<<10;
float *x, *y;
// Allocate Unified Memory . accessible from CPU or GPU
// NOTE(review): none of the CUDA API calls below check their return codes;
// a failed allocation, launch, or sync would go unnoticed.
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Run kernel on 1M elements on the GPU
// (Comment is stale: N is 1<<10 = 1024 here, not 1M.)
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
return 0;
}
$ cat main.cpp
#include <iostream>
#include <math.h>
#include "add_func.h"
int main(void)
{
    // Launch the GPU demo, then exercise add_test through its host wrapper.
    gpu_main();
    int sum = my_add_test(1, 2);
    std::cout << sum << std::endl;
    return 0;
}
$ nvcc -dc basic_add.cu
$ nvcc -dc add_func.cu
$ nvcc -dlink -o add.dlink.o add_func.o basic_add.o
$ g++ -c main.cpp
$ g++ main.o add.dlink.o add_func.o basic_add.o -o test -L/usr/local/cuda/lib64 -lcudart
$
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.