cuda内核调用/传递参数中的编译错误

Question

在实际代码中，我的意图是将输入数组的每个元素与一个标量进行比较，从而得到输出数组。 简单来说，就是 output = input > scalar。

如下所示，简单的示例主机端代码正在按预期方式工作。

    // Host-side example: compare every element of a float buffer against a
    // scalar on the GPU. W1, H1, W2, outw, outh and gpuErrchk are defined
    // outside this snippet.

    // Host buffers for the input values and the comparison results.
    // NOTE(review): the result buffers are sized H1*W2 while the input is
    // W1*H1 -- confirm W2 is intended (it must cover every thread launched).
    float *h_data1 = (float *)malloc(W1*H1 * sizeof(float));
    bool *h_result = (bool *)malloc(H1*W2 * sizeof(bool));

    // Matching device buffers.
    float *d_data1;      gpuErrchk(cudaMalloc(&d_data1, W1*H1 * sizeof(float)));
    bool *d_result;    gpuErrchk(cudaMalloc(&d_result, H1*W2 * sizeof(bool)));

    // Fill the input with 0, 1, 2, ... so the expected output is easy to predict.
    for (int i = 0; i < W1*H1; i++) h_data1[i] = (float)i;

    // Upload the input to the device.
    gpuErrchk(cudaMemcpy(d_data1, h_data1, W1*H1 * sizeof(float), cudaMemcpyHostToDevice));

    // Launch outw blocks of outh threads; each thread writes
    // d_result[i] = (d_data1[i] > scalar) for its own global index i.
    float scalar = 2;
    compGraterRetOut<float, bool><< <outw, outh >> > (d_data1, d_result, scalar);

    // First check for launch errors, then wait for the kernel so that
    // errors raised during execution are reported here as well.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

设备端代码是

/**
 * Element-wise "greater than scalar" comparison.
 *
 * Each thread handles exactly one element, selected by its 1D global index
 * (blockIdx.x * blockDim.x + threadIdx.x), and stores the result of
 * `dataIn[index] > scalar` converted to TTypeOut.
 *
 * There is no bounds check: the launch configuration must not create more
 * threads than there are elements in dataIn / dataOut.
 *
 * @param dataIn   input values to compare
 * @param dataOut  output buffer receiving one comparison result per thread
 * @param scalar   value every input element is compared against
 */
template<typename TType, typename TTypeOut>
__global__  void compGraterRetOut(TType *dataIn, TTypeOut *dataOut, const TType scalar)
{
    // Flat 1D global thread index.
    const int elementIndex = threadIdx.x + blockDim.x * blockIdx.x;

    dataOut[elementIndex] = dataIn[elementIndex] > scalar;
}

进入实际的代码，我有一个如下所示的图像类（仅显示该类的一部分）。

// Image whose pixel storage is allocated with cudaMalloc by the constructor.
// Only part of the class is shown here.
//
// TType      - element type of the pixel buffer.
// TImageType - ImageType tag (declared elsewhere); it is part of the class
//              type, so Image<float, X> and Image<uint32_t, X> are two
//              unrelated classes.
template<typename TType, ImageType TImageType>
class Image
{
public:
    // Allocates storage for width * height * depth elements (depth defaults to 1).
    Image(uint32_t width, uint32_t height, uint32_t depth = 1);

private:
    // Private members are accessible only from the SAME specialization:
    // code in Image<float, X> cannot touch Image<uint32_t, X>::m_data
    // unless a friend declaration or a public accessor is added.
    TType* m_data;          // buffer obtained from cudaMalloc in the constructor
    uint32_t m_width;
    uint32_t m_height;
    uint32_t m_depth;
    uint32_t m_bufferSize;  // element count (width * height * depth), set after a successful allocation
};

/**
 * Constructs an image and allocates its pixel buffer with cudaMalloc.
 *
 * An empty image (any dimension equal to zero) performs no allocation and is
 * left with m_data == nullptr and m_bufferSize == 0, so the object is always
 * in a well-defined state.
 *
 * @param width   image width in elements
 * @param height  image height in elements
 * @param depth   number of planes / channels
 */
template<typename TType, ImageType TImageType>
Image<TType, TImageType>::Image(uint32_t width, uint32_t height, uint32_t depth)
    : m_data(nullptr),
      m_width(width),
      m_height(height),
      m_depth(depth),
      m_bufferSize(0)
{
    // Nothing to allocate for an empty image.
    if (width == 0 || height == 0 || depth == 0)
        return;

    // Compute the element count in size_t so the byte count passed to
    // cudaMalloc cannot wrap around in 32-bit arithmetic.
    const size_t elementCount = static_cast<size_t>(m_width) * m_height * m_depth;

    cudaError_t cudaStatus;
    gpuErrchk(cudaStatus = cudaMalloc(&m_data, sizeof(TType) * elementCount));
    if (cudaStatus == cudaSuccess)
    {
        m_bufferSize = static_cast<uint32_t>(elementCount);
    }
    else
    {
        // Only reached when gpuErrchk does not abort: keep the object in the
        // "empty" state instead of holding an unspecified pointer.
        m_data = nullptr;
        std::cout << "Error malloc function failed [" << cudaStatus << "]" << std::endl;
    }
}

为了实现目标out = in>标量，对operator>进行了如下所示的重载。 这引发了编译错误，因为

“成员 "Image<TType, TImageType>::m_data [with TType = float_t, TImageType = ImageType::WHD]" 不可访问（is inaccessible）”

代码如下所示。

// First attempt at `out = in > scalar`: the result image is created through
// the normal constructor (which allocates its own buffer) and the kernel
// writes straight into the result's buffer.
//
// This version does NOT compile: `ret` is an Image<uint32_t, TImageType>,
// a different class from Image<TType, TImageType>, so its private member
// m_data is inaccessible from this member function.
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
        Image<uint32_t, TImageType> ret(m_width, m_height, m_depth);

        // `ret.m_data` is the expression the compiler rejects (private member
        // of another specialization).
        // Launches m_width * 4 blocks of (m_height * m_depth / 4) threads.
        compGraterRetOut<TType, uint32_t> << <m_width * 4, (m_height * m_depth/4) >> > (m_data, ret.m_data, scalar);

        // Check the launch, then wait for the kernel to finish.
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaDeviceSynchronize());
        return std::move(ret);
}

为了解决编译错误，我更改了函数operator>。 在这里，CUDA内存分配在函数内部，而不是在类的构造函数内部。

/**
 * Element-wise comparison `image > scalar`, evaluated on the device.
 *
 * The output buffer is allocated here and handed to the result image through
 * its (pointer, width, height, depth) constructor, so this function never
 * touches private members of Image<uint32_t, TImageType>.
 *
 * The kernel has no bounds check and processes exactly one element per
 * thread, so the work is split into
 *   - a launch of full blocks covering the first fullBlocks * kThreadsPerBlock
 *     elements, and
 *   - one partial block covering the remaining tail.
 * This way every element is processed exactly once for any image size, and
 * the block size never exceeds the per-block thread limit.
 *
 * @param scalar  value every pixel is compared against
 * @return image of uint32_t holding the result of (pixel > scalar) per element
 */
template<class TType, ImageType TImageType>
inline Image<uint32_t, TImageType> Image<TType, TImageType>::operator>(TType scalar) const
{
    // size_t arithmetic avoids 32-bit overflow for large images.
    const size_t elementCount = static_cast<size_t>(m_width) * m_height * m_depth;

    uint32_t *dataout = nullptr;
    gpuErrchk(cudaMalloc(&dataout, elementCount * sizeof(uint32_t)));

    Image<uint32_t, TImageType> ret(dataout, m_width, m_height, m_depth);

    const unsigned int kThreadsPerBlock = 256;
    const size_t fullBlocks = elementCount / kThreadsPerBlock;
    const unsigned int tailThreads = static_cast<unsigned int>(elementCount % kThreadsPerBlock);

    if (fullBlocks > 0)
    {
        compGraterRetOut<TType, uint32_t><<<static_cast<unsigned int>(fullBlocks), kThreadsPerBlock>>>(m_data, dataout, scalar);
        gpuErrchk(cudaGetLastError());
    }

    if (tailThreads > 0)
    {
        // Remaining elements: a single block with exactly one thread per
        // leftover element, operating on pointers shifted past the part
        // already covered by the full blocks.
        const size_t offset = fullBlocks * kThreadsPerBlock;
        compGraterRetOut<TType, uint32_t><<<1, tailThreads>>>(m_data + offset, dataout + offset, scalar);
        gpuErrchk(cudaGetLastError());
    }

    gpuErrchk(cudaDeviceSynchronize());

    // Returning the local by name lets the compiler elide the copy/move;
    // std::move here would only inhibit that optimisation.
    return ret;
}

最后，我的问题是：为什么最后一个版本的代码可以无错误地编译，而第一个版本却不行？

Answer 1

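这个问题与CUDA无关，而是模板与面向对象编程（OOP）中访问控制的问题。 类模板的每一组模板参数都会实例化出一个独立的类，例如 Image<float, …> 和 Image<uint32_t, …> 是两个互不相关的类。 当成员函数访问与自身类型完全相同的对象的私有成员时，并不违反OOP的封装原则；但访问模板参数不同的另一个实例化类的私有成员（如第一个版本中的 ret.m_data）则违反了封装，因此编译器报告该成员不可访问。 这就是答案。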

cuda内核调用/传递参数中的编译错误

问题描述

1 个解决方案

解决方案1
1 2019-07-29 07:01:40

cuda内核调用/传递参数中的编译错误

问题描述

1 个解决方案

解决方案1 1 2019-07-29 07:01:40

解决方案1
1 2019-07-29 07:01:40