Cuda - 内核执行后设备值为 0

Question

For some reason when I execute my program the device variables have a zero values.出于某种原因，当我执行我的程序时，设备变量的值为零。 Just before I execute the cuda kernel the device variables have the correct values.就在我执行 cuda 内核之前，设备变量具有正确的值。 The output image is just black of the original image size.输出图像只是原始图像大小的黑色。 All the memory allocations and copying to and from host seem to be correct.所有内存分配和复制到主机和从主机复制似乎都是正确的。

Thanks for any help!谢谢你的帮助！

    // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#ifdef _WIN32
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#endif

#define Image_Size 512
#define Kernel_Size 3

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include "./inc/helper_functions.h"    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include "./inc/helper_cuda.h"         // helper functions for CUDA error check

const char *imageFilename = "lena_bw.pgm";

const char *sampleName = "simpleTexture";

#define C_PI 3.141592653589793238462643383279502884197169399375

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
    // This function effectively swirls an image
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
    // Each thread on the GPU will process exactly one pixel
    // Before doing anything, we need to determine the current pixel we are calculating in this thread
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    // Test to see if we're testing a valid pixel
    if (i >= height || j >= width) return;  // Don't bother doing the calculation. We're not in a valid pixel location

    double cX = (double)width/2.0f;
    double cY = (double)height/2.0f;
    double relY = cY-i;
    double relX = j-cX;
    // relX and relY are points in our UV space
    // Calculate the angle our points are relative to UV origin. Everything is in radians.
    double originalAngle;
    if (relX != 0)
    {
        originalAngle = atan(abs(relY)/abs(relX));
        if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
        else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
        else if (relX <=0 && relY <0) originalAngle += C_PI;
    }
    else
    {
        // Take care of rare special case
        if (relY >= 0) originalAngle = 0.5f * C_PI;
        else originalAngle = 1.5f * C_PI;
    }
    // Calculate the distance from the center of the UV using pythagorean distance
    double radius = sqrt(relX*relX + relY*relY);
    // Use any equation we want to determine how much to rotate image by
    //double newAngle = originalAngle + factor*radius;  // a progressive twist
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
    // Transform source UV coordinates back into bitmap coordinates
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX += cX;
    srcY += cY;
    srcY = height - srcY;
    // Clamp the source to legal image pixel
    if (srcX < 0) srcX = 0;
    else if (srcX >= width) srcX = width-1;
    if (srcY < 0) srcY = 0;
    else if (srcY >= height) srcY = height-1;
    // Set the pixel color
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}




////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "input"))
        {
            getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
        }
        else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
        {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }

    runTest(argc, argv);

    cudaDeviceReset();
    printf("%s completed",
           sampleName);
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **) argv);
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
    // load image from disk
    float *hData = NULL;
    float *host_array_kernel = 0;

    float *device_array_Image = 0;
    float *device_array_kernel = 0;
    float *device_array_Result = 0;


    unsigned int width, height;
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL)
    {
        printf("Unable to source image file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Allocation of device arrays using CudaMalloc
    cudaMalloc((void**)&device_array_Image, size);
    cudaMalloc((void**)&device_array_kernel, kernel_bytes);
    cudaMalloc((void**)&device_array_Result, size);


    host_array_kernel = (float*)malloc(kernel_bytes); // kernel


   // Allocate mem for the result on host side
   float *hOutputDataSharp = (float *) malloc(size);

    GenerateKernel (host_array_kernel);


// copy arrays and kernel from host to device
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));



    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    //Do the Convolution
    printf("DImage : '%.8f'\n",device_array_Image);
    printf("DKernel : '%.8f'\n",device_array_kernel);
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp);


    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
    printf("DResult : '%.8f'\n",device_array_Result);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
    printf("HResult : '%.8f'\n",hOutputDataSharp);
    // Write result to file
    char outputSharp[1024];

    strcpy(outputSharp, imagePath);
    strcpy(outputSharp, "data/serial_sharptest.pgm");
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height);

    cudaFree(device_array_Result);
    cudaFree(device_array_Image);
    cudaFree(device_array_kernel);
    free(hData);
    free(imagePath);
    //free(host_array_Image);
    free(host_array_kernel);
    free(hOutputDataSharp);
    //free(hOutputImage);
    //free(hOutputKernel);
}

Answer 1

Your code is writing in the source image:您的代码正在源图像中写入：

pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];

which writes to device_array_Image which is the source, not the destination you are expecting results in.它写入device_array_Image这是源，而不是您期望的目标结果。

Moreover, I am very curious on the output of printf("DResult : '%.8f'\\n",device_array_Result);此外，我对printf("DResult : '%.8f'\\n",device_array_Result);的输出很好奇printf("DResult : '%.8f'\\n",device_array_Result); as device_array_Result is in GPU address space and allocated with cudaMalloc .因为device_array_Result在 GPU 地址空间中并使用cudaMalloc分配。 On which device are you running ?你在哪个设备上运行？

Cuda - 内核执行后设备值为 0

问题描述

1 个解决方案

解决方案1
1 2016-04-28 20:48:12

Cuda - 内核执行后设备值为 0

问题描述

1 个解决方案

解决方案1 1 2016-04-28 20:48:12

解决方案1
1 2016-04-28 20:48:12