Cuda - 內核執行后設備值為 0

[英]Cuda - Device values 0 after kernel execution

出於某種原因,當我執行我的程序時,設備變量的值為零。 就在我執行 cuda 內核之前,設備變量具有正確的值。 輸出圖像只是原始圖像大小的黑色。 所有內存分配和復制到主機和從主機復制似乎都是正確的。


    // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#ifdef _WIN32
#  define NOMINMAX
#  include <windows.h>

#define Image_Size 512
#define Kernel_Size 3

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include "./inc/helper_functions.h"    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include "./inc/helper_cuda.h"         // helper functions for CUDA error check

const char *imageFilename = "lena_bw.pgm";

const char *sampleName = "simpleTexture";

#define C_PI 3.141592653589793238462643383279502884197169399375

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
    // This function effectively swirls an image
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
    // Each thread on the GPU will process exactly one pixel
    // Before doing anything, we need to determine the current pixel we are calculating in this thread
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    // Test to see if we're testing a valid pixel
    if (i >= height || j >= width) return;  // Don't bother doing the calculation. We're not in a valid pixel location

    double cX = (double)width/2.0f;
    double cY = (double)height/2.0f;
    double relY = cY-i;
    double relX = j-cX;
    // relX and relY are points in our UV space
    // Calculate the angle our points are relative to UV origin. Everything is in radians.
    double originalAngle;
    if (relX != 0)
        originalAngle = atan(abs(relY)/abs(relX));
        if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
        else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
        else if (relX <=0 && relY <0) originalAngle += C_PI;
        // Take care of rare special case
        if (relY >= 0) originalAngle = 0.5f * C_PI;
        else originalAngle = 1.5f * C_PI;
    // Calculate the distance from the center of the UV using pythagorean distance
    double radius = sqrt(relX*relX + relY*relY);
    // Use any equation we want to determine how much to rotate image by
    //double newAngle = originalAngle + factor*radius;  // a progressive twist
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
    // Transform source UV coordinates back into bitmap coordinates
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX += cX;
    srcY += cY;
    srcY = height - srcY;
    // Clamp the source to legal image pixel
    if (srcX < 0) srcX = 0;
    else if (srcX >= width) srcX = width-1;
    if (srcY < 0) srcY = 0;
    else if (srcY >= height) srcY = height-1;
    // Set the pixel color
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];

// Declaration, forward
void runTest(int argc, char **argv);

// Program main
int main(int argc, char **argv)
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1)
        if (checkCmdLineFlag(argc, (const char **) argv, "input"))
            getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
        else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
            printf("-reference flag should be used with -input flag");

    runTest(argc, argv);

    printf("%s completed",
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);

//! Run a simple test for CUDA
void runTest(int argc, char **argv)
    int devID = findCudaDevice(argc, (const char **) argv);
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
    // load image from disk
    float *hData = NULL;
    float *host_array_kernel = 0;

    float *device_array_Image = 0;
    float *device_array_kernel = 0;
    float *device_array_Result = 0;

    unsigned int width, height;
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL)
        printf("Unable to source image file: %s\n", imageFilename);

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Allocation of device arrays using CudaMalloc
    cudaMalloc((void**)&device_array_Image, size);
    cudaMalloc((void**)&device_array_kernel, kernel_bytes);
    cudaMalloc((void**)&device_array_Result, size);

    host_array_kernel = (float*)malloc(kernel_bytes); // kernel

   // Allocate mem for the result on host side
   float *hOutputDataSharp = (float *) malloc(size);

    GenerateKernel (host_array_kernel);

// copy arrays and kernel from host to device
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));

    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    //Do the Convolution
    printf("DImage : '%.8f'\n",device_array_Image);
    printf("DKernel : '%.8f'\n",device_array_kernel);
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp);

    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
    printf("DResult : '%.8f'\n",device_array_Result);
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
    printf("HResult : '%.8f'\n",hOutputDataSharp);
    // Write result to file
    char outputSharp[1024];

    strcpy(outputSharp, imagePath);
    strcpy(outputSharp, "data/serial_sharptest.pgm");
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height);



此外,我對printf("DResult : '%.8f'\\n",device_array_Result);的輸出很好奇printf("DResult : '%.8f'\\n",device_array_Result); 因為device_array_Result在 GPU 地址空間中並使用cudaMalloc分配。 你在哪個設備上運行?


