[英]Cuda - Device values 0 after kernel execution
For some reason when I execute my program the device variables have a zero values.出于某种原因,当我执行我的程序时,设备变量的值为零。 Just before I execute the cuda kernel the device variables have the correct values.
就在我执行 cuda 内核之前,设备变量具有正确的值。 The output image is just black of the original image size.
输出图像只是原始图像大小的黑色。 All the memory allocations and copying to and from host seem to be correct.
所有内存分配和复制到主机和从主机复制似乎都是正确的。
Thanks for any help!谢谢你的帮助!
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
// This function effectively swirls an image
// This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
// Each thread on the GPU will process exactly one pixel
// Before doing anything, we need to determine the current pixel we are calculating in this thread
// Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
// Test to see if we're testing a valid pixel
if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location
double cX = (double)width/2.0f;
double cY = (double)height/2.0f;
double relY = cY-i;
double relX = j-cX;
// relX and relY are points in our UV space
// Calculate the angle our points are relative to UV origin. Everything is in radians.
double originalAngle;
if (relX != 0)
{
originalAngle = atan(abs(relY)/abs(relX));
if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
else if (relX <=0 && relY <0) originalAngle += C_PI;
}
else
{
// Take care of rare special case
if (relY >= 0) originalAngle = 0.5f * C_PI;
else originalAngle = 1.5f * C_PI;
}
// Calculate the distance from the center of the UV using pythagorean distance
double radius = sqrt(relX*relX + relY*relY);
// Use any equation we want to determine how much to rotate image by
//double newAngle = originalAngle + factor*radius; // a progressive twist
double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
// Transform source UV coordinates back into bitmap coordinates
int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
srcX += cX;
srcY += cY;
srcY = height - srcY;
// Clamp the source to legal image pixel
if (srcX < 0) srcX = 0;
else if (srcX >= width) srcX = width-1;
if (srcY < 0) srcY = 0;
else if (srcY >= height) srcY = height-1;
// Set the pixel color
// Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);
// Process command-line arguments
if (argc > 1)
{
if (checkCmdLineFlag(argc, (const char **) argv, "input"))
{
getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
}
else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
{
printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
}
runTest(argc, argv);
cudaDeviceReset();
printf("%s completed",
sampleName);
//exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
{
printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE);
}
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
//Do the Convolution
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
checkCudaErrors(cudaDeviceSynchronize());
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
cudaFree(device_array_Result);
cudaFree(device_array_Image);
cudaFree(device_array_kernel);
free(hData);
free(imagePath);
//free(host_array_Image);
free(host_array_kernel);
free(hOutputDataSharp);
//free(hOutputImage);
//free(hOutputKernel);
}
Your code is writing in the source image:您的代码正在源图像中写入:
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
which writes to device_array_Image
which is the source, not the destination you are expecting results in.它写入
device_array_Image
这是源,而不是您期望的目标结果。
Moreover, I am very curious on the output of printf("DResult : '%.8f'\\n",device_array_Result);
此外,我对
printf("DResult : '%.8f'\\n",device_array_Result);
的输出很好奇printf("DResult : '%.8f'\\n",device_array_Result);
as device_array_Result
is in GPU address space and allocated with cudaMalloc
.因为
device_array_Result
在 GPU 地址空间中并使用cudaMalloc
分配。 On which device are you running ?你在哪个设备上运行?
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.