简体   繁体   English

错误的结果在CUDA

[英]wrong results in cuda

I try to code a simple example with CUDA C; I am following a screencast about this, but I get a wrong result 我尝试使用CUDA C编写一个简单的示例，我跟着一个视频教程做的，但结果有误

this is an example: 这是一个例子:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE    1024

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Launched as a single block, so threadIdx.x is used as the global index.
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    const int idx = threadIdx.x;

    // Guard against threads past the end of the vectors.
    if (idx >= n)
        return;

    c[idx] = a[idx] + b[idx];
}

/*
 * Adds two SIZE-element integer vectors on the GPU and prints the first
 * ten results.  Launches ONE block of SIZE threads, so SIZE must not
 * exceed the device's maximum threads per block (512 on compute
 * capability 1.x, 1024 on 2.x and later).
 *
 * Fixes vs. the original:
 *  - removed the stray "enter code here" text, which did not compile;
 *  - the launch error is now actually fetched with cudaGetLastError()
 *    (the original re-tested a stale cudaStatus, so launch failures
 *    were silently ignored and c stayed all zeros);
 *  - execution errors are surfaced with cudaDeviceSynchronize();
 *  - the program exits early if no usable device is found.
 */
int main()
{
    int *a, *b, *c;        /* host vectors   */
    int *d_a, *d_b, *d_c;  /* device vectors */
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");
        return 1;  /* nothing below can work without a device */
    }

    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));

    /* a[i] + b[i] == 2*i, so the expected output is 0, 2, 4, ... */
    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);

    /* A kernel launch returns no status itself: configuration errors
       (e.g. SIZE > max threads per block) must be fetched with
       cudaGetLastError(), and in-kernel faults surface at the next
       synchronizing call. */
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d\n", i, c[i]);

    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

the result is : 结果是:

c[0]=0
c[1]=0
c[2]=0
c[3]=0
c[4]=0
c[5]=0
c[6]=0
c[7]=0
c[8]=0
c[9]=0

but I expect this result : 但我期望这个结果:

c[0]=0
c[1]=2
c[2]=4
c[3]=6
c[4]=8
c[5]=10
c[6]=12
c[7]=14
c[8]=16
c[9]=18

please any one can help about this ! 请任何人都可以帮忙!

I made some wrong comments, so I will try to fix my errors and give a correct answer here. 我的评论有误，因此我将尝试修正错误并在此处给出正确的答案。 First of all, please see the comments related to proper CUDA error checking. 首先，请参阅与正确CUDA错误检查有关的评论。

Second, the Maximum Thread Block Size for a GT210 (CC 1.2) is 512, not 256 as I commented in a moment of confusion. 其次,GT210(CC 1.2)的最大线程块大小为512,而不是我在混乱中评论的256。

That said, You should get the following error by doing the mentioned error checking: 就是说,您应该通过执行上述错误检查来得到以下错误:

GPUassert: invalid device function 

In this case, this error indicates the architecture for which you have compiled your code is higher than the one you are using to run the example. 在这种情况下,此错误表明您为其编译代码的体系结构高于用于运行示例的体系结构。 You are compiling the example for devices of compute capability = 2.0 or above (as you commented), but then you execute the code in your GT210 which has a compute capability = 1.2 . 您正在编译compute capability = 2.0或更高(如您所评论)的设备的示例,但是随后您在GT210中执行了compute capability = 1.2

So, first, re-compile your example for the corresponding architecture. 因此,首先,为相应的体系结构重新编译示例。 Change the 改变

-gencode arch=compute_20,code=sm_20 TO -gencode arch=compute_12,code=sm_12

Once you have successfully compiled the example for your architecture, you will get the following error (because you ALREADY are doing proper error checking ;) 一旦你成功的编译了您架构的例子,你会得到以下错误(因为你已经在做正确的错误检查 ;)

GPUassert: invalid configuration argument 

In this case, the error indicates that you are using more resources than the ones available for your architecture (compute capability 1.2) because you are trying to launch blocks of SIZE = 1024 but the Maximum Thread Block Size is 512 , that is, you can not configure a block with more than 512 threads. 在这种情况下,该错误表明您正在使用的资源比体系结构可用的资源(计算能力1.2)更​​多,因为您正在尝试启动SIZE = 1024块,但最大线程块大小为512 ,也就是说,您可以不能用超过512个线程配置一个块。

So, adjust the SIZE to 512 and everything should work as expected. 因此,将SIZE调整为512,一切都会按预期工作。 Below is your example, doing proper CUDA error checking . 下面是您的示例,进行适当的CUDA错误检查

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <windows.h>
/* 512 is the Maximum Thread Block Size on compute capability 1.x
   devices (such as the GT210 discussed above); the kernel is launched
   as a single block of SIZE threads, so a larger value fails with
   "invalid configuration argument".  The surrounding answer says to
   use 512, but the posted code still had 1024. */
#define SIZE    512

/* Wraps a CUDA runtime call so failures are reported with their call site. */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/* Prints a human-readable message for a failed CUDA call and, unless
   abort is false, terminates the process using the error code as the
   exit status.  A cudaSuccess code is a no-op. */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
      return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort)
      exit(code);
}

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Launched as a single block, so threadIdx.x is used as the global index.
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    const int idx = threadIdx.x;

    // Guard against threads past the end of the vectors.
    if (idx >= n)
        return;

    c[idx] = a[idx] + b[idx];
}

/*
 * Adds two SIZE-element integer vectors on the GPU and prints the first
 * ten results, with every CUDA runtime call checked through gpuErrchk.
 * Launches ONE block of SIZE threads, so SIZE must not exceed the
 * device's Maximum Thread Block Size (512 on compute capability 1.x).
 *
 * Fixes vs. the original: exit early when cudaSetDevice fails instead
 * of falling through to calls that would fail confusingly later;
 * removed the leftover "enter code here" placeholder comment; check
 * the cudaFree calls like every other runtime call.
 */
int main()
{
    int *a, *b, *c;        /* host vectors   */
    int *d_a, *d_b, *d_c;  /* device vectors */
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");
        return 1;  /* nothing below can work without a device */
    }

    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    gpuErrchk( cudaMalloc(&d_a, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_b, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_c, SIZE*sizeof(int)) );

    /* a[i] + b[i] == 2*i, so the expected output is 0, 2, 4, ... */
    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    gpuErrchk( cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice) );

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    gpuErrchk( cudaPeekAtLastError() );    /* launch-configuration errors   */
    gpuErrchk( cudaDeviceSynchronize() );  /* asynchronous execution errors */

    gpuErrchk( cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d\n", i, c[i]);

    free(a);
    free(b);
    free(c);
    gpuErrchk( cudaFree(d_a) );
    gpuErrchk( cudaFree(d_b) );
    gpuErrchk( cudaFree(d_c) );

    return 0;
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM