简体   繁体   English

CUDA C - 如何将 Texture2D 用于双精度浮点

[英]CUDA C - how to use Texture2D for double precision floating point

I want to use texture 2D memory for double precision.我想将纹理 2D 内存用于双精度。 I want to read from texture to shared memory and convert int2 to double, and then transfer back to host memory But I am getting only first row as desired and all other row's value is 2.00000000.我想从纹理读取到共享内存并将 int2 转换为双精度,然后传输回主机内存但我只根据需要获取第一行,所有其他行的值为 2.00000000。

#include<stdio.h>
#include<cuda.h>
#define Xdim 8
#define Ydim 8
texture<int2,2>me_texture;

static __inline__ __device__ double fetch_double(int2 p){
    return __hiloint2double(p.y, p.x);
}

__global__ void kern(double *o, int pitch){
    __shared__ double A[Xdim][Ydim];
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
    int2 jj;

    if(i<Xdim && j<Ydim){

        jj = tex2D(me_texture, i, j);

        A[threadIdx.x][threadIdx.y] = fetch_double(jj);
    }
    __syncthreads();

    if(i<Xdim && j<Ydim){
        o[j*Xdim + i] = A[threadIdx.x][threadIdx.y];
    }
}

int main(int argc, char *argv[]){
    double hbuf[Xdim][Ydim];
    double hout[Xdim][Ydim]; 
    double *dob; 
    double *dbuf;
    size_t pitch_bytes;

    cudaMallocPitch((void**)&dbuf, &pitch_bytes, sizeof(double)*Xdim, Ydim);
    cudaMallocPitch((void**)&dob, &pitch_bytes,  sizeof(double)*Xdim, Ydim);


    hbuf[0][0] = 1.234567891234567;
    hbuf[0][1] = 12.34567891234567;
    hbuf[0][2] = 123.4567891234567;
    hbuf[0][3] = 1234.567891234567;
    hbuf[0][4] = 12345.67891234567;
    hbuf[0][5] = 123456.7891234567;
    hbuf[0][6] = 1234567.891234567;
    hbuf[0][7] = 12345678.91234567;
    hbuf[1][0] = 123456789.1234567;
    hbuf[1][1] = 1234567891.234567;
    hbuf[1][2] = 12345678912.34567;
    hbuf[1][3] = 123456789123.4567;
    hbuf[1][4] = 1234567891234.567;
    hbuf[1][5] = 12345678912345.67;
    hbuf[1][6] = 123456789123456.7;
    hbuf[1][7] = 1234567891234567;
    hbuf[2][0] = 123456789.7654321;
    hbuf[2][1] = 1234567897.654321;
    hbuf[2][2] = 12345678976.54321;
    hbuf[2][3] = 123456789765.4321;
    hbuf[2][4] = 1234567897654.321;
    hbuf[2][5] = 12345678976543.21;
    hbuf[2][6] = 123456789765432.1;
    hbuf[2][7] = 1234567897654321;
    hbuf[3][0] = 9.876543211234567;
    hbuf[3][1] = 98.76543211234567;
    hbuf[3][2] = 987.6543211234567;
    hbuf[3][3] = 9876.543211234567;
    hbuf[3][4] = 98765.43211234567;
    hbuf[3][5] = 987654.3211234567;
    hbuf[3][6] = 9876543.211234567;
    hbuf[3][7] = 98765432.11234567;
    hbuf[4][0] = 987654321.1234567;
    hbuf[4][1] = 9876543211.234567;
    hbuf[4][2] = 98765432112.34567;
    hbuf[4][3] = 987654321123.4567;
    hbuf[4][4] = 9876543211234.567;
    hbuf[4][5] = 98765432112345.67;
    hbuf[4][6] = 987654321123456.7;
    hbuf[4][7] = 9876543211234567;
    hbuf[5][0] = 987654321.7654321;
    hbuf[5][1] = 9876543217.654321;
    hbuf[5][2] = 98765432176.54321;
    hbuf[5][3] = 987654321765.4321;
    hbuf[5][4] = 9876543217654.321;
    hbuf[5][5] = 98765432176543.21;
    hbuf[5][6] = 987654321765432.1;
    hbuf[5][7] = 9876543217654321;
    hbuf[6][0] = 1234567891234567;
    hbuf[6][1] = 123456789123456.7;
    hbuf[6][2] = 12345678912345.67;
    hbuf[6][3] = 1234567891234.567;
    hbuf[6][4] = 123456789123.4567;
    hbuf[6][5] = 12345678912.34567;
    hbuf[6][6] = 1234567891.234567;
    hbuf[6][7] = 123456789.1234567;
    hbuf[7][0] = 12345678.91234567;
    hbuf[7][1] = 1234567.891234567;
    hbuf[7][2] = 123456.7891234567;
    hbuf[7][3] = 12345.67891234567;
    hbuf[7][4] = 1234.567891234567;
    hbuf[7][5] = 123.4567891234567;
    hbuf[7][6] = 12.34567891234567;
    hbuf[7][7] = 1.234567891234567; 
    for (int i=0; i<Xdim; i++){
        for(int j=0; j<Ydim; j++){

            printf("%.16f\t", hbuf[i][j]);
        }
        printf("\n");
    }

    cudaMemcpy2D(dbuf, pitch_bytes, hbuf, Xdim*sizeof(double), Xdim*sizeof(double), Ydim, cudaMemcpyHostToDevice);

    me_texture.addressMode[0] = cudaAddressModeClamp;
    me_texture.addressMode[1] = cudaAddressModeClamp;
    me_texture.filterMode = cudaFilterModeLinear;
    me_texture.normalized = false;  

    cudaBindTexture2D(0, me_texture, dbuf, cudaCreateChannelDesc(32,32,0,0, cudaChannelFormatKindSigned), Xdim, Ydim, pitch_bytes ); 

    int pitch = pitch_bytes/sizeof(double);

    kern<<<1, 64>>>(dob, pitch);

    cudaMemcpy2D(hout,Xdim*sizeof(double), dob, pitch_bytes, Xdim*sizeof(double),Ydim,  cudaMemcpyDeviceToHost);

    printf("\nI am Fine\n");

    for(int i = 0 ; i < Xdim ; i++){
        for(int j=0; j<Ydim; j++){
            printf("%.16f\t", hout[i][j]);
        }
        printf("\n");
    }
    cudaUnbindTexture(me_texture);
    cudaFree(dbuf);
    cudaFree(dob);
    return 0;
}

Above code work fine if you change the following things.如果您更改以下内容,上面的代码可以正常工作。 Replace代替

kern<<<1, 64>>>(..., ..)

to

dim3 blockPerGrid(1, 1)
dim3 threadPerBlock(8, 8)
kern<<<blockPerGrid, threadPerBlock>>>(....)

here in place of Xdim change it to pitch在这里代替Xdim将其更改为音调

 o[j*pitch + i] = A[threadIdx.x][threadIdx.y]; 

And change cudaFilterModeLinear to cudaFilterModePoint .并将cudaFilterModeLinear更改为cudaFilterModePoint For the compilation you need to specify the computing capability, suppose your compute capability ie 3.0 then it would be对于编译你需要指定计算能力,假设你的计算能力即 3.0 那么它将是

 nvcc -arch=sm_30 file.cu 

If your code contained error checking, you would realise that your kernel launch is failing with an invalid filter mode.如果您的代码包含错误检查,您会意识到您的内核启动失败,并且过滤器模式无效。 It isn't legal in CUDA to use a cudaFilterModeLinear with non-float types, so nothing is actually running.在 CUDA 中使用非浮点类型的cudaFilterModeLinear是不合法的,所以实际上没有任何东西在运行。 If you change the filter mode to cudaFilterModePoint , you might find things start working.如果您将过滤器模式更改为cudaFilterModePoint ,您可能会发现事情开始起作用了。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM