[英]CUDA C - how to use Texture2D for double precision floating point
I want to use texture 2D memory for double precision.我想将纹理 2D 内存用于双精度。 I want to read from texture to shared memory and convert int2 to double, and then transfer back to host memory But I am getting only first row as desired and all other row's value is 2.00000000.我想从纹理读取到共享内存并将 int2 转换为双精度,然后传输回主机内存但我只根据需要获取第一行,所有其他行的值为 2.00000000。
#include<stdio.h>
#include<cuda.h>
#define Xdim 8
#define Ydim 8
texture<int2,2>me_texture;
static __inline__ __device__ double fetch_double(int2 p){
return __hiloint2double(p.y, p.x);
}
__global__ void kern(double *o, int pitch){
__shared__ double A[Xdim][Ydim];
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
int2 jj;
if(i<Xdim && j<Ydim){
jj = tex2D(me_texture, i, j);
A[threadIdx.x][threadIdx.y] = fetch_double(jj);
}
__syncthreads();
if(i<Xdim && j<Ydim){
o[j*Xdim + i] = A[threadIdx.x][threadIdx.y];
}
}
int main(int argc, char *argv[]){
double hbuf[Xdim][Ydim];
double hout[Xdim][Ydim];
double *dob;
double *dbuf;
size_t pitch_bytes;
cudaMallocPitch((void**)&dbuf, &pitch_bytes, sizeof(double)*Xdim, Ydim);
cudaMallocPitch((void**)&dob, &pitch_bytes, sizeof(double)*Xdim, Ydim);
hbuf[0][0] = 1.234567891234567;
hbuf[0][1] = 12.34567891234567;
hbuf[0][2] = 123.4567891234567;
hbuf[0][3] = 1234.567891234567;
hbuf[0][4] = 12345.67891234567;
hbuf[0][5] = 123456.7891234567;
hbuf[0][6] = 1234567.891234567;
hbuf[0][7] = 12345678.91234567;
hbuf[1][0] = 123456789.1234567;
hbuf[1][1] = 1234567891.234567;
hbuf[1][2] = 12345678912.34567;
hbuf[1][3] = 123456789123.4567;
hbuf[1][4] = 1234567891234.567;
hbuf[1][5] = 12345678912345.67;
hbuf[1][6] = 123456789123456.7;
hbuf[1][7] = 1234567891234567;
hbuf[2][0] = 123456789.7654321;
hbuf[2][1] = 1234567897.654321;
hbuf[2][2] = 12345678976.54321;
hbuf[2][3] = 123456789765.4321;
hbuf[2][4] = 1234567897654.321;
hbuf[2][5] = 12345678976543.21;
hbuf[2][6] = 123456789765432.1;
hbuf[2][7] = 1234567897654321;
hbuf[3][0] = 9.876543211234567;
hbuf[3][1] = 98.76543211234567;
hbuf[3][2] = 987.6543211234567;
hbuf[3][3] = 9876.543211234567;
hbuf[3][4] = 98765.43211234567;
hbuf[3][5] = 987654.3211234567;
hbuf[3][6] = 9876543.211234567;
hbuf[3][7] = 98765432.11234567;
hbuf[4][0] = 987654321.1234567;
hbuf[4][1] = 9876543211.234567;
hbuf[4][2] = 98765432112.34567;
hbuf[4][3] = 987654321123.4567;
hbuf[4][4] = 9876543211234.567;
hbuf[4][5] = 98765432112345.67;
hbuf[4][6] = 987654321123456.7;
hbuf[4][7] = 9876543211234567;
hbuf[5][0] = 987654321.7654321;
hbuf[5][1] = 9876543217.654321;
hbuf[5][2] = 98765432176.54321;
hbuf[5][3] = 987654321765.4321;
hbuf[5][4] = 9876543217654.321;
hbuf[5][5] = 98765432176543.21;
hbuf[5][6] = 987654321765432.1;
hbuf[5][7] = 9876543217654321;
hbuf[6][0] = 1234567891234567;
hbuf[6][1] = 123456789123456.7;
hbuf[6][2] = 12345678912345.67;
hbuf[6][3] = 1234567891234.567;
hbuf[6][4] = 123456789123.4567;
hbuf[6][5] = 12345678912.34567;
hbuf[6][6] = 1234567891.234567;
hbuf[6][7] = 123456789.1234567;
hbuf[7][0] = 12345678.91234567;
hbuf[7][1] = 1234567.891234567;
hbuf[7][2] = 123456.7891234567;
hbuf[7][3] = 12345.67891234567;
hbuf[7][4] = 1234.567891234567;
hbuf[7][5] = 123.4567891234567;
hbuf[7][6] = 12.34567891234567;
hbuf[7][7] = 1.234567891234567;
for (int i=0; i<Xdim; i++){
for(int j=0; j<Ydim; j++){
printf("%.16f\t", hbuf[i][j]);
}
printf("\n");
}
cudaMemcpy2D(dbuf, pitch_bytes, hbuf, Xdim*sizeof(double), Xdim*sizeof(double), Ydim, cudaMemcpyHostToDevice);
me_texture.addressMode[0] = cudaAddressModeClamp;
me_texture.addressMode[1] = cudaAddressModeClamp;
me_texture.filterMode = cudaFilterModeLinear;
me_texture.normalized = false;
cudaBindTexture2D(0, me_texture, dbuf, cudaCreateChannelDesc(32,32,0,0, cudaChannelFormatKindSigned), Xdim, Ydim, pitch_bytes );
int pitch = pitch_bytes/sizeof(double);
kern<<<1, 64>>>(dob, pitch);
cudaMemcpy2D(hout,Xdim*sizeof(double), dob, pitch_bytes, Xdim*sizeof(double),Ydim, cudaMemcpyDeviceToHost);
printf("\nI am Fine\n");
for(int i = 0 ; i < Xdim ; i++){
for(int j=0; j<Ydim; j++){
printf("%.16f\t", hout[i][j]);
}
printf("\n");
}
cudaUnbindTexture(me_texture);
cudaFree(dbuf);
cudaFree(dob);
return 0;
}
Above code work fine if you change the following things.如果您更改以下内容,上面的代码可以正常工作。 Replace代替
kern<<<1, 64>>>(..., ..)
to到
dim3 blockPerGrid(1, 1)
dim3 threadPerBlock(8, 8)
kern<<<blockPerGrid, threadPerBlock>>>(....)
here in place of Xdim change it to pitch在这里代替Xdim将其更改为音调
o[j*pitch + i] = A[threadIdx.x][threadIdx.y];
And change cudaFilterModeLinear to cudaFilterModePoint .并将cudaFilterModeLinear更改为cudaFilterModePoint 。 For the compilation you need to specify the computing capability, suppose your compute capability ie 3.0 then it would be对于编译你需要指定计算能力,假设你的计算能力即 3.0 那么它将是
nvcc -arch=sm_30 file.cu
If your code contained error checking, you would realise that your kernel launch is failing with an invalid filter mode.如果您的代码包含错误检查,您会意识到您的内核启动失败,并且过滤器模式无效。 It isn't legal in CUDA to use a cudaFilterModeLinear
with non-float types, so nothing is actually running.在 CUDA 中使用非浮点类型的cudaFilterModeLinear
是不合法的,所以实际上没有任何东西在运行。 If you change the filter mode to cudaFilterModePoint
, you might find things start working.如果您将过滤器模式更改为cudaFilterModePoint
,您可能会发现事情开始起作用了。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.