简体   繁体   中英

How to bind different kinds of textures to a texture reference in CUDA?

This piece of code works on CUDA 4.2:

// Texture reference declared with an int element type.
extern "C" texture<int,1,cudaReadModeElementType> __tex0;
// Kernel that reads a float4 through the int texture reference by casting the
// address of __tex0 to a float4 texture reference type before fetching.
extern "C" __global__ void kernel(){
  // ii_z is the fetch index; its definition is not part of this snippet.
  float4 f = tex1Dfetch(*(texture<float4,1,cudaReadModeElementType>*)&__tex0,ii_z);
}

Since CUDA has changed its grammar, I can no longer fetch different kinds of textures through a single texture reference. Any ideas?

PS: I've found the CUDA texture object API in the reference, but it would be a lot of work to change all occurrences. Is there a better solution that needs only minor code changes?

Thanks

If anyone wants the original code, please click here.

It seems like the minimum repro case for this is:

texture<int,1,cudaReadModeElementType> __tex0; // texture reference declared with int elements

__global__ void kernel0(float4 *out) // minimal repro: reads float4 values through the int texture reference
{
    int t__a = blockIdx.x*blockDim.x+threadIdx.x; // flat thread index across the 1D grid
    int ii = (t__a*3); // fetch index: three times the thread index
    float4 rr = tex1Dfetch(*(texture<float4,1,cudaReadModeElementType>*)&__tex0,ii); // takes the address of __tex0 to cast it; this is the line CUDA 7.5 rejects
    out[t__a] = rr; // unguarded store of one float4 per thread
}

CUDA 7.5 will fail to compile this kernel with an error:

texture_repo.cu(7): error: cannot take address of texture/surface variable "__tex0" in __device__/__global__ functions

I believe this is correct. Texture references are opaque placeholder types which don't have any of the usual properties of POD types and I would be very suspicious about ever writing code like the example you provided a link to.

However, it is true that CUDA 4.2 will compile this and emit valid PTX:

// PTX emitted by CUDA 4.2 for kernel0(float4*); the only parameter is the output pointer.
.entry _Z7kernel0P6float4(
        .param .u64 _Z7kernel0P6float4_param_0
)
{
        .reg .f32       %f<25>;
        .reg .s32       %r<8>;
        .reg .s64       %rl<5>;


        // Load the output pointer parameter and convert it to a global-space address.
        ld.param.u64    %rl1, [_Z7kernel0P6float4_param_0];
        cvta.to.global.u64      %rl2, %rl1;
        .loc 2 5 1
        // %r5 = %ntid.x * %ctaid.x + %tid.x, i.e. the source's blockIdx.x*blockDim.x+threadIdx.x.
        mov.u32         %r2, %ntid.x;
        mov.u32         %r3, %ctaid.x;
        mov.u32         %r4, %tid.x;
        mad.lo.s32      %r5, %r2, %r3, %r4;
        .loc 2 6 1
        // %r1 = %r5 * 3, the index used for the texture fetch.
        mul.lo.s32      %r1, %r5, 3;
        // %r6 is set to 0 here and not read again in this listing.
        mov.u32         %r6, 0;
        // 1D texture fetch from __tex0 with an s32 index, producing four f32 components.
        // inline asm
        tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [__tex0, {%r1}];
        // inline asm
        .loc 2 8 1
        // Destination address = base + %r5 * 16 bytes, then one 4 x f32 vector store.
        mul.wide.s32    %rl3, %r5, 16;
        add.s64         %rl4, %rl2, %rl3;
        st.global.v4.f32        [%rl4], {%f1, %f2, %f3, %f4};
        .loc 2 9 2
        ret;
}

The cast apparently has no effect other than suppressing a compiler error, and at a PTX level the read works because texture reference reads always return a four wide vector type, even if the extra vector elements are empty and ignored. I would regard the fact this compiles in CUDA 4.2 as a compiler bug, and it would seem that CUDA 7.5 is correct in this case.

That said, a very hacky work-around would be to do this:

// Texture reference declared with int elements; tex_load0 below names it
// directly inside its inline PTX instead of going through tex1Dfetch.
texture<int,1,cudaReadModeElementType> __tex0;

// Fetches one four-component float texel from the texture reference __tex0
// at integer index idx, using a raw PTX tex instruction rather than
// tex1Dfetch. The texture name is hard-coded in the asm string, so this
// helper only ever reads __tex0.
__device__ float4 tex_load0(int idx)
{
    // One scalar output register per returned component.
    float cx, cy, cz, cw;
    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [__tex0, {%4}];"
        : "=f"(cx), "=f"(cy), "=f"(cz), "=f"(cw)
        : "r"(idx));
    return make_float4(cx, cy, cz, cw);
}

// Each thread fetches the texel at three times its flat thread index via
// tex_load0 and writes it to its own slot of out. There is no bounds guard:
// every launched thread stores to out, so out must have room for all of them.
__global__ void kernel1(float4 *out)
{
    const int slot = blockIdx.x*blockDim.x + threadIdx.x;
    out[slot] = tex_load0(slot*3);
}

[DISCLAIMER: compiled but never tested. Not recommended. Use at own risk].

i.e., insert the same PTX emitted inline by the CUDA 4.2 compiler into a device function, and replace the texture fetches with calls to the device function. With the CUDA 7.5 toolchain, this emits:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

.version 4.3
.target sm_30
.address_size 64

    // .globl   _Z9tex_load0i
.global .texref __tex0;

// PTX for tex_load0(int): takes one 32-bit parameter and returns 16 bytes through func_retval0.
.visible .func  (.param .align 16 .b8 func_retval0[16]) _Z9tex_load0i(
    .param .b32 _Z9tex_load0i_param_0
)
{
    .reg .f32   %f<5>;
    .reg .b32   %r<2>;


    // Load the index argument.
    ld.param.u32    %r1, [_Z9tex_load0i_param_0];
    // The hand-written texture fetch from the asm() statement, passed through unchanged.
    // inline asm
    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [__tex0, {%r1}];
    // inline asm
    // Store the four fetched components into the return value at byte offsets 0, 4, 8 and 12.
    st.param.f32    [func_retval0+0], %f1;
    st.param.f32    [func_retval0+4], %f2;
    st.param.f32    [func_retval0+8], %f3;
    st.param.f32    [func_retval0+12], %f4;
    ret;
}

    // .globl   _Z7kernel1P6float4
// PTX for kernel1(float4*); the texture fetch appears directly in the kernel body here.
.visible .entry _Z7kernel1P6float4(
    .param .u64 _Z7kernel1P6float4_param_0
)
{
    .reg .f32   %f<5>;
    .reg .b32   %r<6>;
    .reg .b64   %rd<5>;


    // Load the output pointer parameter and convert it to a global-space address.
    ld.param.u64    %rd1, [_Z7kernel1P6float4_param_0];
    cvta.to.global.u64  %rd2, %rd1;
    // %r5 = %ntid.x * %ctaid.x + %tid.x, the flat thread index.
    mov.u32     %r2, %ctaid.x;
    mov.u32     %r3, %ntid.x;
    mov.u32     %r4, %tid.x;
    mad.lo.s32  %r5, %r3, %r2, %r4;
    // %r1 = %r5 * 3, the texture fetch index.
    mul.lo.s32  %r1, %r5, 3;
    // Destination address = base + %r5 * 16 bytes.
    mul.wide.s32    %rd3, %r5, 16;
    add.s64     %rd4, %rd2, %rd3;
    // Same tex instruction as in the CUDA 4.2 listing: 1D fetch from __tex0 with an s32 index.
    // inline asm
    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [__tex0, {%r1}];
    // inline asm
    // One 4 x f32 vector store of the fetched components.
    st.global.v4.f32    [%rd4], {%f1, %f2, %f3, %f4};
    ret;
}

which is the same PTX as the CUDA 4.2 toolchain emitted. This works because the compiler can't apply nearly the same level of type safety checking to inline PTX. But think hard about whether you really want to do this, because it is (in my opinion) undefined behaviour.

Also note that because of the way texture references are handled in PTX, you can't pass them as explicit arguments, so you will require defining one read function per texture in your code.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM