简体   繁体   中英

how to use fma on gpu v100?

I've tried the example of FMA on https://docs.nvidia.com/cuda/floating-point/index.html

union  {
    float  f;
    unsigned  int  i
} a,  b;
float  r;

a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f  * a.f  + b.f;

printf("a %.8g\n", a.f); 
printf("b %.8g\n", b.f); 
printf("r %.8g\n",   r);

However, I got 0 on gpu. My test.cu is shown as below:

#include <stdio.h>
#include <iostream>
using namespace std;
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

#define BLOCKS 1 //useless
#define TPB 3    //useless
#define TIMES 5  //useless

__global__ void test() {
    union  {
        float  f;
        unsigned  int  i;
    } a,  b;
    float  r;

    a.i = 0x3F800001;
    b.i = 0xBF800002;
    r = a.f  * a.f  + b.f;
    printf("a %.30g\n", a.f);
    printf("b %.30g\n", b.f);
    printf("r %.30g\n",   r);
}

int main() {
    float *devResults; //useless

    CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * TPB * TIMES *
              sizeof(float)));

    CUDA_CALL(cudaMemset(devResults, 0, BLOCKS * TPB * TIMES *
              sizeof(float)));
    test<<<1, 1>>>();
    CUDA_CALL(cudaFree(devResults));
    return 0;
}

I compiled test.cu using: nvcc test.cu --fmad=true

When I call 'fma' function, it works. However, it is supposed to work without 'fma'.

Your interpretation of what is happening is incorrect. Before looking at your code in details, look at my version of what you are trying to do (let's leave aside the issue of the union which is technically undefined behaviour):

#include <stdio.h>

typedef union  {
        float  f;
        unsigned  int  i;
       } bodge;

__global__ void test(unsigned int x, unsigned int y, float* out, bool dowrite) {

    bodge a, b;
    a.i = x;
    b.i = y;

    float r = a.f  * a.f  + b.f;
    printf("a %.30g\n", a.f);
    printf("b %.30g\n", b.f);
    printf("r %.30g\n",   r);
    if (dowrite) *out = r;
}

int main() {
    test<<<1, 1>>>(0x3F800001, 0xBF800002, (float*)0, false);
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}

which compiles and runs like so on a Maxwell GPU:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 1.42108547152020037174224853516e-14

If we look at the assembler output for this, we clearly see an FMAD instruction at /*0058*/ :

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

    code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_52
        Function : _Z4testjjPfb
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                                /* 0x001c4400fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                        /* 0x4c98078000870001 */
        /*0010*/         {         IADD32I R1, R1, -0x8;                        /* 0x1c0fffffff870101 */
        /*0018*/                   F2F.F64.F32 R10, c[0x0][0x140];        }     /* 0x4ca8000005070b0a */
                                                                                /* 0x001fc400fe2007f5 */
        /*0028*/                   MOV R17, c[0x0][0x140];                      /* 0x4c98078005070011 */
        /*0030*/                   LOP.OR R2, R1, c[0x0][0x4];                  /* 0x4c47020000170102 */
        /*0038*/                   MOV32I R4, 0x0;                              /* 0x010000000007f004 */
                                                                                /* 0x001fc800fc2007f1 */
        /*0048*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
        /*0050*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
        /*0058*/                   FFMA R17, R17, R17, c[0x0][0x144];           /* 0x5180088005171111 */
                                                                                /* 0x0023c800ffe007f1 */
        /*0068*/                   LOP32I.AND R16, R2, 0xffffff;                /* 0x04000ffffff70210 */
        /*0070*/                   MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*0078*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
                                                                                /* 0x001c4400fe000ffd */
        /*0088*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
        /*0090*/         {         MOV32I R4, 0x0;                              /* 0x010000000007f004 */
        /*0098*/                   F2F.F64.F32 R10, c[0x0][0x144];        }     /* 0x4ca8000005170b0a */
                                                                                /* 0x001ffc00fe2007f1 */
        /*00a8*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
        /*00b0*/                   MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*00b8*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
                                                                                /* 0x001fc001ffa008f2 */
        /*00c8*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
        /*00d0*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
        /*00d8*/         {         MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*00e8*/                   F2F.F64.F32 R10, R17;        }               /* 0x001fc400fe200711 */
                                                                                /* 0x5ca8000001170b0a */
        /*00f0*/                   MOV32I R4, 0x0;                              /* 0x010000000007f004 */
        /*00f8*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
                                                                                /* 0x003ff4011e4007ff */
        /*0108*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
        /*0110*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
        /*0118*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
                                                                                /* 0x003fb401e3a0071f */
        /*0128*/                   LDC.U8 R0, c[0x0][0x150];                    /* 0xef9000001507ff00 */
        /*0130*/                   I2I.S16.S8 R0, R0;                           /* 0x5ce0000000073100 */
        /*0138*/                   LOP.AND.NZ P0, RZ, R0, 0xff;                 /* 0x384030000ff700ff */
                                                                                /* 0x001fc800fe2007fd */
        /*0148*/              @!P0 EXIT;                                        /* 0xe30000000008000f */
        /*0150*/                   MOV R2, c[0x0][0x148];                       /* 0x4c98078005270002 */
        /*0158*/                   MOV R3, c[0x0][0x14c];                       /* 0x4c98078005370003 */
                                                                                /* 0x001ffc00ffe000f1 */
        /*0168*/                   STG.E [R2], R17;                             /* 0xeedc200000070211 */
        /*0170*/                   EXIT;                                        /* 0xe30000000007000f */
        /*0178*/                   BRA 0x178;                                   /* 0xe2400fffff87000f */
        .......................

So now let's look at your kernel code:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 0

Different result. Dissassembly shows why:

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

    code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_52
        Function : _Z5test0v
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                           /* 0x001fc400fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                   /* 0x4c98078000870001 */
        /*0010*/                   IADD32I R1, R1, -0x8;                   /* 0x1c0fffffff870101 */
        /*0018*/                   LOP.OR R2, R1, c[0x0][0x4];             /* 0x4c47020000170102 */
                                                                           /* 0x001fc400fe2007f1 */
        /*0028*/                   MOV32I R10, 0x20000000;                 /* 0x010200000007f00a */
        /*0030*/                   MOV32I R11, 0x3ff00000;                 /* 0x0103ff000007f00b */
        /*0038*/                   MOV32I R4, 0x0;                         /* 0x010000000007f004 */
                                                                           /* 0x001fc000fe4007e2 */
        /*0048*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
        /*0050*/                   LOP32I.AND R16, R2, 0xffffff;           /* 0x04000ffffff70210 */
        /*0058*/         {         MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*0068*/                   STL.64 [R16], R10;        }             /* 0x003ff400fec000f1 */
                                                                           /* 0xef5500000007100a */
        /*0070*/                   MOV R6, R2;                             /* 0x5c98078000270006 */
        /*0078*/                   JCAL 0x0;                               /* 0xe220000000000040 */
                                                                           /* 0x001fc000fe4007f1 */
        /*0088*/                   MOV32I R10, 0x40000000;                 /* 0x010400000007f00a */
        /*0090*/                   MOV32I R11, 0xbff00000;                 /* 0x010bff000007f00b */
        /*0098*/         {         MOV32I R4, 0x0;                         /* 0x010000000007f004 */
        /*00a8*/                   STL.64 [R16], R10;        }             /* 0x001fc400fe2000f1 */
                                                                           /* 0xef5500000007100a */
        /*00b0*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
        /*00b8*/                   MOV R6, R2;                             /* 0x5c98078000270006 */
                                                                           /* 0x001fc001ffa007e6 */
        /*00c8*/                   MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*00d0*/                   JCAL 0x0;                               /* 0xe220000000000040 */
        /*00d8*/         {         MOV R6, R2;                             /* 0x5c98078000270006 */
        /*00e8*/                   STL.64 [R16], RZ;        }              /* 0x001fc400fe2000f1 */
                                                                           /* 0xef550000000710ff */
        /*00f0*/                   MOV32I R4, 0x0;                         /* 0x010000000007f004 */
        /*00f8*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
                                                                           /* 0x001ffc01ffa007f6 */
        /*0108*/                   MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*0110*/                   JCAL 0x0;                               /* 0xe220000000000040 */
        /*0118*/                   EXIT;                                   /* 0xe30000000007000f */
                                                                           /* 0x001f8000fc0007ff */
        /*0128*/                   BRA 0x120;                              /* 0xe2400fffff07000f */
        /*0130*/                   NOP;                                    /* 0x50b0000000070f00 */
        /*0138*/                   NOP;                                    /* 0x50b0000000070f00 */
        ....................

You can see that there are no floating point instructions at all. Why? Because the compiler has determined that everything within the kernel is constant and it is safe to pre-calculate the result and substitute that into the emitted code. And I would hypothesize that the computations were done in a higher precision, which is why the result is 0 (or there is an internal compiler bug).

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM