[英]cudaMemcpyToSymbol just hangs and never returns. GPU processing at 100%. Code works fine on K40 but not on V100
[英]how to use fma on gpu v100?
我已經在https://docs.nvidia.com/cuda/floating-point/index.html上嘗試了 FMA 的示例
union {
float f;
unsigned int i
} a, b;
float r;
a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f * a.f + b.f;
printf("a %.8g\n", a.f);
printf("b %.8g\n", b.f);
printf("r %.8g\n", r);
但是,我在 gpu 上得到了 0。 我的 test.cu 如下所示:
#include <stdio.h>
#include <iostream>
using namespace std;
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
return EXIT_FAILURE;}} while(0)
#define BLOCKS 1 //useless
#define TPB 3 //useless
#define TIMES 5 //useless
__global__ void test() {
union {
float f;
unsigned int i;
} a, b;
float r;
a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f * a.f + b.f;
printf("a %.30g\n", a.f);
printf("b %.30g\n", b.f);
printf("r %.30g\n", r);
}
int main() {
float *devResults; //useless
CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * TPB * TIMES *
sizeof(float)));
CUDA_CALL(cudaMemset(devResults, 0, BLOCKS * TPB * TIMES *
sizeof(float)));
test<<<1, 1>>>();
CUDA_CALL(cudaFree(devResults));
return 0;
}
我編譯 test.cu 使用: nvcc test.cu --fmad=true
當我調用“fma”function 時,它可以工作。 但是,它應該在沒有“fma”的情況下工作。
您對正在發生的事情的解釋是不正確的。 在詳細查看您的代碼之前,請先查看我對您正在嘗試執行的操作的版本(讓我們拋開聯合問題,這在技術上是未定義的行為):
#include <stdio.h>
typedef union {
float f;
unsigned int i;
} bodge;
__global__ void test(unsigned int x, unsigned int y, float* out, bool dowrite) {
bodge a, b;
a.i = x;
b.i = y;
float r = a.f * a.f + b.f;
printf("a %.30g\n", a.f);
printf("b %.30g\n", b.f);
printf("r %.30g\n", r);
if (dowrite) *out = r;
}
int main() {
test<<<1, 1>>>(0x3F800001, 0xBF800002, (float*)0, false);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
它在 Maxwell GPU 上像這樣編譯和運行:
$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 1.42108547152020037174224853516e-14
如果我們為此查看匯編程序 output,我們可以清楚地看到/*0058*/
處的 FMAD 指令:
$ cuobjdump -sass fmad
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_52
Function : _Z4testjjPfb
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ { IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ F2F.F64.F32 R10, c[0x0][0x140]; } /* 0x4ca8000005070b0a */
/* 0x001fc400fe2007f5 */
/*0028*/ MOV R17, c[0x0][0x140]; /* 0x4c98078005070011 */
/*0030*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc800fc2007f1 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0058*/ FFMA R17, R17, R17, c[0x0][0x144]; /* 0x5180088005171111 */
/* 0x0023c800ffe007f1 */
/*0068*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/* 0x001c4400fe000ffd */
/*0088*/ JCAL 0x0; /* 0xe220000000000040 */
/*0090*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*0098*/ F2F.F64.F32 R10, c[0x0][0x144]; } /* 0x4ca8000005170b0a */
/* 0x001ffc00fe2007f1 */
/*00a8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b0*/ MOV R6, R2; /* 0x5c98078000270006 */
/*00b8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/* 0x001fc001ffa008f2 */
/*00c8*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ F2F.F64.F32 R10, R17; } /* 0x001fc400fe200711 */
/* 0x5ca8000001170b0a */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x003ff4011e4007ff */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*0118*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x003fb401e3a0071f */
/*0128*/ LDC.U8 R0, c[0x0][0x150]; /* 0xef9000001507ff00 */
/*0130*/ I2I.S16.S8 R0, R0; /* 0x5ce0000000073100 */
/*0138*/ LOP.AND.NZ P0, RZ, R0, 0xff; /* 0x384030000ff700ff */
/* 0x001fc800fe2007fd */
/*0148*/ @!P0 EXIT; /* 0xe30000000008000f */
/*0150*/ MOV R2, c[0x0][0x148]; /* 0x4c98078005270002 */
/*0158*/ MOV R3, c[0x0][0x14c]; /* 0x4c98078005370003 */
/* 0x001ffc00ffe000f1 */
/*0168*/ STG.E [R2], R17; /* 0xeedc200000070211 */
/*0170*/ EXIT; /* 0xe30000000007000f */
/*0178*/ BRA 0x178; /* 0xe2400fffff87000f */
.......................
所以現在讓我們看看你的 kernel 代碼:
$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 0
結果不同。 反匯編顯示了原因:
$ cuobjdump -sass fmad
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_52
Function : _Z5test0v
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fc400fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/* 0x001fc400fe2007f1 */
/*0028*/ MOV32I R10, 0x20000000; /* 0x010200000007f00a */
/*0030*/ MOV32I R11, 0x3ff00000; /* 0x0103ff000007f00b */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc000fe4007e2 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0058*/ { MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0068*/ STL.64 [R16], R10; } /* 0x003ff400fec000f1 */
/* 0xef5500000007100a */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x001fc000fe4007f1 */
/*0088*/ MOV32I R10, 0x40000000; /* 0x010400000007f00a */
/*0090*/ MOV32I R11, 0xbff00000; /* 0x010bff000007f00b */
/*0098*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00a8*/ STL.64 [R16], R10; } /* 0x001fc400fe2000f1 */
/* 0xef5500000007100a */
/*00b0*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b8*/ MOV R6, R2; /* 0x5c98078000270006 */
/* 0x001fc001ffa007e6 */
/*00c8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ STL.64 [R16], RZ; } /* 0x001fc400fe2000f1 */
/* 0xef550000000710ff */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x001ffc01ffa007f6 */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ JCAL 0x0; /* 0xe220000000000040 */
/*0118*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0128*/ BRA 0x120; /* 0xe2400fffff07000f */
/*0130*/ NOP; /* 0x50b0000000070f00 */
/*0138*/ NOP; /* 0x50b0000000070f00 */
....................
可以看到根本沒有浮點指令。 為什么? 因為編譯器已經確定 kernel 中的所有內容都是常量,並且可以安全地預先計算結果並將其替換為發出的代碼。 而且我會假設計算是以更高的精度完成的,這就是結果為 0 的原因(或者存在內部編譯器錯誤)。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.