
How to increase FLOPS achieved using CUDA

So I'm just getting into CUDA (I've been using C++ for several years, but I'm new to dealing with GPUs, so forgive my lack of experience). I'm working on programming a 3D n-body simulation on my computer with an NVIDIA GeForce GTX 860M graphics card. This card has a published peak theoretical FP32 performance of 1,389 GFLOPS ( https://www.techpowerup.com/gpu-specs/geforce-gtx-860m.c2536 ). I'm using the code below to figure out roughly how many "effective FLOPS" I can achieve, and currently I'm only getting 7.1 GFLOPS when using global memory and 5.1 GFLOPS when using shared memory. I was under the impression that shared memory was 100x faster than global memory, so why am I not seeing an increase in FLOPS?

Side note 1: I'm assuming that each thread in "cudaFunction" is doing approximately 100,000 floating point operations per kernel call. Therefore, (512 * 128) threads * (100,000) FP32 operations / (1.285) seconds ≈ 5.1 GFLOPS.

Side note 2: I realize I'm probably not measuring FLOPS correctly, but my goal is to maximize the number of floating point computations done across all my CUDA threads per unit time, hence I'm referring to that quantity as "effective FLOPS".

My second question is: what kind of effective FLOP rate can I expect to achieve, and what sort of optimizations can I implement in order to get my 5.1 GFLOPS closer to the published maximum? 0.37% of peak (5.1 GFLOPS / 1389 GFLOPS) seems quite low, so I'm assuming I'm hitting a bottleneck somewhere?

#include <cuda_runtime.h>
#include <iostream>
#include <time.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <iomanip>
#include <cuda.h>


#define numPtcls 512*128//Total number of particles
#define threadsPerBlock 128//Number of threads per block
#define BLOCKS numPtcls / threadsPerBlock//total number of blocks

using namespace std;

struct Particles {
    float testVariable;
};

//USING SHARED MEMORY
__global__ void cudaFunction(Particles *particle)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    float sum = 1;

    __shared__ float position;// Allocate shared memory

    position = particle[0].testVariable;

    for (int i = 0; i < 100000; i++) {
        sum *= position;
    }

    particle[0].testVariable = 1;
}

////USING GLOBAL MEMORY
//__global__ void cudaFunction(Particles *particle)
//{
//  int index = threadIdx.x + blockIdx.x * blockDim.x;
//  float sum = 1;
//
//  for (int i = 0; i < 100000; i++) {
//      sum *= particle[0].testVariable;
//  }
//
//  particle[0].testVariable = 1;
//}

int main()
{
    Particles *particle = new Particles[numPtcls];

    particle[0].testVariable = 1;

    Particles *device_location;//POINTER TO MEMORY FOR CUDA
    int size = numPtcls * sizeof(Particles);//SIZE OF PARTICLE DATA TO MAKE ROOM FOR IN CUDA
    cudaMalloc((void**)&device_location, size);// allocate device copies
    cudaMemcpy(device_location, particle, size, cudaMemcpyHostToDevice);// copy inputs to device

    clock_t start, end;
    double cpu_time_used;

    while (true) {

        start = clock();

        cudaFunction<<<BLOCKS, threadsPerBlock>>>(device_location);//CUDA CALL
        cudaMemcpy(particle, device_location, size, cudaMemcpyDeviceToHost);

        end = clock();

        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
        std::cout << fixed << setprecision(6) << cpu_time_used << std::endl;

    }

    cudaFree(device_location);//FREE DEVICE MEMORY
    delete[] particle;//FREE CPU MEMORY

    return 0;
}

TLDR: Performance measurement is hard to get right -- the code you use, how you compile it, and how you time it all matter.

There was quite a lot wrong with your attempt, at least:

  • Unless the result of the kernel loop participates in a memory write, compiler optimization will treat the floating point calculations as dead code and remove them
  • Unless you compile for release rather than debug, it is pointless to benchmark the code, because a debug build disables all compiler optimization
  • The use of shared memory is completely irrelevant in this example because the compiler will cache results in the loop in registers anyway, and there is no optimization in memory transaction patterns by using shared memory in this case
  • clock() measures CPU time, not wall-clock time, so it is invalid for timing asynchronous operations on the GPU, which do not consume CPU cycles
  • Your timing, as broken as it was, also included the memcpy time, which is not really valid if your objective is to measure FLOP/s within the kernel

Fixing all of the above gets me this:

$ cat floppy.cu 

#include <iostream>
#include <iomanip>
#include <cmath>
#include <limits>

#define numPtcls (512*128) //Total number of particles
#define threadsPerBlock (128) //Number of threads per block
#define BLOCKS numPtcls / threadsPerBlock//total number of blocks
#define niters (10000) // FMAD iterations per thread


struct Particles {
    float testVariable;
};

__global__ void cudaFunction(Particles *particle)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    float sum = 1;

    float position = particle[0].testVariable;

    for (int i = 0; i < niters; i++) {
        sum *= position;
    }

    particle[0].testVariable = sum;
}

int main()
{
    Particles *particle = new Particles[numPtcls];

    particle[0].testVariable = 1;

    Particles *device_location;//POINTER TO MEMORY FOR CUDA
    int size = numPtcls * sizeof(Particles);//SIZE OF PARTICLE DATA TO MAKE ROOM FOR IN CUDA
    cudaMalloc((void**)&device_location, size);// allocate device copies
    cudaMemcpy(device_location, particle, size, cudaMemcpyHostToDevice);// copy inputs to device

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float flopcount = float(niters) * float(numPtcls);

    for(int i=0; i<10; i++) {
        cudaEventRecord(start, 0);
        cudaFunction<<<BLOCKS, threadsPerBlock>>>(device_location);//CUDA CALL
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaMemcpy(particle, device_location, size, cudaMemcpyDeviceToHost);

        float gpu_time_used;
        cudaEventElapsedTime(&gpu_time_used, start, stop);
        std::cout << std::fixed << std::setprecision(6) << 1e-6 * (flopcount / gpu_time_used) << std::endl;
    }

    cudaFree(device_location);//FREE DEVICE MEMORY
    delete[] particle;//FREE CPU MEMORY

    return 0;
}

This is only a very modest modification of what you had: basically, store the result of the kernel loop to memory to defeat dead code removal, and use CUDA events to time the kernel.

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Fri_Feb__8_19:08:17_PST_2019
Cuda compilation tools, release 10.1, V10.1.105

$ nvcc -arch=sm_52 -std=c++11 -Xptxas="-v" -o floppy floppy.cu 
floppy.cu(18): warning: variable "index" was declared but never referenced

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z12cudaFunctionP9Particles' for 'sm_52'
ptxas info    : Function properties for _Z12cudaFunctionP9Particles
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 6 registers, 328 bytes cmem[0]


$ ./floppy 
1557.296000
1534.312192
1575.505792
1547.762944
1541.820288
1555.521792
1561.808896
1545.193856
1545.543680
1581.345152

This fairly naïve code runs in about 4ms and gets me about 1550 GFLOP/s on my GTX 970, which is about 40% of that device's peak of around 4000 GFLOP/s. The code emitted by the compiler is worth inspecting:

.version 6.4
.target sm_52
.address_size 64

    // .globl   _Z12cudaFunctionP9Particles

.visible .entry _Z12cudaFunctionP9Particles(
    .param .u64 _Z12cudaFunctionP9Particles_param_0
)
{
    .reg .pred  %p<2>;
    .reg .f32   %f<55>;
    .reg .b32   %r<5>;
    .reg .b64   %rd<3>;


    ld.param.u64    %rd2, [_Z12cudaFunctionP9Particles_param_0];
    cvta.to.global.u64  %rd1, %rd2;
    ld.global.f32   %f1, [%rd1];
    mov.f32     %f54, 0f3F800000;
    mov.u32     %r4, -10000;

BB0_1:
    mul.f32     %f5, %f1, %f54;
    mul.f32     %f6, %f1, %f5;
    mul.f32     %f7, %f1, %f6;
    mul.f32     %f8, %f1, %f7;
    mul.f32     %f9, %f1, %f8;
    mul.f32     %f10, %f1, %f9;
    mul.f32     %f11, %f1, %f10;
    mul.f32     %f12, %f1, %f11;
    mul.f32     %f13, %f1, %f12;
    mul.f32     %f14, %f1, %f13;
    mul.f32     %f15, %f1, %f14;
    mul.f32     %f16, %f1, %f15;
    mul.f32     %f17, %f1, %f16;
    mul.f32     %f18, %f1, %f17;
    mul.f32     %f19, %f1, %f18;
    mul.f32     %f20, %f1, %f19;
    mul.f32     %f21, %f1, %f20;
    mul.f32     %f22, %f1, %f21;
    mul.f32     %f23, %f1, %f22;
    mul.f32     %f24, %f1, %f23;
    mul.f32     %f25, %f1, %f24;
    mul.f32     %f26, %f1, %f25;
    mul.f32     %f27, %f1, %f26;
    mul.f32     %f28, %f1, %f27;
    mul.f32     %f29, %f1, %f28;
    mul.f32     %f30, %f1, %f29;
    mul.f32     %f31, %f1, %f30;
    mul.f32     %f32, %f1, %f31;
    mul.f32     %f33, %f1, %f32;
    mul.f32     %f34, %f1, %f33;
    mul.f32     %f35, %f1, %f34;
    mul.f32     %f36, %f1, %f35;
    mul.f32     %f37, %f1, %f36;
    mul.f32     %f38, %f1, %f37;
    mul.f32     %f39, %f1, %f38;
    mul.f32     %f40, %f1, %f39;
    mul.f32     %f41, %f1, %f40;
    mul.f32     %f42, %f1, %f41;
    mul.f32     %f43, %f1, %f42;
    mul.f32     %f44, %f1, %f43;
    mul.f32     %f45, %f1, %f44;
    mul.f32     %f46, %f1, %f45;
    mul.f32     %f47, %f1, %f46;
    mul.f32     %f48, %f1, %f47;
    mul.f32     %f49, %f1, %f48;
    mul.f32     %f50, %f1, %f49;
    mul.f32     %f51, %f1, %f50;
    mul.f32     %f52, %f1, %f51;
    mul.f32     %f53, %f1, %f52;
    mul.f32     %f54, %f1, %f53;
    add.s32     %r4, %r4, 50;
    setp.ne.s32 %p1, %r4, 0;
    @%p1 bra    BB0_1;

    st.global.f32   [%rd1], %f54;
    ret;
}

You can see that the compiler has unrolled the loop into a long stream of single precision mul instructions, which retire at the rate of 1 per clock cycle, or 1 FLOP per core per clock cycle. Note that if you change your kernel to this:

__global__ void cudaFunction(Particles *particle)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    float sum = 1;

    float position = particle[0].testVariable;

    for (int i = 0; i < niters; i++) {
        sum += sum * position;
    }

    particle[0].testVariable = sum;
}

The compiler will emit this:

.version 6.4
.target sm_52
.address_size 64

    // .globl   _Z12cudaFunctionP9Particles

.visible .entry _Z12cudaFunctionP9Particles(
    .param .u64 _Z12cudaFunctionP9Particles_param_0
)
{
    .reg .pred  %p<2>;
    .reg .f32   %f<45>;
    .reg .b32   %r<5>;
    .reg .b64   %rd<5>;


    ld.param.u64    %rd2, [_Z12cudaFunctionP9Particles_param_0];
    cvta.to.global.u64  %rd1, %rd2;
    ld.global.f32   %f1, [%rd1];
    mov.f32     %f44, 0f3F800000;
    mov.u32     %r4, -10000;

BB0_1:
    fma.rn.f32  %f5, %f1, %f44, %f44;
    fma.rn.f32  %f6, %f1, %f5, %f5;
    fma.rn.f32  %f7, %f1, %f6, %f6;
    fma.rn.f32  %f8, %f1, %f7, %f7;
    fma.rn.f32  %f9, %f1, %f8, %f8;
    fma.rn.f32  %f10, %f1, %f9, %f9;
    fma.rn.f32  %f11, %f1, %f10, %f10;
    fma.rn.f32  %f12, %f1, %f11, %f11;
    fma.rn.f32  %f13, %f1, %f12, %f12;
    fma.rn.f32  %f14, %f1, %f13, %f13;
    fma.rn.f32  %f15, %f1, %f14, %f14;
    fma.rn.f32  %f16, %f1, %f15, %f15;
    fma.rn.f32  %f17, %f1, %f16, %f16;
    fma.rn.f32  %f18, %f1, %f17, %f17;
    fma.rn.f32  %f19, %f1, %f18, %f18;
    fma.rn.f32  %f20, %f1, %f19, %f19;
    fma.rn.f32  %f21, %f1, %f20, %f20;
    fma.rn.f32  %f22, %f1, %f21, %f21;
    fma.rn.f32  %f23, %f1, %f22, %f22;
    fma.rn.f32  %f24, %f1, %f23, %f23;
    fma.rn.f32  %f25, %f1, %f24, %f24;
    fma.rn.f32  %f26, %f1, %f25, %f25;
    fma.rn.f32  %f27, %f1, %f26, %f26;
    fma.rn.f32  %f28, %f1, %f27, %f27;
    fma.rn.f32  %f29, %f1, %f28, %f28;
    fma.rn.f32  %f30, %f1, %f29, %f29;
    fma.rn.f32  %f31, %f1, %f30, %f30;
    fma.rn.f32  %f32, %f1, %f31, %f31;
    fma.rn.f32  %f33, %f1, %f32, %f32;
    fma.rn.f32  %f34, %f1, %f33, %f33;
    fma.rn.f32  %f35, %f1, %f34, %f34;
    fma.rn.f32  %f36, %f1, %f35, %f35;
    fma.rn.f32  %f37, %f1, %f36, %f36;
    fma.rn.f32  %f38, %f1, %f37, %f37;
    fma.rn.f32  %f39, %f1, %f38, %f38;
    fma.rn.f32  %f40, %f1, %f39, %f39;
    fma.rn.f32  %f41, %f1, %f40, %f40;
    fma.rn.f32  %f42, %f1, %f41, %f41;
    fma.rn.f32  %f43, %f1, %f42, %f42;
    fma.rn.f32  %f44, %f1, %f43, %f43;
    add.s32     %r4, %r4, 40;
    setp.ne.s32 %p1, %r4, 0;
    @%p1 bra    BB0_1;

    ld.param.u64    %rd4, [_Z12cudaFunctionP9Particles_param_0];
    cvta.to.global.u64  %rd3, %rd4;
    st.global.f32   [%rd3], %f44;
    ret;
}

Note that the mul instructions have now been replaced with fma (fused multiply-add) instructions, which still retire at the rate of 1 per clock cycle, but perform 2 FLOP per core per cycle (i.e. doubling the floating point operations per unit time). In this case the operation count in the above code changes to:

    float flopcount = 2.0f * float(niters) * float(numPtcls);

This version of the code runs in the same time as the original, but is now doing double the number of FLOP:

$ ./floppy 
3158.544128
3134.614016
3083.408640
3098.570240
3100.915968
3089.688576
3182.842368
3108.682496
3139.659520
3098.570240

This represents 80% of the theoretical peak of my device (which is also predicated on fused single precision multiply-add instructions).
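
If you want to sanity check that peak number for your own card, it can be derived from the device properties. The sketch below is only a back-of-the-envelope estimate: it assumes 128 FP32 cores per SM (correct for Maxwell parts such as the GTX 860M and GTX 970) and counts an FMA as 2 FLOP per core per clock:

#include <cuda_runtime.h>
#include <iostream>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    const int coresPerSM = 128; // assumption: FP32 cores per SM on Maxwell (sm_50/sm_52)

    // clockRate is reported in kHz; an FMA counts as 2 FLOP per core per cycle
    double peakGFLOPS = 2.0 * coresPerSM * prop.multiProcessorCount
                      * (prop.clockRate * 1e3) * 1e-9;

    std::cout << prop.name << ": ~" << peakGFLOPS << " GFLOP/s peak FP32" << std::endl;

    return 0;
}

The 1389 GFLOPS figure quoted for the GTX 860M comes from the same arithmetic (FP32 core count x boost clock x 2 FLOP), which is why getting close to it requires the kernel to be issuing almost nothing but fused multiply-adds.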

Finally, just for comparison's sake, here is the best performing code compiled for device debugging:

$ nvcc -arch=sm_52 -std=c++11 -G -o floppy floppy.cu

$ ./floppy 
66.823832
69.371288
67.816480
69.234680
68.168728
76.703976
79.013264
78.954016
79.187560
77.139656

i.e. the performance drops from about 80% of peak to about 2% of peak. The code emitted by the compiler is instructive:

.visible .entry _Z12cudaFunctionP9Particles(
    .param .u64 _Z12cudaFunctionP9Particles_param_0
)
{
    .reg .pred  %p<3>;
    .reg .f32   %f<9>;
    .reg .b32   %r<12>;
    .reg .b64   %rd<2>;


    .loc 1 16 1
func_begin6:
    .loc    1 0 0

    .loc 1 16 1

    ld.param.u64    %rd1, [_Z12cudaFunctionP9Particles_param_0];
func_exec_begin6:
    .loc    1 18 15
tmp12:
    mov.u32     %r4, %tid.x;
    mov.u32     %r5, %ctaid.x;
    mov.u32     %r6, %ntid.x;
    mul.lo.s32  %r7, %r5, %r6;
    add.s32     %r8, %r4, %r7;
    mov.b32     %r9, %r8;
tmp13:
    mov.f32     %f5, 0f3F800000;
    .loc    1 19 15
    mov.f32     %f1, %f5;
tmp14:
    .loc    1 21 20
    ld.f32  %f6, [%rd1];
    mov.f32     %f2, %f6;
tmp15:
    .loc    1 23 16
    mov.u32     %r10, 0;
    mov.b32     %r1, %r10;
tmp16:
    mov.f32     %f8, %f1;
tmp17:
    mov.u32     %r11, %r1;
tmp18:

BB6_1:
    .loc    1 23 5
    mov.u32     %r2, %r11;
    mov.f32     %f3, %f8;
tmp19:
    setp.lt.s32 %p1, %r2, 10000;
    not.pred    %p2, %p1;
    @%p2 bra    BB6_4;
    bra.uni     BB6_2;

BB6_2:
    .loc    1 24 9
tmp20:
    mul.f32     %f7, %f3, %f2;
    add.f32     %f4, %f3, %f7;
tmp21:

    .loc    1 23 34
    add.s32     %r3, %r2, 1;
tmp22:
    mov.f32     %f8, %f4;
tmp23:
    mov.u32     %r11, %r3;
tmp24:
    bra.uni     BB6_1;
tmp25:

BB6_4:
    .loc    1 27 5
    st.f32  [%rd1], %f3;
    .loc    1 28 1
    ret;
tmp26:
func_end6:
}

The loop unrolling is suppressed and the fused multiply-add instructions are replaced with separate mul and add instructions. Never discount the power of compiler optimization -- here the compiler is giving you roughly a 40x performance increase for free. Ignore that at your peril.
