Numba CUDA `vectorize` and `reduce` decorators slower than expected

Question

I've been testing out some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. It is possible to accelerate the algorithm and one of the main steps in doing so can be summarized in the following dummy function

def dummy(arr1, arr2):
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)

This function runs reasonably fast on the CPU but I'd like to keep everything on the GPU to avoid host <---> device copies.

To compare the speeds of the different calculations I wrote a short set of functions:

import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time


# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

@cuda.reduce
def sum_gpu(a, b):
    return a + b

@cuda.jit
def add_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

@cuda.jit
def mult_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

@cuda.jit
def mult_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

@cuda.jit
def add_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]

and some timer functions:

def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into nice formating"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)

def test_1d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

def test_2d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

Running the test functions

numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)

gives the following output:

Found 1 CUDA devices
id 0    b'GeForce GTX TITAN X'                              [SUPPORTED]
                      compute capability: 5.2
                           pci device id: 0
                              pci bus id: 3
Summary:
    1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us

Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us

Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us

Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us

Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms

============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us

Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us

Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us

Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us

Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms

It seems like the @cuda.vectorize decorated functions perform slower than the CPU and custom written @cuda.jit functions. While the @cuda.jit functions give the expected orders of magnitude speed up and nearly constant time performance (results not shown).

On the other hand the @cuda.reduce function runs significantly slower than either the @cuda.vectorize function or the CPU function.

Is there a reason for the poor performance of the @cuda.vectorize and @cuda.reduce functions? Is it possible to write a CUDA reduction kernel using just Numba?

EDIT:

Looks like this is a legitimate bug in Numba : https://github.com/numba/numba/issues/2266 , https://github.com/numba/numba/issues/2268

Answer 1

I can not explain the behavior of @cuda.vectorize and @cuda.reduce . Some times the results looks a bit strange for me. For example here Negative Speed Gain Using Numba Vectorize target='cuda' @cuda.vectorize slows down the calculations, while using of @cuda.jit allow to speed up it. Here I would suggest to try PyCUDA ( https://documen.tician.de/pycuda/ ). I've tested the performance of dot product ( https://documen.tician.de/pycuda/array.html ).

import numpy as np
from pycuda.curandom import rand as curand
import pycuda.gpuarray as gpuarray
import pycuda.driver as pycu
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
import numba.cuda as cuda 
from time import time

dot = ReductionKernel(dtype_out=np.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="float *x, float *y")
n = 2**24
x = curand((n), dtype=np.float32)
y = curand((n), dtype=np.float32)

x_cpu = np.random.random((n))
y_cpu = np.random.random((n))

st = time()
x_dot_y = dot(x, y).get()
gpu_time = (time() - st)
print "GPU: ", gpu_time

st = time()
x_dot_y_cpu = np.dot(x_cpu, y_cpu)
cpu_time = (time() - st)
print "CPU: ", cpu_time
print "speedup: ", cpu_time/gpu_time

On my PC CPU: Intel Core2 Quad 3GHz, GPU: NVIDIA GeForce GTX 580. I've got the following results:

GPU:  0.00191593170166
CPU:  0.0518710613251
speedup:  27.0735440518

It is necessary to notice, that the time necessary for initialization and precompiling of the kernel did not taken into account in the code above. However, this time may be significant. Taking this time into account I've obtained:

GPU:  0.316560029984
CPU:  0.0511090755463
speedup:  0.161451449031

So, in this case the GPU code is slower than CPU ones. At the same time, for most application you need initialize the kernel just once and then use it many times. In this case it looks reasonable to use PyCUDA reduction kernels.

Before, I have tested the performance of @cuda.jit , PyCUDA and CUDA-C code by calculation of 2D diffusivity equation. I was found that PyCUDA allows to get almost the same performance as CUDA-C, while Numba demonstrates worse performance. The figure below demonstrates these results.

Numba CUDA `vectorize` and `reduce` decorators slower than expected

Question

1 answers

solution1
2 2017-02-03 04:42:36

Numba CUDA `vectorize` and `reduce` decorators slower than expected

Question

1 answers

solution1 2 2017-02-03 04:42:36

solution1
2 2017-02-03 04:42:36