
Accelerated FFT to be invoked from a Python Numba CUDA kernel

I need to calculate the Fourier transform of a 256-element float64 signal. The requirement is that these FFTs must be invoked from inside a cuda.jit-compiled section and must complete within 25 usec. Unfortunately, cuda.jit-compiled functions cannot call external libraries, so I wrote my own. Alas, my single-core code is still way too slow (~250 usec on a Quadro P4000). Is there a better way?

I created a single-core FFT function that gives correct results but is, alas, about 10x too slow. I don't understand how to make good use of multiple cores.

---fft.py

from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):    
    n = len(vector) 
    levels = int32(math.log(float32(n))/math.log(float32(2)))

    assert 2**levels==n # error: Length is not a power of 2 

    #uncomment either the Numba CUDA or the NumPy memory allocation (could this be selected by conditional compilation?)
    exptable = cuda.local.array(1024, dtype=complex128)   
    #exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    coef = complex128((2j if inverse else -2j) * math.pi / n)   
    for i in range(n // 2):                       
        exptable[i] = cmath.exp(i * coef)       

    for i in range(n):
        x = i   
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]      

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]    
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale=float64(n if inverse else 1)
    for i in range(n):
        out[i]=out[i]/scale   # the inverse requires a scaling

# now create the Numba.cuda version to be called by a GPU
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

---test.py

from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit  
import fft

@cuda.jit(void(float64[:],boolean, complex128[:]))    
def fftbench(y, inverse, FT):
  Y  = cuda.local.array(256, dtype=complex128)

  for i in range(len(y)):
    Y[i]=complex128(y[i])    
  fft.gtransform_radix2(Y, inverse, FT)


fmt='\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125  ,129.62109375 ,118.6015625
 ,110.2890625  ,106.55078125 ,104.8203125  ,106.1875     ,109.328125
 ,113.5        ,118.6640625  ,125.71875    ,127.625      ,120.890625
 ,114.04296875 ,112.0078125  ,112.71484375 ,110.18359375 ,104.8828125
 ,104.47265625 ,106.65625    ,109.53515625 ,110.73828125 ,111.2421875
 ,112.28125    ,112.38671875 ,112.7734375  ,112.7421875  ,113.1328125
 ,113.24609375 ,113.15625    ,113.66015625 ,114.19921875 ,114.5
 ,114.5546875  ,115.09765625 ,115.2890625  ,115.7265625  ,115.41796875
 ,115.73828125 ,116.         ,116.55078125 ,116.5625     ,116.33984375
 ,116.63671875 ,117.015625   ,117.25       ,117.41015625 ,117.6640625
 ,117.859375   ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
 ,118.80859375 ,118.67578125 ,118.78125    ,118.49609375 ,119.0078125
 ,119.09375    ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
 ,119.890625   ,119.80078125 ,119.69140625 ,119.65625    ,119.83984375
 ,119.9609375  ,120.15625    ,120.2734375  ,120.47265625 ,120.671875
 ,120.796875   ,120.4609375  ,121.1171875  ,121.35546875 ,120.94921875
 ,120.984375   ,121.35546875 ,120.87109375 ,120.8359375  ,121.2265625
 ,121.2109375  ,120.859375   ,121.17578125 ,121.60546875 ,121.84375
 ,121.5859375  ,121.6796875  ,121.671875   ,121.78125    ,121.796875
 ,121.8828125  ,121.9921875  ,121.8984375  ,122.1640625  ,121.9375
 ,122.         ,122.3515625  ,122.359375   ,122.1875     ,122.01171875
 ,121.91015625 ,122.11328125 ,122.1171875  ,122.6484375  ,122.81640625
 ,122.33984375 ,122.265625   ,122.78125    ,122.44921875 ,122.34765625
 ,122.59765625 ,122.63671875 ,122.6796875  ,122.6171875  ,122.34375
 ,122.359375   ,122.7109375  ,122.83984375 ,122.546875   ,122.25390625
 ,122.06640625 ,122.578125   ,122.7109375  ,122.83203125 ,122.5390625
 ,122.2421875  ,122.06640625 ,122.265625   ,122.13671875 ,121.8046875
 ,121.87890625 ,121.88671875 ,122.2265625  ,121.63671875 ,121.14453125
 ,120.84375    ,120.390625   ,119.875      ,119.34765625 ,119.0390625
 ,118.4609375  ,117.828125   ,117.1953125  ,116.9921875  ,116.046875
 ,115.16015625 ,114.359375   ,113.1875     ,110.390625   ,108.41796875
 ,111.90234375 ,117.296875   ,127.0234375  ,147.58984375 ,158.625
 ,129.8515625  ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
 ,143.9296875  ,150.24609375 ,141.         ,117.71484375 ,109.80859375
 ,115.24609375 ,118.44140625 ,120.640625   ,120.9921875  ,111.828125
 ,101.6953125  ,111.21484375 ,114.91015625 ,115.2265625  ,118.21875
 ,125.3359375  ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
 ,141.67578125 ,139.53125    ,136.44921875 ,135.08203125 ,135.7890625
 ,137.58203125 ,138.7265625  ,154.33203125 ,172.01171875 ,152.24609375
 ,129.8046875  ,125.59375    ,125.234375   ,127.32421875 ,132.8984375
 ,147.98828125 ,152.328125   ,153.7734375  ,155.09765625 ,156.66796875
 ,159.0546875  ,151.83203125 ,138.91796875 ,138.0546875  ,140.671875
 ,143.48046875 ,143.99609375 ,146.875      ,146.7578125  ,141.15234375
 ,141.5        ,140.76953125 ,140.8828125  ,145.5625     ,150.78125
 ,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
 ,131.95703125 ,125.40625    ,123.265625   ,123.57421875 ,129.859375
 ,135.6484375  ,144.51171875 ,155.05078125 ,158.4453125  ,140.8125
 ,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875  ,143.38671875
 ,143.69921875 ,137.734375   ,124.48046875 ,116.73828125 ,114.84765625
 ,113.85546875 ,117.45703125 ,122.859375   ,125.8515625  ,133.22265625
 ,139.484375   ,135.75       ,122.69921875 ,115.7734375  ,116.9375
 ,127.57421875] 
y1 =cp.zeros(len(a), cp.float64)   # real input; fftbench converts it to complex on the device
FT1=cp.zeros(len(a), cp.complex128)

for i in range(len(a)):
  y1[i]=a[i]

r=1000
series=sorted(timeit.repeat("fftbench[1, 1](y1, False, FT1)", number=1, repeat=r, globals=globals()))
series=series[0:r-5]
print(fmt % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)))
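
For reference, a minimal host-side sanity check (a sketch; it assumes one timed launch of fftbench has already completed, so FT1 holds the transform of y1) could compare the kernel output against NumPy:

# sketch: compare the device result with numpy.fft.fft on the host
cuda.synchronize()                     # make sure the last kernel launch has finished
ref = np.fft.fft(np.asarray(a, dtype=np.float64))
print("max abs error vs numpy.fft.fft:", np.max(np.abs(cp.asnumpy(FT1) - ref)))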



A faster implementation (t << 25 usec)

The drawback of your algorithm is that, even on a GPU, it runs on a single core.

In order to understand how to design algorithms for Nvidia GPGPUs, I recommend looking at the CUDA C Programming Guide and at the Numba documentation to apply the concepts in Python.

Moreover, to understand what is wrong with your code, I recommend using the Nvidia profiler.

The following parts of the answer explain how to apply these basics to your example.


Run multiple threads

To improve performance, you first need to launch multiple threads that can run in parallel. CUDA handles threads as follows:

  • Threads are grouped into blocks of n threads (n ≤ 1024).

  • Threads within the same block can be synchronized and have access to a fast common memory space called "shared memory".

  • You can run multiple blocks in parallel in a "grid", but you then lose the intra-block synchronization mechanism.

The syntax for launching multiple threads is the following:

fftbench[griddim, blockdim](y1, False, FT1)

To simplify, I will use only one block of 256 threads:

fftbench[1, 256](y1, False, FT1)
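
Inside the kernel, each of those 256 threads can find its own position via cuda.threadIdx.x. A minimal sketch of this mapping (the kernel name show_thread_mapping is made up for illustration):

from numba import cuda

@cuda.jit
def show_thread_mapping(out):
    idx = cuda.threadIdx.x          # 0..255 within the single block
    if idx < out.shape[0]:
        out[idx] = idx              # each thread writes exactly one cell

# launched as: show_thread_mapping[1, 256](some_device_array)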

Memory

To improve GPU performance, it is important to consider where the data will be stored. There are three main memory spaces:

  • Global memory: the "RAM" of your GPU; it is slow and has high latency, and this is where all your arrays are placed when you send them to the GPU.

  • Shared memory: a small, fast memory; all the threads of a block have access to the same shared memory.

  • Local memory: physically the same as global memory, but each thread accesses its own local memory.

Typically, if you use the same data multiple times, you should try to store it in shared memory to avoid the latency of global memory.

In your code, you can store exptable in shared memory:

exptable = cuda.shared.array(1024, dtype=complex128)

and if n is not too big, you may want to use a shared working array instead of out:

working = cuda.shared.array(256, dtype=complex128)
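
Both are block-level allocations. A minimal sketch (the kernel name shared_memory_skeleton is made up for illustration) showing where such declarations go and that cuda.shared.array requires a compile-time constant size:

from numba import cuda, complex128

@cuda.jit
def shared_memory_skeleton(out):    # out: a complex128 device array of length <= 256
    # shared arrays are declared inside the kernel with fixed, compile-time sizes
    exptable = cuda.shared.array(1024, dtype=complex128)   # one copy per block, visible to all its threads
    working  = cuda.shared.array(256,  dtype=complex128)
    idx = cuda.threadIdx.x
    if idx < out.shape[0]:
        working[idx] = complex128(idx)   # each thread fills its own shared cell
        out[idx] = working[idx]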

Assign tasks to each thread

Of course, if you do not change your function, all threads will do the same job, and that will only slow your program down.

In this example we will assign each thread to one cell of the array. To do so, we need the unique id of the thread within its block:

idx = cuda.threadIdx.x

Now we can speed up the for loops; let's handle them one by one:

exptable = cuda.shared.array(1024, dtype=complex128)   
...
for i in range(n // 2):                       
    exptable[i] = cmath.exp(i * coef)       

Here is the goal: we want the first n/2 threads to fill this array, after which every thread will be able to use it.

So in this case, just replace the for loop with a condition on the thread idx:

if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)

The last two loops are easier: each thread handles one cell of the array:

for i in range(n):
    x = i   
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    out[i] = vector[y]   

becomes

x = idx   
y = 0
for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
working[idx] = vector[y]

and

for i in range(n):
    out[i]=out[i]/scale   # the inverse requires a scaling

becomes

out[idx]=working[idx]/scale   # the inverse requires a scaling

I use the shared array working, but you can replace it with out if you want to use global memory.

Now let's look at the while loop. We said that we want each thread to deal with only one cell of the array, so we can try to parallelize the two inner for loops.

    ...
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            temp = out[j + halfsize] * exptable[k]    
            out[j + halfsize] = out[j] - temp
            out[j] += temp
            k += tablestep
    ...

To simplify, I will use only half of the threads: we take the first 128 threads and determine j as follows:

    ...
    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)
    ...

and k is:

    k = tablestep*(idx%halfsize)

which gives us the loop:

size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)            
        k = tablestep*(idx%halfsize)
        temp = working[j + halfsize] * exptable[k]
        working[j + halfsize] = working[j] - temp
        working[j] += temp
    size *= 2
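
This mapping can be verified on the host with plain Python, no GPU needed (a sketch; the helper names below are made up for the check):

def butterflies_original(n, size):          # (j, k) pairs visited by the original nested loops
    halfsize, tablestep, pairs = size // 2, n // size, []
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            pairs.append((j, k))
            k += tablestep
    return sorted(pairs)

def butterflies_threaded(n, size):          # (j, k) pairs produced by the per-thread formulas
    halfsize, tablestep = size // 2, n // size
    return sorted(((idx % halfsize) + size * (idx // halfsize),
                   tablestep * (idx % halfsize)) for idx in range(n // 2))

n, size = 256, 2
while size <= n:
    assert butterflies_original(n, size) == butterflies_threaded(n, size)
    size *= 2
print("the (j, k) mapping matches the original loops for every stage")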

Synchronization 同步化

Last but not least, we need to synchronize all these threads. In fact, the program will not work if we do not synchronize. On the GPU, threads may not run at the same time, so you can get issues when data is produced by one thread and consumed by another, for example:

  • exptable[0] is used by thread 2 before thread 0 has stored its value

  • working[j + halfsize] is modified by another thread before you store it in temp

To prevent this, we can use the function:

cuda.syncthreads()

Every thread in the same block must reach this line before any of them executes the rest of the code.

In this example, you need to synchronize at two points: after the working array is initialized and after each iteration of the while loop.

Your code then looks like this:

def _transform_radix2(vector, inverse, out):    
  n = len(vector) 
  idx = cuda.threadIdx.x   # index of this thread within the block
  levels = int32(math.log(float32(n))/math.log(float32(2)))

  assert 2**levels==n # error: Length is not a power of 2 

  exptable = cuda.shared.array(1024, dtype=complex128)
  working = cuda.shared.array(256, dtype=complex128)

  assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

  coef = complex128((2j if inverse else -2j) * math.pi / n)   
  if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)    

  x = idx   
  y = 0
  for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
  working[idx] = vector[y]    
  cuda.syncthreads()

  size = 2
  while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
      j = (idx%halfsize) + size*(idx//halfsize)            
      k = tablestep*(idx%halfsize)
      temp = working[j + halfsize] * exptable[k]
      working[j + halfsize] = working[j] - temp
      working[j] += temp
    size *= 2
    cuda.syncthreads()

  scale=float64(n if inverse else 1)
  out[idx]=working[idx]/scale   # the inverse requires a scaling

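As a usage sketch (assuming this rewritten device function replaces fft.gtransform_radix2 and is wrapped by the same fftbench kernel as in test.py), the kernel is launched with one block of 256 threads and can be checked on the host:

import numpy as np
import cupy as cp
from numba import cuda

signal = np.random.rand(256)
y_dev  = cp.asarray(signal)                  # float64, matching fftbench's signature
ft_dev = cp.zeros(256, dtype=cp.complex128)

fftbench[1, 256](y_dev, False, ft_dev)       # one block, one thread per sample
cuda.synchronize()

err = np.max(np.abs(cp.asnumpy(ft_dev) - np.fft.fft(signal)))
print("max abs error vs numpy.fft.fft:", err)
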
I feel your question is a good way to introduce some basics of GPGPU computing, and I have tried to answer it in a didactic way. The final code is far from perfect and could be optimized a lot; I highly recommend reading the CUDA C Programming Guide if you want to learn more about GPU optimizations.


 