Solve the Poisson equation using FFT with CUDA

Question

I'm following a tutorial on using the cuFFT library here: http://gpgpu.org/static/sc2007/SC07_CUDA_3_Libraries.pdf

After following line by line of its code, I'm getting really strange results.

I have input data that is an NxN array of float s. The program does a FFT forward transform, solves Poisson's equation, and then does an inverse FFT . The input data (and output data) is referred to as a square image with sidelength N . When I comment out solve_poisson <<<dimGrid, dimBlock>>> (r_complex_d, kx_d, ky_d, N); , it correctly forward transforms the data and then performs an inverse transform, which causes the output data to be the same as the input data. This is supposed to happen.

Here is the output without calling the solve_poisson method.

0       r_initial: 0.00125126   r: 0.00125132
1       r_initial: 0.563585     r: 0.563585
2       r_initial: 0.193304     r: 0.193304
3       r_initial: 0.80874      r: 0.80874
4       r_initial: 0.585009     r: 0.585009
5       r_initial: 0.479873     r: 0.479873
6       r_initial: 0.350291     r: 0.350291
7       r_initial: 0.895962     r: 0.895962
8       r_initial: 0.82284      r: 0.82284
9       r_initial: 0.746605     r: 0.746605
10      r_initial: 0.174108     r: 0.174108
11      r_initial: 0.858943     r: 0.858943
12      r_initial: 0.710501     r: 0.710502
13      r_initial: 0.513535     r: 0.513535
14      r_initial: 0.303995     r: 0.303995
15      r_initial: 0.0149846    r: 0.0149846
Press any key to continue . . .

However, when I uncomment out the solve_poisson method, the output data is inf or nan , which leads me to believe that the scale variable was somehow close to zero in the solve_poisson method. So I changed float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]); to float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f . This change is not in the original tutorial. The results computed here are not supposed to have extreme positive or negative values.

0       r_initial: 0.00125126   r: -11448.1
1       r_initial: 0.563585     r: 11449.3
2       r_initial: 0.193304     r: -11448.3
3       r_initial: 0.80874      r: 11449.2
4       r_initial: 0.585009     r: 11449.4
5       r_initial: 0.479873     r: -11448.4
6       r_initial: 0.350291     r: 11449.5
7       r_initial: 0.895962     r: -11448.6
8       r_initial: 0.82284      r: -11448.5
9       r_initial: 0.746605     r: 11449.4
10      r_initial: 0.174108     r: -11448.3
11      r_initial: 0.858943     r: 11449.3
12      r_initial: 0.710501     r: 11449.2
13      r_initial: 0.513535     r: -11448.4
14      r_initial: 0.303995     r: 11449.3
15      r_initial: 0.0149846    r: -11448.1
Press any key to continue . . .

In the tutorial, a sample calculation on slide 43 on page 22 is computed=0.975879 reference=0.975882 , yet my results are completely different and really large.

The following code is what I used.

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>

#include <stdlib.h>
#include <iostream>

#define N 4 //4 X 4 // N is the sidelength of the image -> 16 pixels in entire image
#define block_size_x 2 
#define block_size_y 2

__global__ void real2complex(cufftComplex *c, float *a, int n);
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n);
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n);


int main()
{
    float *kx, *ky, *r;
    kx = (float *)malloc(sizeof(float) * N);
    ky = (float *)malloc(sizeof(float) * N);
    r = (float *)malloc(sizeof(float) * N * N);

    float *kx_d, *ky_d, *r_d;
    cufftComplex *r_complex_d;
    cudaMalloc((void **)&kx_d, sizeof(float) * N);
    cudaMalloc((void **)&ky_d, sizeof(float) * N);
    cudaMalloc((void **)&r_d, sizeof(float) * N * N);
    cudaMalloc((void **)&r_complex_d, sizeof(cufftComplex) * N * N);

    for (int y = 0; y < N; y++)
        for (int x = 0; x < N; x++)
            r[x + y * N] = rand() / (float)RAND_MAX;
            //r[x + y * N] = sin(exp(-((x - N / 2.0f) * (x - N / 2.0f) + (N / 2.0f - y) * (N / 2.0f - y)) / (20 * 20))) * 255 / sin(1); //Here is sample data that will high values at the center of the image and low values as you go farther and farther away from the center.

    float* r_inital = (float *)malloc(sizeof(float) * N * N);
    for (int i = 0; i < N * N; i++)
        r_inital[i] = r[i];

    for (int i = 0; i < N; i++)
    {
        kx[i] = i - N / 2.0f; //centers kx values to be at center of image
        ky[i] = N / 2.0f - i; //centers ky values to be at center of image
    }

    cudaMemcpy(kx_d, kx, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(ky_d, ky, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(r_d, r, sizeof(float) * N * N, cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan2d(&plan, N, N, CUFFT_C2C);

    /* Compute the execution configuration, block_size_x*block_size_y = number of threads */
    dim3 dimBlock(block_size_x, block_size_y);
    dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
    /* Handle N not multiple of block_size_x or block_size_y */
    if (N % block_size_x != 0) dimGrid.x += 1;
    if (N % block_size_y != 0) dimGrid.y += 1;

    real2complex << < dimGrid, dimBlock >> > (r_complex_d, r_d, N);

    cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
    solve_poisson << <dimGrid, dimBlock >> > (r_complex_d, kx_d, ky_d, N);
    cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);

    float scale = 1.0f / (N * N);
    complex2real_scaled << <dimGrid, dimBlock >> > (r_d, r_complex_d, scale, N);

    cudaMemcpy(r, r_d, sizeof(float) * N * N, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N * N; i++)
        std::cout << i << "\tr_initial: " << r_inital[i] << "\tr: " << r[i] << std::endl;
    system("pause");

    /* Destroy plan and clean up memory on device*/
    free(kx);
    free(ky);
    free(r);
    free(r_inital);
    cufftDestroy(plan);
    cudaFree(r_complex_d);
    cudaFree(kx_d);
}

__global__ void real2complex(cufftComplex *c, float *a, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        c[index].x = a[index];
        c[index].y = 0.0f;
    }
}

__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        a[index] = scale * c[index].x;
    }
}


__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
    /* compute idx and idy, the location of the element in the original NxN array */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f;
        if (idx == 0 && idy == 0) scale = 1.0f;
        scale = 1.0f / scale;
        c[index].x *= scale;
        c[index].y *= scale;
    }
}

Is there anything I messed up on? I would really appreciate if anyone could help me out.

Answer 1

Although the poster has found the mistake by himself, I want to share my own implementation of the 2D Poisson equation solver.

The implementation slightly differs from the one linked to by the poster.

The theory is reported at Solve Poisson Equation Using FFT .

MATLAB VERSION

I first report the Matlab version for reference:

clear all
close all
clc

M       = 64;              % --- Number of Fourier harmonics along x (should be a multiple of 2)  
N       = 32;              % --- Number of Fourier harmonics along y (should be a multiple of 2)  
Lx      = 3;               % --- Domain size along x
Ly      = 1.5;               % --- Domain size along y
sigma   = 0.1;             % --- Characteristic width of f (make << 1)

% --- Wavenumbers
kx = (2 * pi / Lx) * [0 : (M / 2 - 1) (- M / 2) : (-1)]; % --- Wavenumbers along x
ky = (2 * pi / Ly) * [0 : (N / 2 - 1) (- N / 2) : (-1)]; % --- Wavenumbers along y
[Kx, Ky]  = meshgrid(kx, ky); 

% --- Right-hand side of differential equation
hx              = Lx / M;                   % --- Grid spacing along x
hy              = Ly / N;                   % --- Grid spacing along y
x               = (0 : (M - 1)) * hx;
y               = (0 : (N - 1)) * hy;
[X, Y]          = meshgrid(x, y);
rSquared        = (X - 0.5 * Lx).^2 + (Y - 0.5 * Ly).^2;
sigmaSquared    = sigma^2;
f               = exp(-rSquared / (2 * sigmaSquared)) .* (rSquared - 2 * sigmaSquared) / (sigmaSquared^2);
fHat            = fft2(f);

% --- Denominator of the unknown spectrum
den             = -(Kx.^2 + Ky.^2); 
den(1, 1)       = 1;            % --- Avoid division by zero at wavenumber (0, 0)

% --- Unknown determination
uHat            = ifft2(fHat ./ den);
% uHat(1, 1)      = 0;            % --- Force the unknown spectrum at (0, 0) to be zero
u               = real(uHat);
u               = u - u(1,1);   % --- Force arbitrary constant to be zero by forcing u(1, 1) = 0

% --- Plots
uRef    = exp(-rSquared / (2 * sigmaSquared));
err     = 100 * sqrt(sum(sum(abs(u - uRef).^2)) / sum(sum(abs(uRef).^2)));
errMax  = norm(u(:)-uRef(:),inf)
fprintf('Percentage root mean square error = %f\n', err);
fprintf('Maximum error = %f\n', errMax);
surf(X, Y, u)
xlabel('x')
ylabel('y')
zlabel('u')
title('Solution of 2D Poisson equation by spectral method')

CUDA VERSION

Here is the corresponding CUDA version:

#include <stdio.h>
#include <fstream>
#include <iomanip>

// --- Greek pi
#define _USE_MATH_DEFINES
#include <math.h>

#include <cufft.h>

#define BLOCKSIZEX      16
#define BLOCKSIZEY      16

#define prec_save 10

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************************************/
/* COMPUTE RIGHT HAND SIDE OF 2D POISSON EQUATION */
/**************************************************/
__global__ void computeRHS(const float * __restrict__ d_x, const float * __restrict__ d_y,
                           float2 * __restrict__ d_r, const float Lx, const float Ly, const float sigma, 
                           const int M, const int N) {

    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    const float sigmaSquared = sigma * sigma;

    const float rSquared = (d_x[tidx] - 0.5f * Lx) * (d_x[tidx] - 0.5f * Lx) +
                           (d_y[tidy] - 0.5f * Ly) * (d_y[tidy] - 0.5f * Ly);

    d_r[tidy * M + tidx].x = expf(-rSquared / (2.f * sigmaSquared)) * (rSquared - 2.f * sigmaSquared) / (sigmaSquared * sigmaSquared);
    d_r[tidy * M + tidx].y = 0.f;

}

/****************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN */
/****************************************************/
__global__ void solvePoisson(const float * __restrict__ d_kx, const float * __restrict__ d_ky, 
                              float2 * __restrict__ d_r, const int M, const int N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    float scale = -(d_kx[tidx] * d_kx[tidx] + d_ky[tidy] * d_ky[tidy]);

    if (tidx == 0 && tidy == 0) scale = 1.f;

    scale = 1.f / scale;
    d_r[M * tidy + tidx].x *= scale;
    d_r[M * tidy + tidx].y *= scale;

}

/****************************************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN - SHARED MEMORY VERSION */
/****************************************************************************/
__global__ void solvePoissonShared(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
    float2 * __restrict__ d_r, const int M, const int N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    // --- Use shared memory to minimize multiple access to same spectral coordinate values
    __shared__ float kx_s[BLOCKSIZEX], ky_s[BLOCKSIZEY];

    kx_s[threadIdx.x] = d_kx[tidx];
    ky_s[threadIdx.y] = d_ky[tidy];
    __syncthreads();

    float scale = -(kx_s[threadIdx.x] * kx_s[threadIdx.x] + ky_s[threadIdx.y] * ky_s[threadIdx.y]);

    if (tidx == 0 && tidy == 0) scale = 1.f;

    scale = 1.f / scale;
    d_r[M * tidy + tidx].x *= scale;
    d_r[M * tidy + tidx].y *= scale;

}

/******************************/
/* COMPLEX2REAL SCALED KERNEL */
/******************************/
__global__ void complex2RealScaled(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    d_result[tidy * M + tidx] = scale * (d_r[tidy * M + tidx].x - d_r[0].x);
}

/******************************************/
/* COMPLEX2REAL SCALED KERNEL - OPTIMIZED */
/******************************************/
__global__ void complex2RealScaledOptimized(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx >= M) || (tidy >= N)) return;

    __shared__ float d_r_0[1];

    if (threadIdx.x == 0) d_r_0[0] = d_r[0].x;

    volatile float2 c;
    c.x = d_r[tidy * M + tidx].x;
    c.y = d_r[tidy * M + tidx].y;

    d_result[tidy * M + tidx] = scale * (c.x - d_r_0[0]);
}

/**************************************/
/* SAVE FLOAT2 ARRAY FROM GPU TO FILE */
/**************************************/
void saveGPUcomplextxt(const float2 * d_in, const char *filename, const int M) {

    float2 *h_in = (float2 *)malloc(M * sizeof(float2));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(float2), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) {
        //printf("%f %f\n", h_in[i].c.x, h_in[i].c.y);
        outfile << std::setprecision(prec_save) << h_in[i].x << "\n"; outfile << std::setprecision(prec_save) << h_in[i].y << "\n";
    }
    outfile.close();

}

/*************************************/
/* SAVE FLOAT ARRAY FROM GPU TO FILE */
/*************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/********/
/* MAIN */
/********/
int main()
{
    const int   M       = 64;              // --- Number of Fourier harmonics along x (should be a multiple of 2)
    const int   N       = 32;              // --- Number of Fourier harmonics along y(should be a multiple of 2)
    const float Lx      = 3.f;             // --- Domain size along x
    const float Ly      = 1.5f;            // --- Domain size along y
    const float sigma   = 0.1f;            // --- Characteristic width of f(make << 1)

    // --- Wavenumbers on the host
    float *h_kx = (float *)malloc(M * sizeof(float));
    float *h_ky = (float *)malloc(N * sizeof(float));
    for (int k = 0; k < M / 2; k++)  h_kx[k]        = (2.f * M_PI / Lx) * k;
    for (int k = -M / 2; k < 0; k++) h_kx[k + M]    = (2.f * M_PI / Lx) * k;
    for (int k = 0; k < N / 2; k++)  h_ky[k]        = (2.f * M_PI / Ly) * k;
    for (int k = -N / 2; k < 0; k++) h_ky[k + N]    = (2.f * M_PI / Ly) * k;

    // --- Wavenumbers on the device
    float *d_kx;    gpuErrchk(cudaMalloc(&d_kx, M * sizeof(float)));
    float *d_ky;    gpuErrchk(cudaMalloc(&d_ky, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_kx, h_kx, M * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ky, h_ky, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- Domain discretization on the host
    float *h_x = (float *)malloc(M * sizeof(float));
    float *h_y = (float *)malloc(N * sizeof(float));
    for (int k = 0; k < M; k++)  h_x[k] = Lx / (float)M * k;
    for (int k = 0; k < N; k++)  h_y[k] = Ly / (float)N * k;

    // --- Domain discretization on the device
    float *d_x;     gpuErrchk(cudaMalloc(&d_x, M * sizeof(float)));
    float *d_y;     gpuErrchk(cudaMalloc(&d_y, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_x, h_x, M * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- Compute right-hand side of the equation on the device
    float2 *d_r;    gpuErrchk(cudaMalloc(&d_r, M * N * sizeof(float2)));
    dim3 dimBlock(BLOCKSIZEX, BLOCKSIZEY);
    dim3 dimGrid(iDivUp(M, BLOCKSIZEX), iDivUp(N, BLOCKSIZEY));
    computeRHS << <dimGrid, dimBlock >> >(d_x, d_y, d_r, Lx, Ly, sigma, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Create plan for CUDA FFT
    cufftHandle plan;
    cufftPlan2d(&plan, N, M, CUFFT_C2C);

    // --- Compute in place forward FFT of right-hand side
    cufftExecC2C(plan, d_r, d_r, CUFFT_FORWARD);

    // --- Solve Poisson equation in Fourier space 
    //solvePoisson << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
    solvePoissonShared << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);

    // --- Compute in place inverse FFT
    cufftExecC2C(plan, d_r, d_r, CUFFT_INVERSE);

    //saveGPUcomplextxt(d_r, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_r.txt", M * N);

    // --- With cuFFT, an FFT followed by an IFFT will return the same array times the size of the transform
    // --- Accordingly, we need to scale the result.
    const float scale = 1.f / ((float)M * (float)N);
    float *d_result;    gpuErrchk(cudaMalloc(&d_result, M * N * sizeof(float)));
    //complex2RealScaled << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
    complex2RealScaledOptimized << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);

    //saveGPUrealtxt(d_result, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_result.txt", M * N);

    // --- Transfer data from device to host
    float *h_result = (float *)malloc(M * N * sizeof(float));
    gpuErrchk(cudaMemcpy(h_result, d_result, M * N * sizeof(float), cudaMemcpyDeviceToHost));

    return 0;

}

Answer 2

As it shows in the tutorial , the Matlab implementation on slide 33 on page 17 shows that the Poisson calculations are based on the top left corner of the screen as the origin. The x and y data values are then x = (0:(N-1))*h; and y = (0:(N-1))*h; , which is why the meshgrid created from these x and y values both start from 0 and increase, as shown on the graph's x and y axes on slide 31 on page 16. In this case, where the image length was 1 (I refer to the input data of the NxN float array or the meshgrid as an image), the center of the image is actually (0.5, 0.5). I wanted to translate these points, so the center point would instead be (0,0) and followed a typical representation of the Cartesian Plane .

So in my code, instead of the Matlab code of

x = (0:(N-1))*h;
y = (0:(N-1))*h;

which could be implemented as

for (int i = 0; i < N; i++)
    {
        kx[i] = i;
        ky[i] = i;
    }

I replaced it with

for (int i = 0; i < N; i++)
    {
        kx[i] = i - N / 2.0f; //centers kx values to be at center of image
        ky[i] = N / 2.0f - i; //centers ky values to be at center of image
    }

However, I had forgot to change the Poisson calculation so it recognizes the center of the image as the origin instead of the top right corner as the origin. So as Mr. Robert Crovella said, I would have to

change this line: if (idx == 0 && idy == 0) scale = 1.0f; to this: if (idx == 2 && idy == 2) scale = 1.0f;

for the case where the image length, or N, is 4. To generalize this for any image length, this line of code could be then changed to if (idx == n/2 && idy == n/2) scale = 1.0f;

Solve the Poisson equation using FFT with CUDA

Question

2 answers

solution1
2 2017-12-29 16:43:29

solution2
1 ACCPTED 2017-03-19 15:04:16

Solve the Poisson equation using FFT with CUDA

Question

2 answers

solution1 2 2017-12-29 16:43:29

solution2 1 ACCPTED 2017-03-19 15:04:16

solution1
2 2017-12-29 16:43:29

solution2
1 ACCPTED 2017-03-19 15:04:16