C++ Cuda performance for double pointers

Question

I finally succeeded in storing a double pointer in device memory so that I can use it in CUDA (the code is below), but I see that it is less performant than flattening the matrix, which is not great.

Are there any suggestions for saving some time or memory?

I really want to use a dynamic 2D array.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>

// Stores the constant 3 into one element of a table of row pointers.
// Every thread derives its target from its block-local coordinates only:
// threadIdx.x picks the row pointer and threadIdx.y the position inside that
// row. blockIdx is not used and nothing is bounds-checked, so the block
// dimensions must not exceed the number of rows (x) and the row length (y).
__global__ void fct(int **dev_c)
{
    int *row = dev_c[threadIdx.x];
    row[threadIdx.y] = 3;
}

// Prints the failing operation together with the CUDA error text and
// terminates the program when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds a rows x cols table of separately allocated device rows, lets the
// fct kernel fill it, copies every row back to the host and prints the result.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or CUDA call fails.
int main(void)
{
    const int rows = 2;
    const int cols = 2;

    // Output array in host memory, one allocation per row.
    int **cc = new int*[rows];
    for (int i = 0; i < rows; i++) cc[i] = new int[cols];

    // Table that lives in host memory but whose entries are DEVICE row
    // pointers; it is the staging copy of the table the kernel dereferences.
    int **h_c = (int **)malloc(rows * sizeof(int *));
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < rows; i++) {
        h_c[i] = NULL;
        checkCuda(cudaMalloc((void **)&h_c[i], cols * sizeof(int)), "cudaMalloc(row)");
    }

    // Device copy of the row-pointer table.
    int **d_c = NULL;
    checkCuda(cudaMalloc((void **)&d_c, rows * sizeof(int *)), "cudaMalloc(row table)");
    checkCuda(cudaMemcpy(d_c, h_c, rows * sizeof(int *), cudaMemcpyHostToDevice),
              "cudaMemcpy(row table)");

    // fct indexes rows with threadIdx.x and columns with threadIdx.y,
    // so the block is shaped (rows, cols).
    dim3 d(rows, cols);
    fct<<<1, d>>>(d_c);
    // A launch does not return a status; configuration errors are reported here.
    checkCuda(cudaGetLastError(), "fct launch");

    // Blocking copies issued after the kernel on the default stream; an error
    // raised while the kernel was executing is reported by these calls.
    for (int i = 0; i < rows; i++) {
        checkCuda(cudaMemcpy(cc[i], h_c[i], cols * sizeof(int), cudaMemcpyDeviceToHost),
                  "cudaMemcpy(row)");
    }

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Wait for input so a console window stays open until the user reacts.
    std::getchar();

    // Release every allocation with the function that matches its allocator:
    // cudaFree for cudaMalloc, free for malloc, delete[] for new[].
    for (int i = 0; i < rows; i++) {
        checkCuda(cudaFree(h_c[i]), "cudaFree(row)");
    }
    checkCuda(cudaFree(d_c), "cudaFree(row table)");
    free(h_c);
    for (int i = 0; i < rows; i++) delete[] cc[i];
    delete[] cc;
    return 0;
}

Answer 1

You may actually want to use a flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix that is addressable as arr[x][y] while
// all of its elements live in one contiguous allocation.
int main() {
    const int size = 10;

    // Table of row pointers.
    auto arr = new int*[size];
    // One contiguous buffer that holds every element of the matrix.
    arr[0] = new int[size * size];
    // Each remaining row pointer aims at its slice of that single buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only two allocations were made, so exactly two delete[] calls are
    // needed: the element buffer (owned through arr[0]) and the pointer table.
    delete[] arr[0];
    delete[] arr;
    return 0;
}

This way, you can still access the matrix with the arr[x][y] syntax, but the underlying memory is contiguous, which is not only faster to allocate* but also faster to access, because the cache prefetches the memory surrounding the address you are reading.

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete[] on malloc'ed memory is undefined behaviour. Don't mix new / new[] + delete / delete[] with malloc + free.

C++ Cuda performance for double pointers

Question

1 answers

solution1
0 ACCEPTED 2018-12-30 14:41:54

C++ Cuda performance for double pointers

Question

1 answers

solution1 0 ACCEPTED 2018-12-30 14:41:54

solution1
0 ACCEPTED 2018-12-30 14:41:54