I finally succeded in saving in memory a double pointer in order to use it in cuda.(The code below), but i see that is less performent than if i would flatten the matrix,which is not that great.
Some suggestions to save some time/memory?
I really want to use dynamic 2d array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>
__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}
int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);
dim3 d(2, 2);
fct << <1, d >> > (d_c);
for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%d\n", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete[] h_c;
delete[] d_c;
}
You may actually want to use flattened matrix with some pointer tricks:
int main() {
const int size = 10;
auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}
This way, you can still access the matrix with arr[x][y]
syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).
* It is faster to allocate size * size
memory once, rather than allocating size
times size
elements.
Side note: using delete[]
on a malloc
ed memory is undefined behaviour . Don't mix new
/ new[]
+ delete
/ delete[]
with malloc
+ free
.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.