Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3

Question

I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D among many others including NVIDIA docs and I can't get cudaMallocPitch to work together with cudaMemcpy2D.

I need to copy a very big matrix in an array format (Matrix[width*height]) along with a simple array to perform Matrix * vector operations. It is not optional for me to use cudaMallocPitch in order to avoid conflicts and have a better performance.

So, I started by just trying to copy the matrix (vector in my case) to the device and check if it was correctly copied but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.

What can I do to fix this?

#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while (idx < N)
    {
        double e = *(double *)(((char *) A + idx * mpitch) + N);
        printf("(%f)", e);
    }
}
int main()
{
    int N = 1500;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }
    cudaMallocPitch(&d_A, &pitch,  sizeof(double) * N, 1);
    cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
    cudaFree(d_A);
    delete [] A;
    return 0;
}

Answer 1

Error checking can make a big difference in debugging. You should always use it before coming here.

It wasn't clear if you wanted a row or column vector ie a matrix of [1xN] or [Nx1]

I've added an explanation on Talomnies suggestion, but first the 'working slabs of code'

Here's [Nx1]

#include <cstdio>
#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx>=N) return;
    double e = *(double *)(((char *) A + idx * mpitch));
    printf("(%f)", e);

}
int main()
{
    int N = 15;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }

    cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
    if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
    if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);

    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;

    cudaFree(d_A);
    delete [] A;
    return 0;
}

[1xN]:

#include <cstdio>
#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx>=N) return;
    int row=0;//only one row

    double *row_ptr = (double *)( (char *) (A + mpitch * row) );
    double e = row_ptr[idx];
    printf("(%f)", e);

}
int main()
{
    int N = 15;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }

    cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
    if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
    if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);

    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;

    cudaFree(d_A);
    delete [] A;
    return 0;
}

Explanation

Firslty, Error Handling:

Considering how easy error handling is in CUDA there isn't a good excuse not to put it in.

cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.

The reoccurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so lets walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).

Allocating : Your allocation was written out as

cudaMallocPitch(&d_A, &pitch,  sizeof(double) * N, 1);

This is correct for the row vector as the API is cudaMallocPitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) However if we would like to do a column vector correct call is

cudaMallocPitch(&d_A, &pitch, sizeof(double), N);

Accessing : For accessing you were mixing up accessing a row, and accessing an element in the row.

double e = *(double *)(((char *) A + idx * mpitch) + N);

Once again stick to the documentation. The API documentation for cudaMallocPitch includes

T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

for us this translates into

int column=0;
double element=(double*) ((char*)A + idx * mpitch) + column;

I've used column = 0 for completeness since we do not have more than one column.

Copying :

cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);

For this case this is correct. API for cudaMemcpy2D is

cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);

Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3

Question

1 answers

solution1
2 ACCPTED 2015-05-30 16:18:38

Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3

Question

1 answers

solution1 2 ACCPTED 2015-05-30 16:18:38

solution1
2 ACCPTED 2015-05-30 16:18:38