CUDA-使用cudaMallocPitch和cudaMemcpy2D，錯誤：InvalidValue，InvalidPitchValue

Question

我正在嘗試在 CUDA 中使用二維陣列，但過程相當痛苦。錯誤訊息如標題所示，發生在 cudaMemcpy2D 的呼叫處。我想這個問題在有經驗的人眼中應該很明顯。預先感謝您提供的任何幫助——我的進度已經超前於班上的課程（班上目前才剛學到指標）。

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>
#pragma comment (lib, "cudart")

/* Program purpose: add two 100 x 100 float matrices element-wise on the GPU,
   storing the result back into the first matrix. */

// Host-side copies of the two operands; matrix1_host also receives the result.
float matrix1_host[100][100];
float matrix2_host[100][100];

// Device-side buffers for the two operands (allocated in initCuda).
float* matrix1_device;
float* matrix2_device;  
// Row stride in bytes reported by cudaMallocPitch. A single variable is shared
// by both allocations, so it holds whatever the most recent call wrote.
size_t pitch;
// Status of the most recent CUDA runtime call; overwritten by every call.
cudaError_t err;

/*
 * Adds matrix2 into matrix1 in place: matrix1[r][c] += matrix2[r][c].
 *
 * Launch layout: one 1-D block. Each thread handles one full row, selected by
 * threadIdx.x only (blockIdx is not consulted), so the block must contain at
 * least 100 threads. Threads whose index is past the last row exit at once.
 * No shared memory is used.
 *
 * matrix1_device  base address of the first matrix; updated in place
 * matrix2_device  base address of the second matrix; only read
 * pitch           distance in BYTES between the starts of consecutive rows,
 *                 applied to both matrices
 */
__global__ void addMatrix(float* matrix1_device,float* matrix2_device, size_t pitch){
    const unsigned int kDim = 100;   // rows and columns per matrix

    const unsigned int row = threadIdx.x;
    // Guard against launches with more threads than rows; without it the
    // surplus threads would read and write past the end of both buffers.
    if (row >= kDim) {
        return;
    }

    // The pitch is a byte count, so step through char* before converting back
    // to float* to reach the first element of this thread's row.
    float* rowofMat1 = (float*)((char*)matrix1_device + row * pitch);
    float* rowofMat2 = (float*)((char*)matrix2_device + row * pitch);

    // Within a row the elements are contiguous, so plain indexing works.
    for (unsigned int col = 0; col < kDim; ++col) {
        rowofMat1[col] += rowofMat2[col];
    }
}

// Allocates pitched device storage for both matrices and uploads the host
// data. Every status code is written to the global `err`, so only the status
// of the last call survives.
//
// NOTE(review): this function is where the reported InvalidValue /
// InvalidPitchValue errors originate. The suspect lines are flagged below and
// the code itself is intentionally left unchanged.
void initCuda(){
    // NOTE(review): cudaMallocPitch takes the ADDRESS of the device pointer so
    // it can store the allocation there, i.e. (void**)&matrix1_device. Here the
    // pointer's own value (null for a zero-initialised global) is cast and
    // passed instead -- confirm against the API reference and fix.
    err = cudaMallocPitch((void**)matrix1_device, &pitch, 100 * sizeof(float), 100);
    // NOTE(review): same missing '&' as above. This call also overwrites
    // `pitch`; both allocations will normally report the same value, but a
    // separate pitch variable per matrix would not rely on that.
    err = cudaMallocPitch((void**)matrix2_device, &pitch, 100 * sizeof(float), 100); 
    //err = cudaMemcpy(matrix1_device, matrix1_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
    //err = cudaMemcpy(matrix2_device, matrix2_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
    // NOTE(review): cudaMemcpy2D's 2nd argument is the destination pitch and
    // its 4th the source pitch. The destination here is the pitched device
    // buffer (should be `pitch`) and the source is the tightly packed host
    // array (should be 100*sizeof(float)); the two arguments appear swapped.
    err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
    err = cudaMemcpy2D(matrix2_device, 100*sizeof(float), matrix2_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
}

// Fills both host matrices with test data: element [i][j] of matrix1_host and
// element [j][i] of matrix2_host are each set to i + j (as a float).
void populateArrays(){
    for (int i = 0; i < 100; ++i) {
        for (int j = 0; j < 100; ++j) {
            const float sum = static_cast<float>(i) + j;
            matrix1_host[i][j] = sum;
            // Transposed index, same value.
            matrix2_host[j][i] = sum;
        }
    }
}

// Launches addMatrix on the device buffers and copies the summed matrix back
// into matrix1_host. The status of the copy is left in the global `err`.
void runCuda(){
    // One block of 100 threads: addMatrix assigns one matrix row per thread.
    dim3 dimBlock ( 100 );
    dim3 dimGrid ( 1 );
    // The kernel must step between rows using the stride that cudaMallocPitch
    // reported (the global `pitch`), not the packed row width of
    // 100*sizeof(float); the allocator may pad each row, in which case the
    // packed width would address the wrong elements.
    addMatrix<<<dimGrid, dimBlock>>>(matrix1_device, matrix2_device, pitch);
    // Device -> host: the destination is the tightly packed host array (row
    // width in bytes as its pitch), the source is the pitched device buffer.
    err = cudaMemcpy2D(matrix1_host, 100*sizeof(float), matrix1_device, pitch, 100*sizeof(float),100, cudaMemcpyDeviceToHost);
}

// Releases both device buffers, then resets the device. Each status is stored
// in the global `err`, so only the reset's status remains afterwards.
void cleanCuda(){
    float* const deviceBuffers[2] = { matrix1_device, matrix2_device };
    for (int i = 0; i < 2; ++i) {
        err = cudaFree(deviceBuffers[i]);
    }

    err = cudaDeviceReset();
}


// Entry point: fills the host matrices, uploads them, runs the addition on the
// GPU, releases device resources, then prints the runtime's last error string
// and waits for a key press.
int main(){
    populateArrays();
    initCuda();
    runCuda();
    cleanCuda();
    // std::endl flushes the stream so the status text is visible before the
    // external "pause" command takes over the console.
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    system("pause");
    return 0;
}

Answer 1

首先，一般來說，您應該為 matrix1 和 matrix2 各自準備一個獨立的 pitch（行間距）變數。在這個例子中，兩次呼叫 cudaMallocPitch 所回傳的值會相同，但在一般情況下未必如此。

在 cudaMemcpy2D 這一行中，呼叫的第二個參數是目標（destination）的 pitch。它就是您對該目標矩陣（即第一個參數）呼叫 cudaMallocPitch 時所回傳的 pitch 值。

第四個參數是來源（source）的 pitch。由於來源是以一般的主機端配置方式取得的，它沒有額外的填充，pitch 就等於每一列以位元組計的寬度。

也就是說，您把第二個和第四個參數寫反了。

所以代替這個：

err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);

嘗試這個：

err = cudaMemcpy2D(matrix1_device, pitch, matrix1_host, 100*sizeof(float), 100*sizeof(float), 100, cudaMemcpyHostToDevice);

第二次呼叫 cudaMemcpy2D 也要做同樣的修改。第三次呼叫其實沒有問題：因為它的複製方向相反，來源矩陣和目標矩陣對調了，所以兩個 pitch 參數剛好與各自的矩陣正確對應。

CUDA-使用cudaMallocPitch和cudaMemcpy2D，錯誤：InvalidValue，InvalidPitchValue

問題描述

1 個解決方案

解決方案1
3 已采納 2013-03-15 04:53:42

CUDA-使用cudaMallocPitch和cudaMemcpy2D，錯誤：InvalidValue，InvalidPitchValue

問題描述

1 個解決方案

解決方案1 3 已采納 2013-03-15 04:53:42

解決方案1
3 已采納 2013-03-15 04:53:42