CUDA cudaMemcpy，遇到非法的 memory 访问

Question

I'm fairly new to CUDA and I want to use constant memory, but I'm getting an "an illegal memory access was encountered" error when running the code.我对 CUDA 还很陌生，我想使用常量 memory，但是在运行代码时遇到了 "an illegal memory access was encountered"（非法的 memory 访问）错误。

My kernel looks like this我的 kernel 看起来像这样

/*
 * nonceKernel
 *
 * Every thread builds one 2-byte candidate nonce from (global thread index +
 * size), computes the SHA-1 of device_input_data followed by that nonce, and
 * compares the trailing shaTermLength bytes of the digest against
 * device_sha1_term. A matching thread publishes its digest and nonce through
 * outSha1 / outNonce and clears *finishedFlag.
 *
 *   inLen         - number of bytes of device_input_data fed to SHA-1
 *   shaTermLength - number of trailing digest bytes that must match
 *   outSha1       - receives the 20-byte digest of a matching candidate
 *   outNonce      - receives nonceLen bytes of the matching nonce
 *   nonceLen      - number of nonce bytes hashed and copied out
 *   finishedFlag  - nonzero while searching; written to 0 on a match
 *   mutex         - handed to lock()/unlock() around the result write
 *   size          - offset added to the thread index to form the candidate
 *
 * The launch is expected to be one-dimensional (only the .x components of
 * blockIdx / blockDim / threadIdx are used).
 */
__global__ void nonceKernel(int inLen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int nonceLen, int* finishedFlag, int *mutex, int size)
{
        // A zero flag means a match has already been reported: nothing to do.
        if(*finishedFlag == 0) return;

        // Candidate value for this thread (unsigned arithmetic, as tid + size).
        const unsigned int candidate = (blockIdx.x * blockDim.x + threadIdx.x) + size;

        // Split the candidate into its two low-order bytes, least significant first.
        BYTE nonceBytes[2];
        nonceBytes[0] = candidate & 0x000000FF;
        nonceBytes[1] = (candidate >> 8) & 0x000000FF;

        // SHA-1 over input data followed by the nonce.
        BYTE digest[20];
        CUDA_SHA1 shaCtx;
        cuda_sha1_init(&shaCtx);
        cuda_sha1_update(&shaCtx, device_input_data, inLen);
        cuda_sha1_update(&shaCtx, nonceBytes, nonceLen);
        cuda_sha1_final(&shaCtx, digest);

        // Walk backwards from the end of both buffers; k counts bytes from the tail.
        bool matches = true;
        for(int k = 1; k <= shaTermLength; ++k) {
                if(digest[20 - k] != device_sha1_term[shaTermLength - k])
                        matches = false;
        }

        if(!matches) return;

        // lock()/unlock() are defined elsewhere; presumably they serialize
        // concurrent writers of the result buffers - verify against their definition.
        lock(mutex);
        memcpy(outSha1, digest, 20);            // 20 bytes for sha1
        memcpy(outNonce, nonceBytes, nonceLen); // 2 bytes for nonce
        *finishedFlag = 0;
        unlock(mutex);
}

My intermediary function looks like this:我的中介 function 是这样的：

/*
 * nonceWithCuda - host-side driver for nonceKernel.
 *
 * Selects device 0, allocates a device copy of the finished flag and seeds it
 * from *finishFlag, then launches nonceKernel repeatedly until the flag read
 * back from the device is 0. Returns the status of the last CUDA runtime call
 * that was checked.
 *
 * The lines consisting only of dots are elisions made by the author; the code
 * they stand for (e.g. allocation of gpuSha1Out, gpuNonceOut and mutex) is not
 * visible in this excerpt.
 */
cudaError_t nonceWithCuda(int intlen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int *finishFlag, int nonceLen, int size)
{
        // Device-side buffers; declared without initializers.
        BYTE *gpuSha1Out;
        BYTE *gpuNonceOut;
        int *gpuFinishedFlag;
        cudaError_t cudaStatus;
        int *mutex;

        cudaStatus= cudaSetDevice(0);
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaSetDevice failed! Do you have a cuda gpu installed?");
                goto Error;
        }
        ....
        // Device copy of the "still searching" flag (a single int).
        cudaStatus=cudaMalloc((void**)&gpuFinishedFlag, 1*sizeof(int));
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaMalloc for gpuFinishedFlag failed");
                goto Error;

        }

        // Seed the device flag with the caller's current value.
        cudaStatus=cudaMemcpy(gpuFinishedFlag, finishFlag, sizeof(int), cudaMemcpyHostToDevice);
        if(cudaStatus!=cudaSuccess) {
                fprintf(stderr, "cudamemcpy 0 to gpuFinishedFlag failed!");
                goto Error;
        }
        ....
        // Keep launching until a kernel thread clears the flag.
        while(*finishFlag) {
                // NOTE(review): a kernel launch returns no status and is not followed by
                // cudaGetLastError() here, so a fault inside the kernel is only reported
                // by the next blocking runtime call - the cudaMemcpy below.
                nonceKernel<<<128, 1024>>>(intlen, shaTermLength, gpuSha1Out, gpuNonceOut, nonceLen, gpuFinishedFlag, mutex, size);
                // NOTE(review): the kernel tests (thread index + size), and size advances
                // by only 1 per launch of 128*1024 threads - confirm this is the intended
                // step between launches.
                size++;
                // Blocking copy: reads the flag back and surfaces any pending kernel error.
                cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
                if(cudaStatus!=cudaSuccess) {
                        fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
                        goto Error;
                }
        }
        ......

// Common exit path, also reached by every goto above.
// NOTE(review): the goto after cudaSetDevice arrives here before any of the
// three pointers has been assigned, so cudaFree would receive indeterminate
// values on that path. No cudaFree(mutex) is visible in this excerpt either -
// confirm against the elided code.
Error:
        cudaFree(gpuSha1Out);
        cudaFree(gpuNonceOut);
        cudaFree(gpuFinishedFlag);

        return cudaStatus;
}

Also i'm declaring the constant variables as such:我也这样声明常量变量：

// Each declaration reserves constant memory for a single pointer (BYTE*),
// not for a buffer of BYTE values.
// NOTE(review): main() copies raw data bytes into these symbols with
// cudaMemcpyToSymbol, which overwrites the pointer value itself; an array
// declaration such as `__constant__ BYTE device_input_data[5];` is what would
// hold the bytes.
__constant__ BYTE* device_input_data;
__constant__ BYTE* device_sha1_term;

where BYTE is defined as an unsigned char: typedef unsigned char BYTE;其中 BYTE 定义为 unsigned char：typedef unsigned char BYTE;

And finally the main function.最后是主function。

// Program entry point (excerpt): builds the 5-byte input and the 2-byte target
// suffix on the host, copies both into the __constant__ symbols and calls
// nonceWithCuda. Lines consisting only of dots are elisions made by the
// author; cudaStatus, size, outputSha1Buffer and outputNonceBuffer are
// declared in the elided parts, and the end of the function is not shown.
int main(int argc, char** argv) {

        size_t input_block_size=5; //bytes
        int nonceLen=2;
        int finishedFlag=1;

        ...

        BYTE* inputData = (BYTE*) malloc(input_block_size * sizeof(BYTE)); //input byte buffer
        inputData[0]=0x23; //#
        inputData[1]=0x30; //0
        inputData[2]=0x42; //B
        inputData[3]=0x69; //i
        inputData[4]=0x61; //a
        // Target suffix buffer; note it is sized with nonceLen (2 bytes).
        BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE));
        shaTerm[0]=0x7E;
        shaTerm[1]=0x46;
        // NOTE(review): shaTerm is a BYTE*, so sizeof(shaTerm) is the size of a
        // pointer (typically 8), not the 2 bytes allocated above; shaTermLength
        // therefore does not equal the number of bytes stored in shaTerm.
        int shaTermLength = sizeof(shaTerm)/sizeof(shaTerm[0]);//ouput sha buffer
        // Copies 5 data bytes into the symbol's storage. The symbol is declared as
        // BYTE*, so this writes over the pointer value rather than into a buffer.
        cudaStatus=cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        // NOTE(review): copies shaTermLength bytes out of a buffer that was
        // allocated with nonceLen (2) bytes - reads past the end of shaTerm when
        // shaTermLength is larger than 2.
        cudaStatus=cudaMemcpyToSymbol(device_sha1_term, shaTerm, shaTermLength * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        ...
        // The status returned by nonceWithCuda is not checked in this excerpt.
        nonceWithCuda(input_block_size, shaTermLength, outputSha1Buffer, outputNonceBuffer, &finishedFlag, 2, size);

The error occurs in the while loop of the nonceWithCuda function, when I copy the value back from the GPU to the host, i.e. in this piece of code:错误发生在 nonceWithCuda function 的 while 循环中，即把值从 GPU 复制回主机时，也就是这段代码：

cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
       fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
       goto Error;
}

The output: output：

$ ./nonce_v3
MemcpyToSymbol: no error
MemcpyToSymbol: no error
cudaMemcpy from gpuFinishedFlag failed, with code: an illegal memory access was encountered!

Note that the same code works fine when I don't use __constant__ for those two variables, and I cannot understand why.请注意，当我不对这两个变量使用 __constant__ 时，相同的代码可以正常工作，我不明白为什么。 Can someone point me in the right direction?有人可以指出我正确的方向吗？

Thank you for your help!!!谢谢您的帮助！！！

Answer 1

I am assuming that you want to store the 5 elements of inputData in constant memory.我假设您想将inputData的 5 个元素存储在常量 memory 中。

The line __constant__ BYTE* device_input_data; will reserve constant memory to store a single pointer.这一行 __constant__ BYTE* device_input_data; 将保留常量 memory 以存储单个指针。 It will not reserve constant memory for 5 BYTE values.它不会为 5 个 BYTE 值保留常量 memory。

Then, with然后，与

cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);

the bytes of the pointer itself (i.e. the address it holds) are overwritten with the elements of inputData, so after the transfer the pointer could have a value such as 0x2330426961000000 (the exact value depends on the byte order).该指针自身的字节（即它保存的地址）被 inputData 的元素覆盖，因此在传输之后，指针的值可能类似于 0x2330426961000000（具体取值取决于字节序）。 Most likely, this is not a valid address in device memory.这很可能不是设备 memory 中的有效地址。 This causes the observed memory error when the kernel tries to access this memory location.当 kernel 尝试访问此 memory 位置时，这会导致观察到的 memory 错误。

To fix this, you need to declare the constant memory as a BYTE array of size 5.要解决此问题，您需要将常量 memory 声明为大小为 5 的 BYTE 数组。

__constant__ BYTE device_input_data[5];

CUDA cudaMemcpy，遇到非法的 memory 访问

问题描述

1 个解决方案

解决方案1
0 2021-12-15 20:47:19

CUDA cudaMemcpy，遇到非法的 memory 访问

问题描述

1 个解决方案

解决方案1 0 2021-12-15 20:47:19

解决方案1
0 2021-12-15 20:47:19