简体   繁体   English

使用 cuda 将 bgr 转换为 yuv420p 的问题

[英]Problem of converting bgr to yuv420p with cuda

I need to convert image from bgr to yuv420p and I first use OpenCV to do so.我需要将图像从bgr 转换为 yuv420p ,我首先使用 OpenCV 这样做。

// Load the source image from disk; the conversion below treats it as BGR.
Mat img = imread("1.bmp");
// Destination for the converted image.
Mat yuvImg;
// Convert BGR to YUV I420 (the yuv420p layout this post is about).
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);

The result is correct.结果是正常的。 However, my image is very large (about 6400 × 2000 pixels), and converting it from BGR to YUV420p with OpenCV's cvtColor takes too much time.但是,我的图像太大,它的像素几乎是 6400 * 2000。我发现使用 opencv api cvtcolor 将 bgr 转换为 yuv420p 花费太多时间

Then I decide to convert it myself and speed it with cuda.然后我决定自己转换它并用 cuda 加速它。

Here is code in cpu :这是cpu中的代码:

// Converts a packed 8-bit BGR image to planar YUV 4:2:0 (Y plane, then U plane,
// then V plane) on the CPU.
//
//   yuv420p - destination buffer; width*height*3/2 bytes is always sufficient
//   bgr     - source pixels, 3 bytes per pixel in B,G,R order, rows tightly packed
//   width   - image width in pixels
//   height  - image height in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block (no
// averaging). The chroma planes are (width/2) x (height/2) samples each, so for
// an odd width or height the trailing column/row contributes luma only. This
// keeps every write inside width*height*3/2 bytes; for even dimensions the
// output is exactly the classic I420 layout.
//
// Does nothing when a pointer is NULL or a dimension is not positive.
void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr == NULL || width <= 0 || height <= 0)
        return;

    const int frameSize = width * height;
    const int chromaWidth = width / 2;
    const int chromaHeight = height / 2;
    const int chromaSize = chromaWidth * chromaHeight;

    unsigned char* yPlane = yuv420p;
    unsigned char* uPlane = yuv420p + frameSize;
    unsigned char* vPlane = uPlane + chromaSize;

    for (int i = 0; i < height; i++)
    {
        // Only even rows that start a complete 2x2 block carry chroma.
        const bool chromaRow = (i % 2 == 0) && (i / 2 < chromaHeight);

        for (int j = 0; j < width; j++)
        {
            const unsigned char* px = bgr + (i * width + j) * 3;
            const int B = px[0];
            const int G = px[1];
            const int R = px[2];

            // Fixed-point BT.601-style luma; +128 rounds before the shift.
            const int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
            yPlane[i * width + j] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));

            if (chromaRow && (j % 2 == 0) && (j / 2 < chromaWidth))
            {
                const int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
                const int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;

                // Explicit index instead of running counters, so the write
                // position can never drift outside the chroma planes.
                const int c = (i / 2) * chromaWidth + (j / 2);
                uPlane[c] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
                vPlane[c] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
            }
        }
    }
}

I test the code bgr_to_yuv420p (...) and the result is also normal.我测试了代码bgr_to_yuv420p (...),结果也正常。

Then I speed it up with cuda.然后我用 cuda 加速它。

Here is all my code include kernel function and test function.这是我所有的代码,包括 kernel function 和测试 function。

#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp" 
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

// Kernel: converts packed 8-bit BGR pixels to a Y plane followed by chroma bytes.
// One thread handles one pixel; main launches it on a 2D grid of 32x32 blocks.
//   d_in      - input pixels; .x/.y/.z are read as B/G/R
//   d_out     - output buffer; main allocates imgwidth*imgheight*3/2 bytes for it
//   imgheight - image height in pixels
//   imgwidth  - image width in pixels
// NOTE(review): this is the version under discussion whose chroma output is
// wrong; the inline notes below point at the differences from the CPU code.
__global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{

    // Global pixel coordinates handled by this thread.
    int col_num = blockIdx.x*blockDim.x+threadIdx.x;
    int row_num = blockIdx.y*blockDim.y+threadIdx.y;

    // Threads of the partial blocks at the right/bottom edge do nothing.
    if ((row_num < imgheight) && (col_num < imgwidth))
    {
//        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
        // Linear pixel index in the row-major input image.
        int global_offset = row_num*imgwidth+col_num;

        int r,g,b;
        r = int(d_in[global_offset].z);
        g = int (d_in[global_offset].y);
        b = int (d_in[global_offset].x);


        // Luma. NOTE(review): unlike the CPU code there is no +128 rounding
        // term before the shift and no clamping to [0,255].
        d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
        // Chroma is written only by threads whose block-local x and y are even.
        if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
            // NOTE(review): the chroma row is addressed with the full row_num, so
            // for rows in the lower half of the image this offset lies beyond the
            // imgwidth*imgheight*3/2 bytes that main allocates for d_out.
            int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
            // NOTE(review): two adjacent bytes are written per sample (interleaved
            // storage, not separate U and V planes). The first byte uses the
            // coefficients the CPU code uses for V, the second those for U.
            d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
            d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

        }

    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Loads "1.bmp", converts it on the GPU with bgr2yuv420p, then shows the result
// and writes it to "cuda.bmp".
// Returns 0 on success and 1 when the image cannot be used or any CUDA call fails.
int main(void)
{

    Mat srcImage = imread("1.bmp");
    // imread signals failure with an empty Mat instead of throwing.
    if (srcImage.empty() || srcImage.type() != CV_8UC3)
    {
        std::cerr << "could not load 1.bmp as an 8-bit 3-channel image" << std::endl;
        return 1;
    }
    // The upload below is a single flat copy, so rows must be tightly packed.
    if (!srcImage.isContinuous())
        srcImage = srcImage.clone();

    imshow("srcImage", srcImage);
    const uint imgheight = srcImage.rows;
    const uint imgwidth = srcImage.cols;

    // 4:2:0 subsampling needs complete 2x2 blocks; with an odd height the host
    // Mat below would also be smaller than the device-to-host copy.
    if ((imgheight & 1u) || (imgwidth & 1u))
    {
        std::cerr << "image width and height must both be even" << std::endl;
        return 1;
    }

    const size_t inBytes = (size_t)imgheight * imgwidth * sizeof(uchar3);
    const size_t outBytes = (size_t)imgheight * imgwidth * 3 / 2;

    Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));

    //input and output
    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;

    // allocate device memory and copy the image from cpu to gpu
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_in, inBytes), "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void**)&d_out, outBytes), "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, srcImage.data, inBytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok)
    {
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        //run kernel function
        bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

        // Launch-configuration errors show up in cudaGetLastError(); faults inside
        // the kernel show up at the next synchronizing call.
        ok = cudaSucceeded(cudaGetLastError(), "bgr2yuv420p launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "bgr2yuv420p execution")
          //copy the converted image from gpu to cpu
          && cudaSucceeded(cudaMemcpy(nv12Image.data, d_out, outBytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    if (ok)
    {
        imshow("nv12",nv12Image);
        imwrite("cuda.bmp",nv12Image);
    }

    // cudaFree accepts a null pointer, so this is safe after a partial failure.
    cudaFree(d_in);
    cudaFree(d_out);

    return ok ? 0 : 1;

}

The code with cuda can run but the result is not normal.带有cuda的代码可以运行但结果不正常。 Y of YUV420p is normal but there is something wrong with U and V. I think the reason is here in __global__ void bgr2yuv420p(...) YUV420p 的 Y 是正常的,但是 U 和 V 有问题。我认为原因在__global__ void bgr2yuv420p(...)

if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

I try a lot but still cannot solve it.我尝试了很多但仍然无法解决它。 And I find little code about converting rgb to yuv420p, More codes are about converting yuv420p to rgb.我发现很少有关于将 rgb 转换为 yuv420p 的代码,更多的代码是关于将 yuv420p 转换为 rgb 的。 So I want to know is somebody running into the same question or giving me some advice?所以我想知道是否有人遇到同样的问题或给我一些建议?

Thanks Robert Crovella.Here is my update-1 .谢谢罗伯特·克罗维拉。这是我的update-1

I follow Robert Crovella's advice and change the kernel function like this:我按照 Robert Crovella 的建议更改 kernel function 如下:

// Kernel (update-1): converts packed 8-bit BGR pixels to a Y plane followed by
// chroma bytes. One thread handles one pixel.
//   d_in      - input pixels; .x/.y/.z are read as B/G/R
//   d_out     - output buffer; receives imgwidth*imgheight luma bytes, then chroma
//   imgheight - image height in pixels
//   imgwidth  - image width in pixels
// NOTE(review): compared with the first version, only the chroma row index
// changed (row_num>>1); the remaining differences from the CPU code are noted inline.
    __global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                                   uint imgheight, uint imgwidth)
    {

        // Global pixel coordinates handled by this thread.
        int col_num = blockIdx.x*blockDim.x+threadIdx.x;
        int row_num = blockIdx.y*blockDim.y+threadIdx.y;

        // Threads of the partial blocks at the right/bottom edge do nothing.
        if ((row_num < imgheight) && (col_num < imgwidth))
        {
    //        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
            // Linear pixel index in the row-major input image.
            int global_offset = row_num*imgwidth+col_num;

            int r,g,b;
            r = int(d_in[global_offset].z);
            g = int (d_in[global_offset].y);
            b = int (d_in[global_offset].x);


            // Luma. NOTE(review): unlike the CPU code there is no +128 rounding
            // term before the shift and no clamping to [0,255].
            d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
            // Chroma is written only by threads whose block-local x and y are even.
            if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                // Chroma row = row_num/2, with a row stride of imgwidth bytes.
                int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
                // NOTE(review): two adjacent bytes are written per sample
                // (interleaved storage), whereas the CPU code writes separate U
                // and V planes. The first byte uses the coefficients the CPU code
                // uses for V, the second those for U.
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

        }
    }

I test the new kernel with excitement,but the result is also not normal.我兴奋地测试了新的kernel,但结果也不正常。 Here is my result image with the updated kernel function.这是我更新后的 kernel function 的结果图像。 yuv420p image converted by myself自己转换的yuv420p图片

Then the normal result image converted by opencv api is here.那么opencv api转换的正常结果图就到这里了。 yuv420p image converted by opencv api由opencv api转换的yuv420p图像

As we can see, the difference between the two images is U and V. I have already changed the index of U and V in kernel function, ie可以看到,两张图的区别是U和V。我已经在kernel function中改变了U和V的索引,即

if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

I think it will work but it does not.我认为它会起作用,但它不会。 Any other advice?还有什么建议吗? Robert Crovella罗伯特·克罗维拉

Edit: The solution is Robert Crovella's latest answer.编辑:解决方案是罗伯特·克罗维拉的最新答案。 I have double checked it and it is really perfect.我已经仔细检查了它,它真的很完美。

There are a variety of issues:有各种各样的问题:

  • the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical.在 CPU 和 GPU 代码之间将 R,G,B 转换为 Y,U,V 的计算并不相同。 Yes, this matters.是的,这很重要。
  • Your CPU code has planar Y,U,V storage.您的 CPU 代码具有平面 Y、U、V 存储。 That means Y has its own plane, U has its own plane, and V has its own plane.这意味着 Y 有自己的平面,U 有自己的平面,V 有自己的平面。 Your GPU code produces semi-planar (NV12) output.您的 GPU 代码是半平面 (NV12) 格式。 That means Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the outputs of those two codes could never match.这意味着 Y 有自己的平面,并且 U、V 在一个平面上交错:UVUVUVUVUVUV.... 显然,这两个代码的 output 永远不可能完全匹配。
  • IMO, there is no need to drag OpenCV into this. IMO,无需将 OpenCV 拖入其中。
  • Your UV offset calculation in the kernel (GPU) code was broken.您在 kernel (GPU) 代码中的 UV 偏移计算已损坏。 The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point, it is not correct to use row_num*imgwidth to index by row into the UV planar region. imgwidth*imgheight偏移让您越过 Y 区域(正确),但从那时起,使用row_num*imgwidth索引到 UV 平面区域是不正确的。 You do not have that many rows in the UV planar region, you only have half as many rows.在 UV 平面区域中没有那么多行,只有一半的行。
  • In your GPU kernel, you had U,V ordering reversed, you were effectively doing VUVUVUVU...在您的 GPU kernel 中,您的 U、V 顺序颠倒了,您实际上是在做 VUVUVUVU ...

My recommendation would be to start by harmonizing the calculation differences and storage order/format.我的建议是首先协调计算差异和存储顺序/格式。 The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:以下代码解决了上述问题,并为我提供了 CPU 和 GPU 代码之间的匹配结果:

$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we 
// are using the same conversion calculations in host and device
// Fixed-point luma from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
  int luma = 16 + ((66 * R + 129 * G + 25 * B + 128) >> 8);
  if (luma < 0) luma = 0;
  else if (luma > 255) luma = 255;
  return (unsigned char)luma;
}
// Fixed-point U (blue-difference chroma) from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ int bgr2u(int R, int G, int B){
  int cb = 128 + ((-38 * R - 74 * G + 112 * B + 128) >> 8);
  if (cb < 0) cb = 0;
  else if (cb > 255) cb = 255;
  return (unsigned char)cb;
}
// Fixed-point V (red-difference chroma) from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ int bgr2v(int R, int G, int B){
  int cr = 128 + ((112 * R - 94 * G - 18 * B + 128) >> 8);
  if (cr < 0) cr = 0;
  else if (cr > 255) cr = 255;
  return (unsigned char)cr;
}

// CPU reference: converts a packed 8-bit BGR image to semi-planar YUV 4:2:0
// (a Y plane followed by one plane of interleaved U,V byte pairs). Despite the
// "420p" in the name, this version does not write separate U and V planes.
//
//   yuv420p - destination buffer; width*height*3/2 bytes is always sufficient
//   bgr     - source pixels, 3 bytes per pixel in B,G,R order, rows tightly packed
//   width   - image width in pixels
//   height  - image height in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block. Each
// chroma row occupies `width` bytes. For an odd width or height the trailing
// column/row contributes luma only, which keeps every write inside
// width*height*3/2 bytes; for even dimensions the output is unchanged.
//
// Does nothing when a pointer is NULL or a dimension is not positive.
void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr == NULL || width <= 0 || height <= 0)
        return;

    const int frameSize = width * height;
    const int chromaRows = height / 2;

    for (int i = 0; i < height; i++)
    {
        // Only even rows that start a complete 2x2 block carry chroma.
        const bool chromaRow = (i % 2 == 0) && (i / 2 < chromaRows);

        for (int j = 0; j < width; j++)
        {
            const unsigned char* px = bgr + (i * width + j) * 3;
            const int B = px[0];
            const int G = px[1];
            const int R = px[2];

            //BGR to YUV
            yuv420p[i * width + j] = bgr2y(R,G,B);

            // j + 1 < width guarantees the V byte stays in this chroma row.
            if (chromaRow && (j % 2 == 0) && (j + 1 < width))
            {
                const int uvIndex = frameSize + (i / 2) * width + j;
                yuv420p[uvIndex] = bgr2u(R,G,B);
                yuv420p[uvIndex + 1] = bgr2v(R,G,B);
            }
        }
    }
}

// Kernel: converts packed 8-bit BGR pixels to semi-planar YUV 4:2:0
// (a Y plane followed by interleaved U,V byte pairs with a row stride of imgwidth).
//
// Launch layout: 2D grid, one thread per pixel; x covers columns and y covers
// rows. Any block shape works as long as the grid covers the whole image.
//
//   d_in      - imgwidth*imgheight input pixels; .x/.y/.z are read as B/G/R
//   d_out     - output buffer of at least imgwidth*imgheight*3/2 bytes
//   imgheight - image height in pixels
//   imgwidth  - image width in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block; for an
// odd width or height the trailing column/row contributes luma only, so no
// write can land outside imgwidth*imgheight*3/2 bytes.
__global__ void bgr2yuv420p(const uchar3 * d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{
    const uint col_num = blockIdx.x*blockDim.x+threadIdx.x;
    const uint row_num = blockIdx.y*blockDim.y+threadIdx.y;

    // Threads of the partial blocks at the right/bottom edge have no pixel.
    if ((row_num >= imgheight) || (col_num >= imgwidth))
        return;

    const uint global_offset = row_num*imgwidth+col_num;

    // Read the pixel once instead of three separate global loads.
    const uchar3 pixel = d_in[global_offset];
    const int b = pixel.x;
    const int g = pixel.y;
    const int r = pixel.z;

    d_out[global_offset] = bgr2y(r,g,b);

    // Parity is taken from the global coordinates, so the selection does not
    // depend on the block dimensions being even.
    const bool top_left = ((col_num & 1u) == 0) && ((row_num & 1u) == 0);
    // Skip the incomplete 2x2 blocks of an odd-sized image.
    const bool complete_block = (col_num + 1 < imgwidth) && ((row_num >> 1) < (imgheight >> 1));

    if (top_left && complete_block) {
        const uint uv_offset = imgwidth*imgheight + (row_num>>1)*imgwidth + col_num;
        d_out[uv_offset] = bgr2u(r,g,b);
        d_out[uv_offset+1] = bgr2v(r,g,b);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Self-test: converts a random 1500x1000 BGR image on the CPU and on the GPU and
// compares the two results byte for byte.
// Returns 0 when they match, 1 on the first mismatch or on any CUDA error.
int main(void)
{

    const uint imgheight = 1000;
    const uint imgwidth = 1500;
    const uint pixels = imgheight*imgwidth;
    const uint yuvBytes = pixels*3/2;

    //input and output
    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;
    uchar3 *idata = new uchar3[pixels];
    unsigned char *odata = new unsigned char[yuvBytes];
    unsigned char *cdata = new unsigned char[yuvBytes];

    // Every pixel gets its own random value. A uniform image would hide indexing
    // errors, because any chroma sample would then equal any other.
    for (uint i = 0; i < pixels; i++){
      uchar3 pix;
      pix.x = (rand()%30)+40;
      pix.y = (rand()%30)+40;
      pix.z = (rand()%30)+40;
      idata[i] = pix;}

    // CPU reference result
    bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);

    // allocate device memory and copy the image from cpu to gpu
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_in, pixels*sizeof(uchar3)), "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void**)&d_out, yuvBytes*sizeof(unsigned char)), "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, idata, pixels*sizeof(uchar3), cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok)
    {
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        //run kernel function
        bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

        // Launch-configuration errors show up in cudaGetLastError(); faults inside
        // the kernel show up at the next synchronizing call.
        ok = cudaSucceeded(cudaGetLastError(), "bgr2yuv420p launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "bgr2yuv420p execution")
          //copy the result from gpu to cpu
          && cudaSucceeded(cudaMemcpy(odata, d_out, yuvBytes*sizeof(unsigned char), cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    // Compare against the CPU reference; stop at the first difference.
    if (ok)
    {
        for (uint i = 0; i < yuvBytes; i++)
        {
            if (odata[i] != cdata[i])
            {
                std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Release everything on every path, including mismatch and CUDA failure.
    cudaFree(d_in);
    cudaFree(d_out);
    delete[] idata;
    delete[] odata;
    delete[] cdata;

    return ok ? 0 : 1;

}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

Any time you are having trouble with a CUDA code, I recommend每当您遇到 CUDA 代码问题时,我建议

  1. Proper CUDA error checking正确的 CUDA 错误检查
  2. Running your code with cuda-memcheck使用cuda-memcheck运行代码

EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):编辑:根据附加评论,这里是上述代码的一个版本,它逐字使用 OP 提供的 CPU 代码,并提供了 CUDA kernel 生成 YUV 平面存储(而不是半平面存储):

#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// Fixed-point luma from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
  int luma = 16 + ((66 * R + 129 * G + 25 * B + 128) >> 8);
  if (luma < 0) luma = 0;
  else if (luma > 255) luma = 255;
  return (unsigned char)luma;
}
// Fixed-point U (blue-difference chroma) from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ int bgr2u(int R, int G, int B){
  int cb = 128 + ((-38 * R - 74 * G + 112 * B + 128) >> 8);
  if (cb < 0) cb = 0;
  else if (cb > 255) cb = 255;
  return (unsigned char)cb;
}
// Fixed-point V (red-difference chroma) from 8-bit R,G,B, saturated to [0,255].
__host__ __device__ int bgr2v(int R, int G, int B){
  int cr = 128 + ((112 * R - 94 * G - 18 * B + 128) >> 8);
  if (cr < 0) cr = 0;
  else if (cr > 255) cr = 255;
  return (unsigned char)cr;
}

// CPU reference: converts a packed 8-bit BGR image to semi-planar YUV 4:2:0
// (a Y plane followed by one plane of interleaved U,V byte pairs).
//
//   yuv420p - destination buffer; width*height*3/2 bytes is always sufficient
//   bgr     - source pixels, 3 bytes per pixel in B,G,R order, rows tightly packed
//   width   - image width in pixels
//   height  - image height in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block. Each
// chroma row occupies `width` bytes. For an odd width or height the trailing
// column/row contributes luma only, which keeps every write inside
// width*height*3/2 bytes; for even dimensions the output is unchanged.
//
// Does nothing when a pointer is NULL or a dimension is not positive.
void bgr_to_yuv420sp(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr == NULL || width <= 0 || height <= 0)
        return;

    const int frameSize = width * height;
    const int chromaRows = height / 2;

    for (int i = 0; i < height; i++)
    {
        // Only even rows that start a complete 2x2 block carry chroma.
        const bool chromaRow = (i % 2 == 0) && (i / 2 < chromaRows);

        for (int j = 0; j < width; j++)
        {
            const unsigned char* px = bgr + (i * width + j) * 3;
            const int B = px[0];
            const int G = px[1];
            const int R = px[2];

            //BGR to YUV
            yuv420p[i * width + j] = bgr2y(R,G,B);

            // j + 1 < width guarantees the V byte stays in this chroma row.
            if (chromaRow && (j % 2 == 0) && (j + 1 < width))
            {
                const int uvIndex = frameSize + (i / 2) * width + j;
                yuv420p[uvIndex] = bgr2u(R,G,B);
                yuv420p[uvIndex + 1] = bgr2v(R,G,B);
            }
        }
    }
}
// CPU reference: converts a packed 8-bit BGR image to planar YUV 4:2:0
// (Y plane, then U plane, then V plane).
//
//   yuv420p - destination buffer; width*height*3/2 bytes is always sufficient
//   bgr     - source pixels, 3 bytes per pixel in B,G,R order, rows tightly packed
//   width   - image width in pixels
//   height  - image height in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block (no
// averaging). The chroma planes are (width/2) x (height/2) samples each, so for
// an odd width or height the trailing column/row contributes luma only. This
// keeps every write inside width*height*3/2 bytes; for even dimensions the
// output is exactly the classic I420 layout.
//
// Does nothing when a pointer is NULL or a dimension is not positive.
void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr == NULL || width <= 0 || height <= 0)
        return;

    const int frameSize = width * height;
    const int chromaWidth = width / 2;
    const int chromaHeight = height / 2;
    const int chromaSize = chromaWidth * chromaHeight;

    unsigned char* yPlane = yuv420p;
    unsigned char* uPlane = yuv420p + frameSize;
    unsigned char* vPlane = uPlane + chromaSize;

    for (int i = 0; i < height; i++)
    {
        // Only even rows that start a complete 2x2 block carry chroma.
        const bool chromaRow = (i % 2 == 0) && (i / 2 < chromaHeight);

        for (int j = 0; j < width; j++)
        {
            const unsigned char* px = bgr + (i * width + j) * 3;
            const int B = px[0];
            const int G = px[1];
            const int R = px[2];

            // Fixed-point BT.601-style luma; +128 rounds before the shift.
            const int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
            yPlane[i * width + j] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));

            if (chromaRow && (j % 2 == 0) && (j / 2 < chromaWidth))
            {
                const int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
                const int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;

                // Explicit index instead of running counters, so the write
                // position can never drift outside the chroma planes.
                const int c = (i / 2) * chromaWidth + (j / 2);
                uPlane[c] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
                vPlane[c] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
            }
        }
    }
}
// Kernel: converts packed 8-bit BGR pixels to semi-planar YUV 4:2:0
// (a Y plane followed by interleaved U,V byte pairs with a row stride of imgwidth).
//
// Launch layout: 2D grid, one thread per pixel; x covers columns and y covers
// rows. Any block shape works as long as the grid covers the whole image.
//
//   d_in      - imgwidth*imgheight input pixels; .x/.y/.z are read as B/G/R
//   d_out     - output buffer of at least imgwidth*imgheight*3/2 bytes
//   imgheight - image height in pixels
//   imgwidth  - image width in pixels
//
// Chroma is taken from the top-left pixel of every complete 2x2 block; for an
// odd width or height the trailing column/row contributes luma only, so no
// write can land outside imgwidth*imgheight*3/2 bytes.
__global__ void bgr2yuv420sp(const uchar3 * d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{
    const uint col_num = blockIdx.x*blockDim.x+threadIdx.x;
    const uint row_num = blockIdx.y*blockDim.y+threadIdx.y;

    // Threads of the partial blocks at the right/bottom edge have no pixel.
    if ((row_num >= imgheight) || (col_num >= imgwidth))
        return;

    const uint global_offset = row_num*imgwidth+col_num;

    // Read the pixel once instead of three separate global loads.
    const uchar3 pixel = d_in[global_offset];
    const int b = pixel.x;
    const int g = pixel.y;
    const int r = pixel.z;

    d_out[global_offset] = bgr2y(r,g,b);

    // Parity is taken from the global coordinates, so the selection does not
    // depend on the block dimensions being even.
    const bool top_left = ((col_num & 1u) == 0) && ((row_num & 1u) == 0);
    // Skip the incomplete 2x2 blocks of an odd-sized image.
    const bool complete_block = (col_num + 1 < imgwidth) && ((row_num >> 1) < (imgheight >> 1));

    if (top_left && complete_block) {
        const uint uv_offset = imgwidth*imgheight + (row_num>>1)*imgwidth + col_num;
        d_out[uv_offset] = bgr2u(r,g,b);
        d_out[uv_offset+1] = bgr2v(r,g,b);
    }
}
// Kernel: converts packed 8-bit BGR pixels to planar YUV 4:2:0
// (Y plane, then U plane, then V plane).
//
// Launch layout: 2D grid, one thread per pixel; x covers columns and y covers
// rows. Any block shape works as long as the grid covers the whole image.
//
//   d_in      - imgwidth*imgheight input pixels; .x/.y/.z are read as B/G/R
//   d_out     - output buffer of at least imgwidth*imgheight*3/2 bytes
//   imgheight - image height in pixels
//   imgwidth  - image width in pixels
//
// The chroma planes are (imgwidth/2) x (imgheight/2) samples each, taken from
// the top-left pixel of every complete 2x2 block. For an odd width or height
// the trailing column/row contributes luma only; without that guard its sample
// would wrap into the next chroma row or spill past the end of a plane.
__global__ void bgr2yuv420p(const uchar3 * d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{
    const uint col_num = blockIdx.x*blockDim.x+threadIdx.x;
    const uint row_num = blockIdx.y*blockDim.y+threadIdx.y;

    // Threads of the partial blocks at the right/bottom edge have no pixel.
    if ((row_num >= imgheight) || (col_num >= imgwidth))
        return;

    const uint global_offset = row_num*imgwidth+col_num;

    // Read the pixel once instead of three separate global loads.
    const uchar3 pixel = d_in[global_offset];
    const int b = pixel.x;
    const int g = pixel.y;
    const int r = pixel.z;

    d_out[global_offset] = bgr2y(r,g,b);

    const uint chroma_w = imgwidth >> 1;
    const uint chroma_h = imgheight >> 1;
    const uint chroma_col = col_num >> 1;
    const uint chroma_row = row_num >> 1;

    // Parity is taken from the global coordinates, so the selection does not
    // depend on the block dimensions being even.
    const bool top_left = ((col_num & 1u) == 0) && ((row_num & 1u) == 0);

    if (top_left && (chroma_col < chroma_w) && (chroma_row < chroma_h)) {
        const uint u_offset = imgwidth*imgheight + chroma_row*chroma_w + chroma_col;
        d_out[u_offset] = bgr2u(r,g,b);
        // The V plane starts immediately after the chroma_w*chroma_h U samples.
        const uint v_offset = u_offset + chroma_h*chroma_w;
        d_out[v_offset] = bgr2v(r,g,b);
    }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Self-test: converts a random 1500x1000 BGR image to planar YUV 4:2:0 on the
// CPU and on the GPU and compares the two results byte for byte.
// Returns 0 when they match, 1 on the first mismatch or on any CUDA error.
int main(void)
{

    const uint imgheight = 1000;
    const uint imgwidth = 1500;
    const uint pixels = imgheight*imgwidth;
    const uint yuvBytes = pixels*3/2;

    //input and output
    uchar3 *d_in = NULL;
    unsigned char *d_out = NULL;
    uchar3 *idata = new uchar3[pixels];
    unsigned char *odata = new unsigned char[yuvBytes];
    unsigned char *cdata = new unsigned char[yuvBytes];

    // Every pixel gets its own random value. A uniform image would hide indexing
    // errors, because any chroma sample would then equal any other.
    for (uint i = 0; i < pixels; i++){
      uchar3 pix;
      pix.x = (rand()%30)+40;
      pix.y = (rand()%30)+40;
      pix.z = (rand()%30)+40;
      idata[i] = pix;}

    // CPU reference result
    bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);

    // allocate device memory and copy the image from cpu to gpu
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_in, pixels*sizeof(uchar3)), "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void**)&d_out, yuvBytes*sizeof(unsigned char)), "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, idata, pixels*sizeof(uchar3), cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok)
    {
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        //run kernel function
        bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

        // Launch-configuration errors show up in cudaGetLastError(); faults inside
        // the kernel show up at the next synchronizing call.
        ok = cudaSucceeded(cudaGetLastError(), "bgr2yuv420p launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "bgr2yuv420p execution")
          //copy yuv420p from gpu to cpu
          && cudaSucceeded(cudaMemcpy(odata, d_out, yuvBytes*sizeof(unsigned char), cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    // Compare against the CPU reference; stop at the first difference.
    if (ok)
    {
        for (uint i = 0; i < yuvBytes; i++)
        {
            if (odata[i] != cdata[i])
            {
                std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Release everything on every path, including mismatch and CUDA failure.
    cudaFree(d_in);
    cudaFree(d_out);
    delete[] idata;
    delete[] odata;
    delete[] cdata;

    return ok ? 0 : 1;

}

I don't claim correctness for this code or any other code that I post.我不声明此代码或我发布的任何其他代码的正确性。 Anyone using any code I post does so at their own risk.使用我发布的任何代码的任何人都需要自担风险。 I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof.我只是声称我试图解决我在原始帖子中发现的缺陷,并提供一些解释。 I am not claiming my code is defect-free, or that it is suitable for any particular purpose.我并不是说我的代码没有缺陷,或者它适用于任何特定目的。 Use it (or not) at your own risk.使用(或不使用)风险自负。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM