Cuda 如何将 char** 从内核复制到主机

Question

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <fstream>
#include <algorithm>
#include <time.h>

using namespace std;

// Copies device string pointers from desc into merge_char, one slot per thread.
// Only the pointer values are copied; the characters they point to are not read.
// Indexing uses the x dimension only, and threads whose global x index is
// 10000 or more do nothing.
__global__ void kern_2D(char **desc, char** merge_char) {
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: the pointer tables are treated as having 10000 slots.
    if (slot >= 10000)
    {
        return;
    }

    merge_char[slot] = desc[slot];
}


// Reproduction program from the question: reads lines from "desc.txt", copies
// each line into its own device buffer, stores the device addresses in the
// device table d_desc, runs kern_2D to copy those addresses into merge_char,
// and then tries to print the strings through h_dev.
// Always returns 0; CUDA return codes on the upload path are not checked.
int main() {
    cudaError_t err = cudaSuccess;
    size_t max_line_len = 255;
    char line[255];
    size_t line_len;
    size_t max_lines_desc = 10000;
    //---------------------------------------------------------------------------------//

    // Device-side table with one char* slot per line.
    char **d_desc;
    cudaMalloc(&d_desc, max_lines_desc * sizeof(char *));

    // Host-side pointer arrays. NOTE(review): the element type is char*, but
    // the sizes are computed with sizeof(char**).
    char **m_desc = NULL;
    m_desc = (char**)malloc(max_lines_desc * sizeof(char**));
    char **d_temp_desc = NULL;
    d_temp_desc = (char **)malloc(max_lines_desc * sizeof(char **));

    FILE *f_desc = fopen("desc.txt", "r");
    if (!f_desc)
    {
        // NOTE(review): the failure is only reported; execution continues and
        // fgets below is called with a NULL stream.
        fprintf(stderr, "Error opening file!\n");
    }
    int idesc = 0;

    // Read one line per iteration until EOF, a read error, or max_lines_desc lines.
    do
    {
        if (!fgets(line, max_line_len, f_desc))
        {
            if (ferror(f_desc) && !feof(f_desc))
            {
                fprintf(stderr, "Error reading from file!\n");
                // NOTE(review): the stream is closed here and again after the loop.
                fclose(f_desc);
            }
            break;
        }

        // Strip the trailing newline, if any.
        line_len = strlen(line);
        if ((line_len > 0) && (line[line_len - 1] == '\n'))
        {
            line[line_len - 1] = '\0';
            --line_len;
        }
        // Every m_desc entry points at the same stack buffer `line`.
        m_desc[idesc] = line;
        // Allocate a sizeof(line)-byte device buffer, copy the whole line buffer
        // into it, then store that device address in slot idesc of d_desc.
        cudaMalloc(&(d_temp_desc[idesc]), sizeof(line) * sizeof(char));
        cudaMemcpy(d_temp_desc[idesc], m_desc[idesc], sizeof(line) * sizeof(char), cudaMemcpyHostToDevice);
        cudaMemcpy(d_desc + idesc, &(d_temp_desc[idesc]), sizeof(char *), cudaMemcpyHostToDevice);

        ++idesc;
    } while (idesc < max_lines_desc);
    fclose(f_desc);

    //---------------------------------------------------------------------------------//


    // Output pointer table in managed memory.
    char **merge_char;
    cudaMallocManaged(&merge_char, max_lines_desc * sizeof(char *));


    // One block of 1000 threads. kern_2D handles one slot per thread, so only
    // slots 0..999 of merge_char are written by this launch.
    kern_2D << < 1, 1000 >> > (d_desc , merge_char);

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        // NOTE(review): the message names addKernel, but the kernel launched above is kern_2D.
        fprintf(stderr, "cudaDeviceSynchronize returned error code %s after launching addKernel!\n", cudaGetErrorString(err));
    }


    //---------------------------------------------------------------------------------//

    char** h_dev;

    // NOTE(review): h_dev is obtained from cudaMalloc, which allocates device
    // memory, yet it is used as the destination of a DeviceToHost copy and is
    // indexed on the host in the loop below.
    cudaMalloc((void**)(&h_dev), max_lines_desc * sizeof(char*));
    err = cudaMemcpy(h_dev, merge_char, max_lines_desc * sizeof(char*), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) printf("2: Okay \n");


    // NOTE(review): iterates over all max_lines_desc slots regardless of how
    // many lines were read, and hands the table entries (addresses returned by
    // cudaMalloc for each line) straight to printf on the host.
    for (int i = 0; i < max_lines_desc; i++)
    {
        printf("%s\n", h_dev[i]);
    }


    return 0;


}
// nvcc -arch=sm_30 -o kernel kernel.cu
// cuda-memcheck ./kernel

I'm sorry for my mistake.对于我的错误，我很抱歉。 I have already updated my code.我已经更新了我的代码。 It is now complete.现在代码是完整的。

The file desc.txt has 10000 lines like the one below.desc.txt 文件有 10000 行，每行如下所示。 I checked the status after the copy from device to host, but something is still wrong.从设备复制到主机后，我检查了返回状态，但仍然有问题。 I cannot print char** h_dev.我无法打印 char** h_dev。

motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125摩托车 ckd 新款 apsonic ckd 2017 ckd 2018 摩托车 apsonic 新款摩托车 apsonic 125 摩托车 apsonic ap125 新款摩托车 apsonic ap125

Answer 1

I have to say that I don't really understand what your intent is here, because the only thing your kernel is doing is swapping pointers.我不得不说我真的不明白你在这里的意图，因为你的内核所做的唯一一件事就是交换指针。 If that is all you intend to do, you are certainly making things difficult for yourself by using double pointers everywhere.如果这就是您打算做的所有事情，那么您肯定会因为到处使用双指针而给自己带来麻烦。 It would be far simpler just to manage the indices.仅管理索引会简单得多。

But to address your question, as far as I can tell your "copy back to host" is indeed incorrect.但是为了解决您的问题，据我所知，您的“复制回主机”确实不正确。 You are effectively doing a deep-copy of your data from host to device, and so you will need a deep-copy (2-stage copy) in the other direction as well.您正在有效地将数据从主机到设备进行深度复制，因此您还需要在另一个方向进行深度复制（2 阶段复制）。

To accomplish that, we don't use cudaMalloc for the copy back to the host.为了实现这一点，我们在复制回主机时不使用 cudaMalloc。 cudaMalloc allocates device memory.cudaMalloc 分配的是设备内存。 If you want to copy something to the host, your copy target is host memory.如果要将某些内容复制到主机，则复制目标是主机内存。 So we will need a set of cudaMemcpy operations to deep-copy the data back to the host, using host buffers as the targets.因此，我们需要一组 cudaMemcpy 操作，以主机缓冲区为目标，将数据深度复制回主机。

The following code represents the simplest modifications I could make to what you have shown to accomplish this, and it seems to work for me for my simple test case:下面的代码代表了我可以对你所展示的内容进行的最简单的修改，它似乎适用于我的简单测试用例：

$ cat desc.txt
1motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
2motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
3motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
4motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
5motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
6motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap1
$ cat t301.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <fstream>
#include <algorithm>
#include <time.h>

using namespace std;

// Copies the first 10000 device string pointers from desc into merge_char.
// Only the pointer values are copied; the strings themselves are not read.
//
// Launch layout: any 1-D grid of 1-D blocks. The grid-stride loop below makes
// the result independent of the launch configuration, so a launch with fewer
// than 10000 threads (e.g. <<<1, 1000>>>) still fills every slot.
// Shared memory: none.
// Preconditions: desc and merge_char must each have at least 10000 entries
// that are accessible from the device.
__global__ void kern_2D(char **desc, char** merge_char) {
    const int kNumSlots = 10000;
    const int stride = gridDim.x * blockDim.x;

    // Each thread starts at its global x index and advances by the total
    // number of launched threads until the table is exhausted.
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < kNumSlots; idx += stride)
    {
        merge_char[idx] = desc[idx];
    }
}


// Size in bytes of the host line buffer and of every per-line device buffer.
static const int kLineBytes = 255;

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Reads up to `capacity` lines from `f`, strips the trailing newline, and
// deep-copies each line into its own kLineBytes-byte device buffer.
// The device address of line i is stored in d_lines[i]; the caller owns those
// buffers and must cudaFree them (also after a failure).
// Lines longer than kLineBytes - 1 characters are split by fgets into several
// entries. Returns the number of lines uploaded, or -1 on a read or CUDA error.
static long uploadLines(FILE *f, char **d_lines, size_t capacity)
{
    char line[kLineBytes];
    size_t count = 0;

    while (count < capacity && fgets(line, kLineBytes, f))
    {
        size_t len = strlen(line);
        if ((len > 0) && (line[len - 1] == '\n'))
        {
            line[len - 1] = '\0';
        }

        if (!cudaOk(cudaMalloc(&d_lines[count], sizeof(line)), "cudaMalloc(line)"))
        {
            return -1;
        }
        // Copy the whole fixed-size buffer so the copy back can use the same size.
        if (!cudaOk(cudaMemcpy(d_lines[count], line, sizeof(line), cudaMemcpyHostToDevice),
                    "cudaMemcpy(line, HostToDevice)"))
        {
            return -1;
        }
        ++count;
    }

    if (ferror(f))
    {
        fprintf(stderr, "Error reading from file!\n");
        return -1;
    }
    return (long)count;
}

// Reads up to 10000 lines from "desc.txt", deep-copies them to the device,
// runs kern_2D to copy the device string pointers into a managed pointer
// table, then deep-copies every line that was read back to the host and
// prints it.
//
// The copy back is two-stage: first the pointer table is copied into a host
// buffer, then each device string it refers to is copied into a host buffer.
//
// Returns 0 on success and 1 on any file, allocation, or CUDA error.
int main() {
    const size_t max_lines_desc = 10000;
    const size_t table_bytes = max_lines_desc * sizeof(char *);
    const unsigned int threads = 256;
    // Ceil-divide so the launch covers every slot of the pointer table.
    const unsigned int blocks = (unsigned int)((max_lines_desc + threads - 1) / threads);
    char line[kLineBytes];
    int status = 1;
    long n_lines = 0;
    char **d_desc = NULL;       // device table of device string pointers (kernel input)
    char **merge_char = NULL;   // managed table of device string pointers (kernel output)
    char **d_temp_desc = NULL;  // host table holding the same device string pointers
    char **h_dev = NULL;        // host copy of merge_char

    FILE *f_desc = fopen("desc.txt", "r");
    if (!f_desc)
    {
        fprintf(stderr, "Error opening file!\n");
        return 1;
    }

    //---------------------------------------------------------------------------------//

    d_temp_desc = (char **)malloc(table_bytes);
    if (d_temp_desc)
    {
        // Zero the table so unused slots are NULL and cleanup can free every slot.
        memset(d_temp_desc, 0, table_bytes);
    }
    h_dev = (char **)malloc(table_bytes);
    if (!d_temp_desc || !h_dev)
    {
        fprintf(stderr, "Out of host memory!\n");
        fclose(f_desc);
        goto cleanup;
    }

    if (!cudaOk(cudaMalloc(&d_desc, table_bytes), "cudaMalloc(d_desc)") ||
        !cudaOk(cudaMallocManaged(&merge_char, table_bytes), "cudaMallocManaged(merge_char)"))
    {
        fclose(f_desc);
        goto cleanup;
    }

    n_lines = uploadLines(f_desc, d_temp_desc, max_lines_desc);
    fclose(f_desc);
    if (n_lines < 0)
    {
        goto cleanup;
    }

    // Publish the whole pointer table in one copy instead of one copy per line.
    if (!cudaOk(cudaMemcpy(d_desc, d_temp_desc, table_bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_desc, HostToDevice)"))
    {
        goto cleanup;
    }

    //---------------------------------------------------------------------------------//

    kern_2D<<<blocks, threads>>>(d_desc, merge_char);
    if (!cudaOk(cudaGetLastError(), "kern_2D launch") ||
        !cudaOk(cudaDeviceSynchronize(), "kern_2D execution"))
    {
        goto cleanup;
    }

    //---------------------------------------------------------------------------------//

    // Stage 1: copy the table of device string pointers into host memory.
    if (!cudaOk(cudaMemcpy(h_dev, merge_char, table_bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_dev, DeviceToHost)"))
    {
        goto cleanup;
    }
    printf("2: Okay \n");

    // Stage 2: the table entries are device addresses, so each string has to be
    // copied into a host buffer before it can be printed.
    for (long i = 0; i < n_lines; i++)
    {
        if (!cudaOk(cudaMemcpy(line, h_dev[i], sizeof(line), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(line, DeviceToHost)"))
        {
            goto cleanup;
        }
        printf("%s\n", line);
    }

    status = 0;

cleanup:
    if (d_temp_desc)
    {
        // Unused slots are NULL, which cudaFree accepts as a no-op.
        for (size_t i = 0; i < max_lines_desc; ++i)
        {
            cudaFree(d_temp_desc[i]);
        }
    }
    cudaFree(merge_char);
    cudaFree(d_desc);
    free(h_dev);
    free(d_temp_desc);

    return status;
}
$ nvcc -o t301 t301.cu
t301.cu(15): warning: variable "idy" was declared but never referenced

$ cuda-memcheck ./t301
========= CUDA-MEMCHECK
2: Okay
1motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
2motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
3motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
4motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
5motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
6motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap1
========= ERROR SUMMARY: 0 errors
$

Cuda 如何将 char** 从内核复制到主机

问题描述

1 个解决方案

解决方案1
1 2018-10-10 14:25:00

Cuda 如何将 char** 从内核复制到主机

问题描述

1 个解决方案

解决方案1 1 2018-10-10 14:25:00

解决方案1
1 2018-10-10 14:25:00