[英]Parallel programming addition using CUDA not successful
我嘗試使用下面的代碼添加數組,但最終沒有添加數組並且沒有錯誤,這是一個基於 gpu (CUDA) 的並行程序。
#include <cuda_runtime.h>
#include <cuda.h>
#include <iostream>
#include <stdlib.h>
using namespace std;
__global__ void AddInts(int *a, int *b, int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < count)
{
a[id] += b[id];
}
}
int main()
{
srand(time(NULL));
int count = 100;
int *h_a = new int[count];
int *h_b = new int[count];
for (int i = 0; i < count; i++)
{
h_a[i] = rand() % 1000;
h_b[i] = rand() % 1000;
}
cout << "Prior to addition:" << endl;
for (int i = 0; i < 5; i++)
cout << h_a[i] << " " << h_b[i] << endl;
int *d_a, *d_b;
if (cudaMalloc(&d_a, sizeof(int) * count) != cudaSuccess)
{
cout << "Nope! No";
return 0;
}
if (cudaMalloc(&d_b, sizeof(int) * count) != cudaSuccess)
{
cout << "Nope!";
cudaFree(d_a);
return 0;
}
if (cudaMemcpy(d_a, h_a, sizeof(int) * count, cudaMemcpyHostToDevice) != cudaSuccess)
{
cout << "Could not copy!" << endl;
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
if (cudaMemcpy(d_b, h_b, sizeof(int) * count, cudaMemcpyHostToDevice) != cudaSuccess)
{
cout << "Could not copy!" << endl;
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
AddInts <<<count / 256 + 1, 256 >>> (h_a, h_b, count);
if (cudaMemcpy(h_a, h_b, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
{
delete[] h_a;
delete[] h_b;
cudaFree(d_a);
cudaFree(d_b);
cout << "Nope!" << endl;
return 0;
}
for (int i = 0; i < 5; i++)
cout << "It's " << h_a[i] << endl;
cudaFree(d_a);
cudaFree(d_b);
delete[] h_a;
delete[] h_b;
return 0;
}
我的結果是:
Prior to addition: 188 336 489 593 706 673 330 792 329 588 It's 188
It's 489 It's 706 It's 330 It's 329
D:\Learn\CUDA\Visual_stidio\matrxAdd\x64\Release\matrxAdd.exe (process
8468) exited with code 0. To automatically close the console when
debugging stops, enable Tools->Options->Debugging->Automatically close
the console when debugging stops. Press any key to close this window .
. .
首先,設備上的計算應該使用設備內存。 因此,該行:
AddInts <<<count / 256 + 1, 256 >>> (h_a, h_b, count);
應該:
AddInts <<<count / 256 + 1, 256 >>> (d_a, d_b, count);
然后,您應該將結果從設備內存復制到主機內存。 因此,該行:
if (cudaMemcpy(h_a, h_b, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
應該:
if (cudaMemcpy(h_a, d_a, sizeof(int) * count, cudaMemcpyDeviceToHost) == cudaSuccess)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.