[英]CUDA multiple multiplication of the matrix in the kernel code
矩陣乘法的功能:
// Naive dense matrix product c = a * b, one thread per output element.
//
// Per the index expressions below, a is read as m x n, b as n x k and c is
// written as m x k, all row-major. Expects a 2D launch in which the x
// dimension spans the k columns and the y dimension spans the m rows;
// threads that land outside the m x k output do nothing.
__global__ void gpu_matrix_mult(float *a, float *b, float *c, int m, int n, int k)
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int q = blockIdx.x * blockDim.x + threadIdx.x;

    // Grid tail: the launch may cover more threads than output elements.
    if (r >= m || q >= k)
    {
        return;
    }

    // Dot product of row r of a with column q of b.
    float acc = 0;
    int idx = 0;
    while (idx < n)
    {
        acc += a[r * n + idx] * b[idx * k + q];
        ++idx;
    }

    c[r * k + q] = acc;
}
然后在以下循環中調用該函數:
// Repeatedly computes c = a * b on the GPU, thresholds c at 0.5 on the host
// to build the next b, and stops once the number of elements >= 0.5 is the
// same in two consecutive passes. Every pass after the first uploads the new
// b, and every pass downloads c.
int currentActivityCount = -1;
while (activityCount != currentActivityCount)
{
    cudaError_t err = cudaSuccess;

    if (currentActivityCount > -1)
    {
        // The kernel takes d_b as float*, so size the copy in floats rather
        // than ints (the two only coincide because both are 4 bytes wide).
        // NOTE(review): the kernel reads b as n x k while m * k elements are
        // uploaded here, so this presumably relies on m == n; confirm.
        err = cudaMemcpy(d_b, h_b_new, sizeof(float) * m * k, cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(err));
            break;
        }
    }

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // A bad launch configuration is only reported through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "gpu_matrix_mult launch failed: %s\n", cudaGetErrorString(err));
        break;
    }

    // Blocking copy: it also waits for the kernel and surfaces any fault
    // raised while the kernel was running.
    err = cudaMemcpy(h_c, d_c, sizeof(float) * m * k, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(err));
        break;
    }

    // Remember the previous count, then recount while binarizing c into the
    // next b.
    currentActivityCount = activityCount;
    activityCount = 0;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            // 0.5 is exactly representable in float, so the float literal
            // gives the same result as the double comparison.
            if (h_c[i * k + j] >= 0.5f)
            {
                activityCount++;
                h_b_new[i * k + j] = 1;
            }
            else
            {
                h_b_new[i * k + j] = 0;
            }
        }
    }

    during++;
    printf("Count of activity: %d During: %d\n", activityCount, during);
}
我的目標是將此循環移至“gpu_matrix_mult”函數中,使主機與GPU之間的數據傳輸僅發生兩次,即在調用函數之前和之后各一次,而不是在每次循環迭代中都發生。 我嘗試過一些方法,但都沒有奏效。 這樣的方案可行嗎?
您可以在內核中執行以下操作:
// Device-side counter of output elements whose value is >= 0.5. The kernel
// only ever increments it, so the host must reset it (for example with
// cudaMemcpyToSymbol) before each launch.
__device__ int activityCount;

// Fused matrix product and threshold, one thread per output element.
//
// Computes c = a * b0, where (per the index expressions below) a is read as
// m x n, b0 as n x k and c is written as m x k, all row-major. For every
// output element it also writes 1 or 0 into b1 at the same position,
// depending on whether the element is >= 0.5, and counts the 1s in
// activityCount.
//
// Expects a 2D launch in which x spans the k columns and y spans the m rows.
// b1 must not alias b0: other threads may still be reading b0 while this
// thread writes b1.
// NOTE(review): b1 is indexed as m x k but is meant to be swapped with b0
// (read as n x k) between launches, so this presumably relies on m == n;
// confirm against the caller.
__global__ void gpu_matrix_mult(float *a, float *b0, float *b1, float *c, int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0;
    if (col < k && row < m)
    {
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b0[i * k + col];
        }
        c[row * k + col] = sum;

        // Threshold into the next b matrix at this thread's own output
        // position. The original indexed b1 with i and j, which are not in
        // scope here.
        if (sum >= 0.5f)
        {
            atomicAdd(&activityCount, 1);
            b1[row * k + col] = 1;
        }
        else
        {
            b1[row * k + col] = 0;
        }
    }
}
// .............
// Host driver for the fused kernel: repeats the multiply-and-threshold pass
// until the activity count is the same in two consecutive passes. Only a
// single int crosses the bus in each direction per pass; the b matrices stay
// on the device and are exchanged by swapping pointers.
int currentActivityCount = -1;
int activityCount_h = 0;
while (activityCount_h != currentActivityCount)
{
    cudaError_t err = cudaSuccess;

    if (currentActivityCount > -1)
    {
        // The previous pass wrote the thresholded matrix into d_b1; make it
        // this pass's input and reuse the old input buffer as output.
        float *tmp = d_b0;
        d_b0 = d_b1;
        d_b1 = tmp;
    }

    currentActivityCount = activityCount_h;
    activityCount_h = 0;

    // Reset the device counter; the kernel only increments it.
    err = cudaMemcpyToSymbol(activityCount, &activityCount_h, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(err));
        break;
    }

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b0, d_b1, d_c, m, n, k);
    // A bad launch configuration is only reported through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "gpu_matrix_mult launch failed: %s\n", cudaGetErrorString(err));
        break;
    }

    // Blocking read of the device counter into the host copy; it also waits
    // for the kernel to finish and reports any fault raised during execution.
    err = cudaMemcpyFromSymbol(&activityCount_h, activityCount, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        break;
    }

    during++;
    // Print the host copy: the __device__ variable cannot be read directly
    // from host code.
    printf("Count of activity: %d During: %d\n", activityCount_h, during);
}
[顯然從未編譯或運行,使用后果自負]
也就是說,在矩陣相乘之后,用於計算activityCount的內部循環可以在設備上的內核中運行。 這需要在GPU內存中保留兩個b矩陣,但在主機上只需交換指針即可更新它們,成本基本為零。 每次外部循環迭代的內存傳輸減少為兩次單個整數的傳輸,這會相當快。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.