[英]Cannot get max value in CUDA vector multiplication
我將pB的每一行與pA的每一行逐元素相乘,然后將最大值放入pC。 問題是:在內部循環中,只有接收器(pA)的最后一行被當作“最大值”。 結果,最右邊的一列是完全錯誤的。
// CppUnit case for CalcDotMax on a 2 (receptors) x 5 (sources) x 3 (chemicals) problem.
// Only the status code returned by CalcDotMax is asserted; the `expected` table is kept
// as reference data and is not compared against the output buffer.
void TestCalcDotMax_2x5x3()
{
    const size_t nReceptors = 2;
    const size_t nSources   = 5;
    const size_t nChemicals = 3;

    // One row per receptor, one column per source.
    float receptors[nReceptors * nSources] = { 1, 2, 3, 4, 5
                                             , 2, 4, 6, 8, 2 };

    // One row per chemical, one column per source.
    float chemicals[nSources * nChemicals] = { 9, 8, 7, 6, 5
                                             , 4, 3, 2, 1, 9
                                             , 8, 7, 6, 5, 4 };

    // Reference values for the output buffer (not checked by this test).
    float expected[nSources * nChemicals] = { 18, 32, 42, 48, 25
                                            ,  8, 12, 12,  8, 45
                                            , 16, 28, 36, 40, 20 };

    // In/out buffer handed to CalcDotMax; it is copied to the device before the kernels run.
    float maxima[nSources * nChemicals] = { 18, 32, 42, 48, 10
                                          ,  8, 12, 12,  8, 18
                                          , 16, 28, 36, 40,  8 };

    const int status = ::CalcDotMax( receptors, chemicals, nReceptors, nSources, nChemicals, maxima );
    CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing", 0, status );
}
// Multiplies every row of pDevA element-wise with one k-element row of pDevB and keeps,
// per column j, the larger of that product and the value already stored in pDevC[j].
//   pDevA : read as pDevA[ i * k + j ] for row i < m and column j < k
//   pDevB : k floats, read by every thread
//   pDevC : k floats, read and conditionally overwritten
//   m, k  : number of rows of pDevA and number of columns
// Each thread with global index i < m handles row i.
// pDevB and pDevC have the same size
// NOTE(review): every thread with i < m does a plain read-compare-write on the same
// pDevC[j]. Nothing orders these accesses between threads, so threads that run
// concurrently can overwrite each other's result (data race on pDevC).
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
// Global thread index; used as the row index into pDevA.
int i = blockDim.x * blockIdx.x + threadIdx.x;
// Threads beyond the last row do nothing.
if( i < m )
{
for( size_t j = 0; j < k; j++ )
{
const float value = pDevA[ i * k + j ] * pDevB[j];
// Unsynchronized check-then-store: another thread may change pDevC[j] between
// the comparison and the assignment.
if( value > pDevC[j] )
{
pDevC[j] = value;
}
}
}
}
// Host wrapper: copies pA (m * k floats), pB (k * n floats) and pC (k * n floats) to the
// device, launches KernelDotMax once per k-element row of pB/pC, and copies pC back.
//
//   pA    : m rows of k floats
//   pB    : n rows of k floats
//   m,k,n : dimensions; a zero dimension is treated as an empty problem
//   pC    : in/out, n rows of k floats; the values passed in seed the device buffer
//   fnMsg : not used by this implementation
//
// Returns 0 on success, otherwise the first failing cudaError_t converted to int.
// pC is only overwritten when every preceding step succeeded.
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC, pfnMsg fnMsg )
{
    (void)fnMsg;

    if( m < 0 || k < 0 || n < 0 )
    {
        return static_cast<int>( cudaErrorInvalidValue );
    }
    if( m == 0 || k == 0 || n == 0 )
    {
        return 0; // nothing to compute; pC is left untouched
    }
    if( pA == nullptr || pB == nullptr || pC == nullptr )
    {
        return static_cast<int>( cudaErrorInvalidValue );
    }

    // One thread per row of pA. The block count is derived from m so that every row
    // gets a thread (a fixed cap on the grid would leave rows unprocessed).
    const int threadsPerCta = 64;
    const int nbrCtas = m / threadsPerCta + ( ( m % threadsPerCta ) != 0 ? 1 : 0 );

    // Compute the byte counts in size_t so that m * k cannot overflow int.
    const size_t bytesA  = static_cast<size_t>( m ) * k * sizeof(float);
    const size_t bytesBC = static_cast<size_t>( k ) * n * sizeof(float);

    float* pDevA = nullptr;
    float* pDevB = nullptr;
    float* pDevC = nullptr;

    // Every step runs only while all previous steps succeeded; `code` keeps the first error.
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, bytesA );
    if( code == cudaSuccess ) code = ::cudaMalloc( (void**)&pDevB, bytesBC );
    if( code == cudaSuccess ) code = ::cudaMalloc( (void**)&pDevC, bytesBC );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevA, pA, bytesA, cudaMemcpyHostToDevice );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevB, pB, bytesBC, cudaMemcpyHostToDevice );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevC, pC, bytesBC, cudaMemcpyHostToDevice );

    for( int row = 0; code == cudaSuccess && row < n; ++row )
    {
        const size_t offset = static_cast<size_t>( row ) * k;
        KernelDotMax<<<nbrCtas, threadsPerCta>>>( pDevA, pDevB + offset, m, k, pDevC + offset );
        code = ::cudaGetLastError(); // a launch does not return an error by itself
    }

    // The blocking copy also waits for the kernels launched above.
    if( code == cudaSuccess ) code = ::cudaMemcpy( pC, pDevC, bytesBC, cudaMemcpyDeviceToHost );

    // Always release whatever was allocated; pointers that were never allocated are still null.
    const cudaError_t freeA = ::cudaFree( pDevA );
    const cudaError_t freeB = ::cudaFree( pDevB );
    const cudaError_t freeC = ::cudaFree( pDevC );
    if( code == cudaSuccess ) code = freeA;
    if( code == cudaSuccess ) code = freeB;
    if( code == cudaSuccess ) code = freeC;

    return static_cast<int>( code );
}
抱歉,我之前沒有注意到您在某個時候編輯過您的代碼。
您遇到的問題是競爭條件(race condition)。 在失敗的情況下,您啟動了2個塊。 算法的設計使得每個塊(更准確地說,每個線程)都在同一組輸出元素(pDevC 中)上運行。 因此,由於這些線程可以同時執行,它們可以同時寫入相同的輸出元素。 這是一次寫沖突,有兩種避免方法:
1. 重新設計算法,以不同的方式划分工作,使每個線程只寫入屬於自己的唯一輸出元素;
2. 使用原子操作來防止同時寫入時的沖突。
接下來是一些代碼,在此我演示了第二種方法(因為這樣對我來說更容易編寫)。 CUDA 沒有提供在 float 上執行 atomicMax 操作的原子函數,因此我按照原子函數文檔中提供的模板(使用 atomicCAS 創建任意原子操作)編寫了自己的版本,那就是 atomicMaxf。
如果您選擇使用第一種方法(推薦),我會指出,對於您的算法,可能不需要在循環中調用內核。 我會設計一個新的內核,該內核為每個輸出點分配一個線程,然后在內核中的一個循環(或嵌套循環)中計算各個輸入點上所有必需的max操作。 由於每個線程都只向一個唯一的輸出點寫入數據,因此線程之間不會發生寫沖突。
無論如何,此代碼應提供正確的結果:
#include <stdio.h>
// Atomically stores max(*address, val) into *address using a compare-and-swap loop on the
// 32-bit pattern of the float.
//
//   address : float to update
//   val     : candidate value
//
// Returns the value that was stored at *address immediately before this call's update
// when val was written, otherwise the value observed there that was not smaller than val.
// The ordering test is `val > current`, which is false when either operand is NaN:
// a NaN candidate is never stored and a stored NaN is never replaced.
__device__ float atomicMaxf(float* address, float val)
{
    int* const address_as_int = (int*)address;
    int old = *address_as_int;

    while (val > __int_as_float(old)) {
        const int assumed = old;
        old = atomicCAS(address_as_int, assumed, __float_as_int(val));
        if (old == assumed) {
            // The swap happened. Stop here: another compare-and-swap could not
            // succeed and would only return the value that was just written.
            break;
        }
        // Another thread changed the value first; `old` now holds what it wrote,
        // so re-evaluate whether val is still larger.
    }
    return __int_as_float(old);
}
// Multiplies every row of pDevA element-wise with one k-element row of pDevB and folds
// each product into pDevC[j] with an atomic maximum.
//
//   pDevA : m rows of k floats, read as pDevA[ i * k + j ]
//   pDevB : k floats, read by every thread
//   pDevC : k floats, updated through atomicMaxf
//   m, k  : number of rows of pDevA and number of columns
//
// pDevB and pDevC have the same size.
// Launch layout: 1-D grid of 1-D blocks. Rows are distributed with a grid-stride loop,
// so any grid/block size processes all m rows.
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot wrap.
    const size_t firstRow = static_cast<size_t>( blockIdx.x ) * blockDim.x + threadIdx.x;
    const size_t rowStep  = static_cast<size_t>( gridDim.x ) * blockDim.x;

    for( size_t i = firstRow; i < m; i += rowStep )
    {
        const float* const pRowA = pDevA + i * k;
        for( size_t j = 0; j < k; j++ )
        {
            // All rows target the same pDevC[j], so the update must be atomic.
            atomicMaxf( pDevC + j, pRowA[j] * pDevB[j] );
        }
    }
}
// Host wrapper: copies pA (m * k floats), pB (k * n floats) and pC (k * n floats) to the
// device, launches KernelDotMax once per k-element row of pB/pC, and copies pC back.
//
//   pA    : m rows of k floats
//   pB    : n rows of k floats
//   m,k,n : dimensions; a zero dimension is treated as an empty problem
//   pC    : in/out, n rows of k floats; the values passed in seed the device buffer
//
// Returns 0 on success, otherwise the first failing cudaError_t converted to int.
// pC is only overwritten when every preceding step succeeded.
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    if( m < 0 || k < 0 || n < 0 )
    {
        return (int)cudaErrorInvalidValue;
    }
    if( m == 0 || k == 0 || n == 0 )
    {
        return 0; // nothing to compute; pC is left untouched
    }
    if( pA == NULL || pB == NULL || pC == NULL )
    {
        return (int)cudaErrorInvalidValue;
    }

    // One thread per row of pA. The block count is derived from m so that every row
    // gets a thread (a fixed cap on the grid would leave rows unprocessed).
    const int threadsPerCta = 64;
    const int nbrCtas = m / threadsPerCta + ( ( m % threadsPerCta ) != 0 ? 1 : 0 );

    // Compute the byte counts in size_t so that m * k cannot overflow int.
    const size_t bytesA  = (size_t)m * k * sizeof(float);
    const size_t bytesBC = (size_t)k * n * sizeof(float);

    float* pDevA = NULL;
    float* pDevB = NULL;
    float* pDevC = NULL;

    // Every step runs only while all previous steps succeeded; `code` keeps the first error.
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, bytesA );
    if( code == cudaSuccess ) code = ::cudaMalloc( (void**)&pDevB, bytesBC );
    if( code == cudaSuccess ) code = ::cudaMalloc( (void**)&pDevC, bytesBC );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevA, pA, bytesA, cudaMemcpyHostToDevice );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevB, pB, bytesBC, cudaMemcpyHostToDevice );
    if( code == cudaSuccess ) code = ::cudaMemcpy( pDevC, pC, bytesBC, cudaMemcpyHostToDevice );

    for( int row = 0; code == cudaSuccess && row < n; ++row )
    {
        const size_t offset = (size_t)row * k;
        KernelDotMax<<<nbrCtas, threadsPerCta>>>( pDevA, pDevB + offset, m, k, pDevC + offset );
        code = ::cudaGetLastError(); // a launch does not return an error by itself
    }

    // The blocking copy also waits for the kernels launched above.
    if( code == cudaSuccess ) code = ::cudaMemcpy( pC, pDevC, bytesBC, cudaMemcpyDeviceToHost );

    // Always release whatever was allocated; pointers that were never allocated are still NULL.
    const cudaError_t freeA = ::cudaFree( pDevA );
    const cudaError_t freeB = ::cudaFree( pDevB );
    const cudaError_t freeC = ::cudaFree( pDevC );
    if( code == cudaSuccess ) code = freeA;
    if( code == cudaSuccess ) code = freeB;
    if( code == cudaSuccess ) code = freeC;

    return (int)code;
}
// Runs CalcDotMax on a 2 (receptors) x 5 (sources) x 3 (chemicals) problem, prints the
// returned status, and prints one line for every output element that differs from the
// reference table.
void TestCalcDotMax_2x5x3()
{
    const size_t nReceptors = 2;
    const size_t nSources   = 5;
    const size_t nChemicals = 3;

    // One row per receptor, one column per source.
    float receptors[nReceptors * nSources] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f
                                             , 2.0f, 4.0f, 6.0f, 8.0f, 2.0f };

    // One row per chemical, one column per source.
    float chemicals[nSources * nChemicals] = { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f
                                             , 4.0f, 3.0f, 2.0f, 1.0f, 9.0f
                                             , 8.0f, 7.0f, 6.0f, 5.0f, 4.0f };

    // Reference result the output buffer is compared against.
    float expected[nSources * nChemicals] = { 18.0f, 32.0f, 42.0f, 48.0f, 25.0f
                                            ,  8.0f, 12.0f, 12.0f,  8.0f, 45.0f
                                            , 16.0f, 28.0f, 36.0f, 40.0f, 20.0f };

    // Output buffer, zero-seeded before the call.
    float maxima[nSources * nChemicals] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                                          , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                                          , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

    const int status = ::CalcDotMax( receptors, chemicals, nReceptors, nSources, nChemicals, maxima );
    printf("passed processing: %d \n", status );

    // Exact comparison, element by element, in storage order.
    const int total = static_cast<int>( nSources * nChemicals );
    int pos = 0;
    while( pos < total )
    {
        if( maxima[pos] != expected[pos] )
        {
            printf("mismatch at %d, should be: %f was: %f\n", pos, expected[pos], maxima[pos]);
        }
        ++pos;
    }
}
// Program entry point: runs the single 2x5x3 check and always exits with status 0.
int main()
{
    TestCalcDotMax_2x5x3();

    return 0;
}
非常感謝-現在可以使用。 比較時是否可以保留迭代索引[idx]? 像這樣:
// A float value paired with an int index. CudaPareDotMax uses an array of these as its
// output (pDevC) and hands the addresses of both members to atomicMaxPare.
struct ValIndex_t
{
float value; // running maximum; passed to atomicMaxPare as the address to update
int index; // receives the thread index (idx) that atomicMaxPare is called with
};
// Atomic "maximum with index": if val is greater than *address, stores val in *address
// and idx in *index.
//
//   address : float holding the running maximum
//   val     : candidate value
//   index   : int that records which candidate produced the maximum
//   idx     : index to record when val becomes the new maximum
//
// Returns the value that was stored at *address immediately before this call's update
// when val was written, otherwise the value observed there that was not smaller than val.
//
// Consistency of the pair:
//   * When *index is the int directly after *address and the pair is 8-byte aligned
//     (the layout of one element of a ValIndex_t array whose base is 8-byte aligned),
//     value and index are exchanged together with a single 64-bit compare-and-swap,
//     so the stored index always belongs to the stored value.
//   * For any other layout the value is updated atomically and the index is written
//     afterwards by the thread that won the swap. That write is best effort only:
//     a concurrent larger update can interleave, leaving an index that does not match
//     the final value.
//
// Assumes that float and int are both 32 bits wide and that the low 32 bits of a 64-bit
// word live at the lower address (little-endian).
__device__ float atomicMaxPare( float* address, float val, int* index, int idx )
{
    const bool pairIsPacked =
        ( reinterpret_cast<int*>( address ) + 1 == index ) &&
        ( reinterpret_cast<size_t>( address ) % sizeof( unsigned long long ) == 0 );

    if( pairIsPacked )
    {
        unsigned long long* const slot = reinterpret_cast<unsigned long long*>( address );

        // Low word = bit pattern of the value, high word = index.
        const unsigned long long desired =
            ( static_cast<unsigned long long>( static_cast<unsigned int>( idx ) ) << 32 ) |
            static_cast<unsigned long long>( static_cast<unsigned int>( ::__float_as_int( val ) ) );

        unsigned long long old = *slot;
        while( val > ::__int_as_float( static_cast<int>( static_cast<unsigned int>( old & 0xffffffffULL ) ) ) )
        {
            const unsigned long long assumed = old;
            old = ::atomicCAS( slot, assumed, desired );
            if( old == assumed )
            {
                break; // value and index were stored together
            }
        }
        return ::__int_as_float( static_cast<int>( static_cast<unsigned int>( old & 0xffffffffULL ) ) );
    }

    // Fallback for value/index that are not adjacent: atomic on the value only.
    int* const address_as_int = reinterpret_cast<int*>( address );
    int old = *address_as_int;
    while( val > ::__int_as_float( old ) )
    {
        const int assumed = old;
        old = ::atomicCAS( address_as_int, assumed, ::__float_as_int( val ) );
        if( old == assumed )
        {
            // Record the index only after winning the swap, never on a failed attempt.
            *index = idx;
            break;
        }
    }
    return ::__int_as_float( old );
}
// Each thread with global index idx < m:
//   1. scales its own block of k * n floats in pDevA (starting at pDevA[ k * n * idx ])
//      in place, multiplying element (row, col) by pDevB[ col + k * idx ];
//   2. offers every scaled element to pDevC[ col + row * k ] through atomicMaxPare,
//      together with idx.
// Threads with idx >= m do nothing.
__global__ void CudaPareDotMax( float* pDevA, const float* pDevB, ValIndex_t* pDevC, const size_t m, const size_t k, const size_t n )
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if( !( idx < m ) )
    {
        return;
    }

    // Per-thread views: this thread's k * n block of pDevA and its k-element row of pDevB.
    float* const       pOwnBlock = pDevA + k * n * idx;
    const float* const pOwnRow   = pDevB + k * idx;

    for( size_t row = 0; row < n; row++ )
    {
        for( size_t col = 0; col < k; col++ )
        {
            const size_t slice = col + row * k;

            // In-place product; the scaled value stays in pDevA.
            pOwnBlock[slice] *= pOwnRow[col];

            ValIndex_t* const pSlot = pDevC + slice;
            ::atomicMaxPare( &pSlot->value, pOwnBlock[slice], &pSlot->index, idx );
        }
    }
}
還是我必須使用另一個原子函數(例如原子交換)? 我不太清楚在取得最大值的同時,應該如何准確地把索引一並寫入。 再次感謝
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.