[英]OpenCL: addition of a large number of arrays
在下面的代碼中,每個工作項(work-item)都會生成一個數組sum_qcos_i。
為了把這些數組相加,我首先使用局部(local)數組sum_qcos_tmp在工作組內求和。
然后,我將每個工作組的局部數組復制到全局一維矩陣sum_qcos_part中。
接着需要對這個矩陣按列求和,每個工作項在使用結果之前都會執行這一步。代碼如下:
__kernel __attribute__((vec_type_hint(double4))) void energy_forces( const int atom_number,
const int nvect,
__global double4 *kvect,__global double *qcos,__global double *qsin,
__global double *cst_ewald ,
__global double4 *positions,
__global double4 *forces_r,
__global double *sum_qcos_part,__global double *sum_qsin_part)
{
int i = 0 ;
int gti = 0 , ggi = 0 , lti = 0;
double kr = (double)0.0 ;
double ss = (double)0.0 , cc = (double)0.0 ;
double prod = (double)0.0 ;
double valqcos = 0. , valqsin = 0. ;
double4 zeroes_4 = (double4){ 0.0,0.0,0.0,0.0 };
double sum_qcos_i[NVECTOR_MAX] ;
double sum_qsin_i[NVECTOR_MAX] ;
#if defined NVECTOR_MAX
__local double sum_qcos_tmp[NVECTOR_MAX] ;
__local double sum_qsin_tmp[NVECTOR_MAX] ;
#endif
lti = get_local_id(0);
ggi = get_group_id(0);
for (k=0;k<nvect;k++) { /*k-vectors*/
sum_qcos_tmp[k] = .0 ;
sum_qsin_tmp[k] = .0 ;
sum_qcos_i[k] = .0 ;
sum_qsin_i[k] = .0 ;
}
double fk = (double)0.0 ;
double4 fr_i = zeroes_4 ;
double4 kvec_i = zeroes_4;
for (gti = get_global_id(0); gti < atom_number; gti += get_global_size(0))
{
pos_i = positions[gti];
for (k=0;k<nvect;k++) { /* sum over k-vectors to compute QCOS and QSIN for Ewald sum*/
prod = dot((double4)pos_i,(double4)kvect[k]);
ss = (double)sincos(-prod,&cc);
valqcos = cc ;
valqsin = ss ;
// valqcos = 1. ;
// valqsin = 1. ;
qcos[gti*NVECTOR_MAX+k] = valqcos ;
qsin[gti*NVECTOR_MAX+k] = valqsin ;
sum_qcos_i[k] = valqcos ; /* private variable */
sum_qsin_i[k] = valqsin ;
} /* end sum over k-vectors to compute QCOS and QSIN for Ewald sum*/
} // end for gti
int ii = 0 ;
for ( ii = 0;ii<get_local_size(0);ii++ )
{
if (lti == ii)
{
for (k=0;k<nvect;k++)
{ /* k-vectors */
sum_qcos_tmp[k] += sum_qcos_i[k] ; /* accumulates private data to local variable */
sum_qsin_tmp[k] += sum_qsin_i[k] ;
}
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE) ;
}
}
if (lti == 0)
{
for (k=0;k<nvect;k++) {
sum_qcos_part[ggi*NVECTOR_MAX+k] = sum_qcos_tmp[k] ; /* cp local data to global array */
sum_qsin_part[ggi*NVECTOR_MAX+k] = sum_qsin_tmp[k] ;
}
}
int iii = 0 ;
for (gti = get_global_id(0); gti < atom_number; gti += get_global_size(0))
{
fr_i = zeroes_4 ;
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE) ;
for (k=0;k<nvect;k++)
{
sum_qcos_i[k] = .0 ;
sum_qsin_i[k] = .0 ;
for (iii=0;iii<get_num_groups(0);iii++)
{
sum_qcos_i[k] += sum_qcos_part[iii*NVECTOR_MAX+k] ;
sum_qsin_i[k] += sum_qsin_part[iii*NVECTOR_MAX+k] ;
}
}
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE) ;
for (k=0;k<nvect;k++)
{
fk = ( sum_qcos_i[k]*qsin[gti*NVECTOR_MAX+k] - sum_qsin_i[k]*qcos[gti*NVECTOR_MAX+k] ) ;
fr_i += cst_ewald[k] * fk * kvect[k] ;
}
#if defined(SCALAR_KERNELS)
forces_r[gti].x = fr_i.x;
forces_r[gti].y = fr_i.y;
forces_r[gti].z = fr_i.z;
forces_r[gti].w = .0 ;
#elif defined(VECTOR_KERNELS)
forces_r[gti] = fr_i;
#endif
} // end for gti
}
這個內核不起作用,我不知道為什么。
這里有些提示會很有幫助。
謝謝。
添加屏障可以解決問題:
/* Adds each work-item's private sums into the work-group's local arrays.
 * The outer loop walks over the local ids and only the work-item whose
 * local id equals ii executes the body of the branch. */
for ( ii = 0;ii<get_local_size(0);ii++ )
{
if (lti == ii)
{
/* NOTE(review): this barrier is still inside the (lti == ii) branch, so in
 * each iteration only one work-item reaches it.  The OpenCL specification
 * requires barrier() to be encountered by all work-items of the work-group
 * or by none; placing it after the closing brace of the branch would
 * satisfy that -- confirm against the target implementation. */
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE) ;
for (k=0;k<nvect;k++)
{ /* k-vectors */
sum_qcos_tmp[k] += sum_qcos_i[k] ; /* accumulates private data to local variable */
sum_qsin_tmp[k] += sum_qsin_i[k] ;
}
}
}
向量forces_r的最終值仍然是錯誤的,但現在始終相同。
實際上問題沒有解決。
我根據粒子的數量定義全局工作大小(global work size),以便每個工作項都處理一個粒子。
在計算中,我對每個粒子與一定數量的向量進行點積運算。
我現在的問題是,這些點積的結果取決於工作組的大小。 我將向量放入double4數組中。 全局工作大小在1000到10000左右,而我用於點積的向量數量始終在200左右。我想知道數組大小相對於全局工作大小是否有什么要求。
/* For every atom handled by this work-item (starting at get_global_id(0) and
 * advancing by get_global_size(0)), compute cos and sin of -dot(pos, k) for
 * each k-vector and store them in qcos/qsin with row stride NVECTOR_MAX. */
for (gti = get_global_id(0); gti < atom_number; gti += get_global_size(0))
{
pos_i = positions[gti];
for (k=0;k<nvect;k++) { /* sum over k-vectors to compute QCOS and QSIN for Ewald sum*/
/* NOTE(review): dot() on double4 operands includes the .w components;
 * presumably .w of positions and kvect is meant to be zero -- confirm that
 * the host initialises it. */
prod = dot((double4)pos_i,(double4)kvect[k]);
/* sincos returns the sine and stores the cosine through the pointer. */
ss = (double)sincos(-prod,&cc);
valqcos = cc ;
valqsin = ss ;
qcos[gti*NVECTOR_MAX+k] = valqcos ;
qsin[gti*NVECTOR_MAX+k] = valqsin ;
/* NOTE(review): plain assignment -- if this work-item runs the outer loop
 * more than once, only the values of its last atom remain in sum_q*_i. */
sum_qcos_i[k] = valqcos ; /* private variable */
sum_qsin_i[k] = valqsin ;
} /* end sum over k-vectors to compute QCOS and QSIN for Ewald sum*/
} // end for gti
這里有什么提示嗎?
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.