OpenCL：本地內存比 CPU 上的 L1 緩存快？

Question

我編寫了一個 OpenCL 內核，它對輸入矩陣執行框模糊。 該實現最初是為 GPU 編寫的，並使用本地內存來存儲工作組中工作項的鄰域。 然后，我在 CPU 上運行內核，並將運行時間與依賴於從全局內存自動緩存讀取而不是首先手動將它們存儲在本地內存中的實現進行比較。

在 CPU 沒有“本地內存”而是使用 RAM 的假設下，在 CPU 上使用本地內存應該弊大於利。 然而，“本地內存”內核比依賴緩存的內核快約 10 毫秒（在 8192x8192 矩陣上為 ~112 毫秒與 ~122 毫秒；兩種實現的工作項/工作組/“每個工作項計算的值的數量”均設置為最佳值，這些最佳值由自動調諧器針對兩個內核分別找到）。

內核使用主機上可用的 OpenCL 英特爾平台在 CPU 上運行。

發生這種情況的原因是什么？

“本地內存”內核：每個工作項都處理一個“塊”值。 每個塊都被復制到共享內存（即本地內存），它的鄰居也會根據塊在工作組中的位置被復制到本地內存，因此不會有值被復制兩次。 然后，在屏障（barrier）之后，計算最終值。
下面的代碼是X方向的內核； 除了檢查值以計算輸出值的方向外，y 方向內核完全相同。

// Horizontal (x-direction) box blur that stages the work-group's tile in
// local memory before averaging.
//
// Each work-item owns a BLOCKWIDTH x BLOCKHEIGHT block of pixels. The block is
// copied into `localmem`; work-items on the left/right edge of the work-group
// additionally load the S_L / S_R halo columns that the averaging window
// reaches into. After the barrier every output pixel is the mean of the
// S_L + S_R + 1 horizontally adjacent values read from local memory.
//
// image    : input matrix, row stride IMAGE_WIDTH
// localmem : scratch tile; the indexing below requires at least
//            (S_L + get_local_size(0) * X_BLOCKWIDTH + S_R)
//            * get_local_size(1) * X_BLOCKHEIGHT floats
// output   : result matrix, row stride get_global_size(0) * X_BLOCKWIDTH
//
// IMAGE_WIDTH, MASK_SIZE_LEFT, MASK_SIZE_RIGHT, X_BLOCKHEIGHT and X_BLOCKWIDTH
// are not defined in this kernel and must be supplied at build time.
//
// Buffer arguments use `const` instead of __read_only/__write_only: the access
// qualifiers are only meaningful for image and pipe types.
__kernel void boxblur_x (__global const float* image,
                   __local float* localmem,
                   __global float* output)
{
    // row stride of the input matrix
    const int MATRIX_SIZE_X = IMAGE_WIDTH;

    // mask size: the x kernel has no vertical extent
    const int S_L = MASK_SIZE_LEFT;
    const int S_U = 0;
    const int S_R = MASK_SIZE_RIGHT;
    const int S_D = 0;
    const int SHAPE_SIZE_Y = S_U + S_D + 1;
    const int SHAPE_SIZE_X = S_L + S_R + 1;
    const int SHAPE_SIZE = SHAPE_SIZE_Y * SHAPE_SIZE_X;

    // tuning parameter
    // ---------------------------------------------------------------
    // work items in x dimension per work group, and this item's local id
    const int NUM_WI_X = get_local_size(0);
    const int LOCAL_ID_X = get_local_id(0);
    const int LOCAL_ID_Y = get_local_id(1);

    // size of blocks
    const int BLOCKHEIGHT = X_BLOCKHEIGHT;
    const int BLOCKWIDTH = X_BLOCKWIDTH;

    // position in matrix
    const int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
    const int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;

    // width of one row of the local tile, including both halos
    const int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;

    // position of this work-item's block inside the local tile
    const int LOCAL_POS_X = S_L + LOCAL_ID_X * BLOCKWIDTH;
    const int LOCAL_POS_Y = S_U + LOCAL_ID_Y * BLOCKHEIGHT;

    // row stride used when writing the result (unchanged from the original
    // expression, only hoisted out of the inner loop)
    const size_t OUTPUT_ROW_STRIDE = get_global_size(0) * BLOCKWIDTH;

    // copy values to local memory
    for (int i = 0; i < BLOCKHEIGHT; i++)
    {
        const int local_row  = (LOCAL_POS_Y + i) * LOCALMEM_WIDTH;
        const int global_row = (GLOBAL_POS_Y + i) * MATRIX_SIZE_X;

        // this work-item's own block
        for (int j = 0; j < BLOCKWIDTH; j++)
        {
            localmem[local_row + LOCAL_POS_X + j] = image[global_row + GLOBAL_POS_X + j];
        }

        // Left halo: only the left-most work-items of the group load it, so
        // no value is copied twice. Without this the averaging loop below
        // would read local memory that was never written.
        // NOTE(review): columns outside the image are clamped to the nearest
        // valid column; confirm this matches the intended border policy.
        if (LOCAL_ID_X == 0)
        {
            for (int h = 0; h < S_L; h++)
            {
                int gx = GLOBAL_POS_X - S_L + h;
                if (gx < 0)
                {
                    gx = 0;
                }
                localmem[local_row + h] = image[global_row + gx];
            }
        }

        // Right halo: loaded by the right-most work-items of the group.
        if (LOCAL_ID_X == NUM_WI_X - 1)
        {
            for (int h = 0; h < S_R; h++)
            {
                int gx = GLOBAL_POS_X + BLOCKWIDTH + h;
                if (gx > MATRIX_SIZE_X - 1)
                {
                    gx = MATRIX_SIZE_X - 1;
                }
                localmem[local_row + LOCAL_POS_X + BLOCKWIDTH + h] = image[global_row + gx];
            }
        }
    }

    // only when all work items have arrived here,
    // computation continues - otherwise, not all needed
    // values might be available in local memory
    barrier (CLK_LOCAL_MEM_FENCE);

    for (int i = 0; i < BLOCKHEIGHT; i++)
    {
        const int local_row = (LOCAL_ID_Y * BLOCKHEIGHT + i) * LOCALMEM_WIDTH;

        for (int j = 0; j < BLOCKWIDTH; j++)
        {
            // The window for the pixel at local column S_L + x starts S_L
            // columns to its left, i.e. at tile column x.
            const int window_start = LOCAL_ID_X * BLOCKWIDTH + j + local_row;

            float sum = 0;
            for (int b = 0; b <= S_L + S_R; b++)
            {
                sum += localmem[window_start + b];
            }

            // divide by size of mask
            const float pixelValue = sum / SHAPE_SIZE;

            // write new pixel value to output image
            output[GLOBAL_POS_X + j + ((GLOBAL_POS_Y + i) * OUTPUT_ROW_STRIDE)] = pixelValue;
        }
    }
}

“L1 緩存內核”：盡管有很多定義，但它的作用完全相同，但依賴於塊的全局內存緩存，而不是顯式管理本地內存。

// Index helpers for the cache-blocked stencil kernel.
//
// The output is split hierarchically: work-group block -> work-item block ->
// cache block -> element. The macros below are not self-contained:
//   * OUTPUT_SIZE_*, INPUT_SIZE_X, NUM_WG_*, NUM_WI_* and CACHE_BLOCK_SIZE_*
//     are not defined here and must be provided elsewhere (e.g. build options);
//   * WG_ID_*, WI_ID_*, ii, jj, input and output must be in scope wherever an
//     offset or accessor macro is expanded.

// extent of one work-group block / one work-item block
#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )

#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )

// origin of the current work-group block / work-item block
#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )

#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )

// number of cache blocks per work-item block, and origin of cache block (ii, jj)
#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )

#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )

// Column permutation inside a work-group block: column j maps to
// j / WI_BLOCK_SIZE_X + (j % WI_BLOCK_SIZE_X) * NUM_WI_X.
// reorder_inv is deliberately an alias of reorder, so that reads (a_wg) and
// writes (res_wg) address the same permuted column.
// NOTE(review): the mapping is not its own inverse in general, so the name
// "reorder_inv" is misleading; substituting the true inverse would misalign
// input and output columns.
#define reorder(j)     ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)

// Input accessors. (i, j) is the row/column relative to the named level;
// dy is added to the row and dx to the column.
#define view( i, j, dy, dx )    input[ ((i) + (dy)) * INPUT_SIZE_X + ((j) + (dx)) ]
#define a_wg( i, j, dy, dx )    view(  WG_BLOCK_OFFSET_Y   + (i), WG_BLOCK_OFFSET_X    + reorder(j), (dy), (dx) )
#define a_wi( i, j, dy, dx )    a_wg(  WI_BLOCK_OFFSET_Y   + (i), WI_BLOCK_OFFSET_X    + (j)       , (dy), (dx) )
#define a_cache( i, j, dy, dx ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j)       , (dy), (dx) )

// Output accessors, mirroring the input hierarchy. res_wg is defined exactly
// once: an earlier, conflicting definition with an unparenthesized argument
// was removed.
#define res(i, j)         output[ (i) * OUTPUT_SIZE_X + (j) ]
#define res_wg( i, j )    res(    WG_BLOCK_OFFSET_Y + (i)   , WG_BLOCK_OFFSET_X    + reorder_inv(j) )
#define res_wi( i, j )    res_wg( WI_BLOCK_OFFSET_Y + (i)   , WI_BLOCK_OFFSET_X    + (j)            )
#define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j)            )


// Returns the stencil result for one element: the sum of the
// SHAPE_SIZE_Y x SHAPE_SIZE_X input values addressed through a_cache(),
// divided by SHAPE_SIZE.
//
// input  : input matrix read through the a_cache() macro
// ii, jj : cache-block indices (consumed by the CACHE_BLOCK_OFFSET_* macros)
// i, j   : element row/column inside the cache block
float f_stencil( __global float* input, int ii, int jj, int i, int j )
{
  // The names below are not used directly; the offset macros that a_cache()
  // expands to refer to them, so they must keep exactly these spellings.
  const int WI_ID_Y = get_local_id(1);
  const int WI_ID_X = get_local_id(0);

  const int WG_ID_Y = get_group_id(1);
  const int WG_ID_X = get_group_id(0);

  // accumulate row by row, column by column
  float acc = 0;
  for( int dy = 0 ; dy < SHAPE_SIZE_Y ; ++dy )
  {
    for( int dx = 0 ; dx < SHAPE_SIZE_X ; ++dx )
    {
      acc += a_cache(i, j, dy, dx);
    }
  }

  const float mean = acc / SHAPE_SIZE;
  return mean;
}

// Stencil kernel that relies on the hardware cache rather than local memory.
// Every work-item walks its cache blocks and, for each element in a block,
// stores the value returned by f_stencil() through res_cache().
//
// input  : matrix passed on to f_stencil()
// output : matrix written through the res_cache() macro
__kernel void stencil( __global float* input,
                       __global float* output
                     )
{
  // Referenced by name from the offset macros behind res_cache();
  // the spellings must not change.
  const int WI_ID_Y = get_local_id(1);
  const int WI_ID_X = get_local_id(0);

  const int WG_ID_Y = get_group_id(1);
  const int WG_ID_X = get_group_id(0);

  // outer pair: cache blocks (ii/jj are read by the CACHE_BLOCK_OFFSET_* macros)
  for( int ii = 0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
  {
    for( int jj = 0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
    {
      // inner pair: elements within the current cache block
      for( int row = 0 ; row < CACHE_BLOCK_SIZE_Y ; ++row )
      {
        for( int col = 0 ; col < CACHE_BLOCK_SIZE_X ; ++col )
        {
          const float value = f_stencil( input, ii, jj, row, col );
          res_cache( row, col ) = value;
        }
      }
    }
  }
}

Answer 1

當您結合“L1 緩存”版本的循環時：

for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
 for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
  for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
   for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
     for( int y = 0 ; y < SHAPE_SIZE_Y(SU+SD+1) ; ++y )
       for( int x = 0 ; x < SHAPE_SIZE_X(SL+SR+1) ; ++x)
              ....  += a_cache(i, j, y, x);

和“本地”版本：

for (int i = 0; i < BLOCKHEIGHT; i++)
    for (int j = 0; j < BLOCKWIDTH; j++)
        for (int b = 0; b <= S_L + S_R; b++)
            ... +=input[...]

“a_cache”有很多計算

a_cache(i, j, y, x);

變成

a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j), x, y )

那變成

view(  WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X    + reorder(CACHE_BLOCK_OFFSET_X + (j)), (x), (y) )

那變成

view(  WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X    + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X )

, (x), (y) )

那變成

 input[ ((WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i))) + (x)) * INPUT_SIZE_X + ((WG_BLOCK_OFFSET_X    + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X) + (y)) ]

這是 9 次加法 + 2 次乘法 + 1 次取模 + 1 次除法。

“本地”版本有

 sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];

這是 4 次加法 + 3 次乘法，但沒有模也沒有除法。

“L1 緩存”版本需要保留 6 個循環的循環計數器，它們可能會使用更多的 CPU 寄存器甚至 L1 緩存。 數據緩存大小為每個 CPU 核心 128 kB 或每個線程 64 kB。 如果您為每個核心啟動 1024 個線程（每個核心是一個工作組，對嗎？）那么僅循環計數器就需要 1024 * 6 * 4 = 24kB L1。 這留下了 40kB 可供使用。 當您添加“const int WG_ID_X”和其他變量（其中 5 個）時，只剩下 20kB。 現在再加上“f_stencil”函數的參數所需的臨時“堆棧”變量，可能就沒有剩余的 L1 緩存了，從而降低效率。 “本地”版本使用了大約 10-12 個變量（未使用的變量可能被優化掉了？）並且沒有函數調用，所以它可能對 L1 更友好。

https://software.intel.com/en-us/node/540486

說

為了減少維護工作組的開銷，您應該創建盡可能大的工作組，這意味着 64 個或更多的工作項。 一個上限是訪問數據集的大小，因為最好不要超過單個工作組中L1 緩存的大小。

和

如果您的內核代碼包含屏障指令，工作組大小的問題就變成了一個權衡。 工作組中每個工作項需要的本地和私有內存越多，最佳工作組大小就越小。 原因是，屏障還會針對工作組中所有工作項使用的私有和本地內存總量發出復制指令，因為在繼續執行另一個工作項之前，每個到達屏障的工作項的狀態都會被保存。

您在“本地”版本中只有 1 個屏障，在此之前，使用了 8 個變量，因此不需要太多內存來復制？

OpenCL：本地內存比 CPU 上的 L1 緩存快？

問題描述

1 個解決方案

解決方案1
1 2016-09-19 22:04:34

OpenCL：本地內存比 CPU 上的 L1 緩存快？

問題描述

1 個解決方案

解決方案1 1 2016-09-19 22:04:34

解決方案1
1 2016-09-19 22:04:34