Metal - optimizing memory access
This question has two parts, but they are closely related:

Does Metal provide a way to use shared threadgroup memory? For example, in CUDA you can explicitly load data from device memory into shared memory declared like this:

__shared__ float example1;

Does Metal offer anything similar? It appears that all buffer accesses load from global memory, unless some hidden magic is happening behind the scenes.

This may not be unique to Metal, so any GPU expert should be able to help. Apple provides a matrix multiplication example here; I'll paste the kernel below for reference:
typedef struct
{
    ushort m, k, n, pbytes, qbytes;
} MetalMatrixDim;

kernel void MatrixMultiply(const device float*      A    [[ buffer(0) ]],
                           const device float*      B    [[ buffer(1) ]],
                           device float*            C    [[ buffer(2) ]],
                           constant MetalMatrixDim& dims [[ buffer(3) ]],
                           ushort2                  gid  [[ thread_position_in_grid ]])
{
    ushort m = dims.m;
    ushort k = dims.k;
    ushort n = dims.n;
    ushort pbytes = dims.pbytes;
    ushort qbytes = dims.qbytes;

    ushort2 gidIn = ushort2(gid.x << 3, gid.y << 3);

    if (gidIn.x >= m || gidIn.y >= k) return;

    const device float4* a = (const device float4*)(A + gidIn.x);
    const device float4* b = (const device float4*)(B + gidIn.y);

    C = (device float*)((device char*)C + gidIn.x*qbytes);
    device float4* c = (device float4*)(C + gidIn.y);

    const device float4* Bend = (const device float4*)((const device char*)B + qbytes*n);

    float4 s0  = 0.0f, s1  = 0.0f, s2  = 0.0f, s3  = 0.0f;
    float4 s4  = 0.0f, s5  = 0.0f, s6  = 0.0f, s7  = 0.0f;
    float4 s8  = 0.0f, s9  = 0.0f, s10 = 0.0f, s11 = 0.0f;
    float4 s12 = 0.0f, s13 = 0.0f, s14 = 0.0f, s15 = 0.0f;

    do
    {
        float4 aCurr0 = a[0];
        float4 aCurr1 = a[1];
        float4 bCurr0 = b[0];
        float4 bCurr1 = b[1];

        s0  += (aCurr0.x * bCurr0);
        s2  += (aCurr0.y * bCurr0);
        s4  += (aCurr0.z * bCurr0);
        s6  += (aCurr0.w * bCurr0);

        s1  += (aCurr0.x * bCurr1);
        s3  += (aCurr0.y * bCurr1);
        s5  += (aCurr0.z * bCurr1);
        s7  += (aCurr0.w * bCurr1);

        s8  += (aCurr1.x * bCurr0);
        s10 += (aCurr1.y * bCurr0);
        s12 += (aCurr1.z * bCurr0);
        s14 += (aCurr1.w * bCurr0);

        s9  += (aCurr1.x * bCurr1);
        s11 += (aCurr1.y * bCurr1);
        s13 += (aCurr1.z * bCurr1);
        s15 += (aCurr1.w * bCurr1);

        a = (device float4*)((device char*)a + pbytes);
        b = (device float4*)((device char*)b + qbytes);
    } while (b < Bend);

    c[0] = s0;  c[1] = s1;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s2;  c[1] = s3;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s4;  c[1] = s5;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s6;  c[1] = s7;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s8;  c[1] = s9;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s10; c[1] = s11; c = (device float4*)((device char*)c + qbytes);
    c[0] = s12; c[1] = s13; c = (device float4*)((device char*)c + qbytes);
    c[0] = s14; c[1] = s15;
}
Question: For each thread, this kernel computes an 8 x 8 sector of the output C. What is the reason for this? Why not have each thread compute a single element of C, which would eliminate the multiple-of-8 size restriction and provide better parallelization for smaller matrices?

I assume this implementation must be an optimization of some kind, and I am guessing it is related to thread synchronization and memory access, which is why I bundled it with question 1.
I don't see the relationship between your two questions. Regarding question 1: yes, Metal provides shared threadgroup memory in compute functions. Simply specify the threadgroup address space qualifier on the variable declaration. For example:

threadgroup float example1;

You can also specify a threadgroup buffer as an input argument to a compute function:
kernel void my_func(...,
                    threadgroup float *example2 [[ threadgroup(0) ]],
                    ...)
{
    ...
}
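To make the CUDA analogy concrete, here is a minimal sketch (not from Apple's sample; kernel and buffer names are hypothetical) of a cooperative tile load into threadgroup memory, the Metal counterpart of a __shared__ staging pattern:

```metal
#include <metal_stdlib>
using namespace metal;

constant ushort TILE = 8;

// Hypothetical kernel: each threadgroup stages a TILE x TILE block of `src`
// in threadgroup memory before using it (here, to write out its transpose).
kernel void tile_transpose(const device float *src  [[ buffer(0) ]],
                           device float *dst        [[ buffer(1) ]],
                           constant ushort &width   [[ buffer(2) ]],
                           ushort2 tid [[ thread_position_in_threadgroup ]],
                           ushort2 gid [[ thread_position_in_grid ]])
{
    // Statically sized threadgroup array, declared inside the kernel.
    threadgroup float tile[TILE][TILE];

    // Each thread loads one element from device (global) memory.
    tile[tid.y][tid.x] = src[gid.y * width + gid.x];

    // Wait until every thread in the group has finished its load,
    // analogous to __syncthreads() in CUDA.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // All TILE x TILE values can now be read from fast threadgroup memory.
    dst[gid.y * width + gid.x] = tile[tid.x][tid.y];
}
```

The barrier is essential: without it, a thread may read a tile slot that a neighboring thread has not yet written.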
That buffer is allocated by the device. The size of the buffer is set using the compute command encoder's -setThreadgroupMemoryLength:atIndex: method.
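For illustration, a hedged host-side sketch in Swift of setting that length before dispatch (`encoder`, `pipeline`, and `groupCount` are assumed to exist already; they are not defined in this answer):

```swift
// Sketch: reserve space for a kernel argument declared as
// `threadgroup float *example2 [[ threadgroup(0) ]]`.
let threadsPerGroup = MTLSize(width: 8, height: 8, depth: 1)
let floatsPerGroup = threadsPerGroup.width * threadsPerGroup.height

encoder.setComputePipelineState(pipeline)
// One float per thread in the group, bound at threadgroup index 0.
encoder.setThreadgroupMemoryLength(floatsPerGroup * MemoryLayout<Float>.stride,
                                   index: 0)
encoder.dispatchThreadgroups(groupCount,
                             threadsPerThreadgroup: threadsPerGroup)
```

The index passed to setThreadgroupMemoryLength(_:index:) must match the [[threadgroup(n)]] attribute on the kernel parameter.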