
"Big Float" Mandelbrot 在 GPU 上的运行速度比 CPU 慢

[英]"Big Float" Mandelbrot runs slower on GPU than CPU

I just ported over my "big float" implementation over to OpenCL, but it turns out that the GPU was slower than the CPU?我刚刚将我的“大浮点”实现移植到 OpenCL,但事实证明 GPU 比 CPU 慢? That's quite surprising as I thought that Mandelbrot was an embarrassingly parallel problem.这很令人惊讶,因为我认为 Mandelbrot 是一个令人尴尬的平行问题。

GPU
Preview render completed. Time taken: 86.031792ms. Iterations: 32
Preview render completed. Time taken: 268.726288ms. Iterations: 96
Preview render completed. Time taken: 461.045258ms. Iterations: 160
Full render completed. Time taken: 5908.507812ms. Iterations: 160

CPU
Preview render completed. Time taken: 101.530037ms. Iterations: 32
Preview render completed. Time taken: 199.125870ms. Iterations: 96
Preview render completed. Time taken: 281.195496ms. Iterations: 160
Full render completed. Time taken: 3264.943115ms. Iterations: 160

Does anyone have experience implementing big floats on the GPU? Is it simply too complicated to use the GPU efficiently, so that this is better done on the CPU?

There doesn't seem to be a way to profile OpenCL on an M1 Mac, so I can't really answer that for myself :(

Details:

  • The GPU uses 32-bit ints, while the CPU uses 64-bit ints. This means the GPU needs 8 limbs to make a 256-bit number, while the CPU only needs 4. I did this because OpenCL offers neither inline ASM for ADC nor 128-bit ints, both of which I could use on the CPU. (A carry-propagation sketch follows this list.)

  • It seems to be much easier to profile and optimise for the CPU at the moment. I wonder if that would be the better option.

  • I've confirmed that the kernel is the bottleneck, not the CPU-GPU transfer (1ms) or blitting the image using SDL (1ms).
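
For reference, the carry out of a 32-bit limb addition can be recovered in plain OpenCL C without ADC by checking for wrap-around. A minimal sketch, assuming nothing beyond standard OpenCL C (addc32 is an illustrative name, not part of the code below):

uint addc32(uint a, uint b, uint carry_in, uint *carry_out)
{
    uint sum = a + b;            // wraps modulo 2^32 on overflow
    uint c = (uint)(sum < a);    // wrap-around means a carry occurred
    sum += carry_in;
    c += (uint)(sum < carry_in); // at most one of the two carries can fire
    *carry_out = c;              // always 0 or 1
    return sum;
}

This avoids widening every limb to 64 bits, which may or may not be faster than the ulong approach below on a given GPU.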

Code for the GPU version:

enum sign
{
    SIGN_NEG,
    SIGN_ZERO,
    SIGN_POS
};

// Sign-magnitude fixed-point big float; man[0] is the most significant word.
struct __attribute__ ((packed)) fp256
{
    enum sign sign;
    CL_UINT man[8];
};

struct __attribute__ ((packed)) fp512
{
    enum sign sign;
    CL_UINT man[16];
};

struct fp256 fp_uadd256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp256 fp_usub256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] - (CL_ULONG)b.man[i] - (CL_ULONG)carry;
        carry = (char)(temp >> 63); // Check if wrapped around.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp512 fp_uadd512(struct fp512 a, struct fp512 b)
{
    struct fp512 c;
    char carry = 0;
    for (int i = 15; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

enum cmp
{
    CMP_SAME,
    CMP_A_BIG,
    CMP_B_BIG
};

enum cmp fp_ucmp256(struct fp256 a, struct fp256 b)
{
    struct fp256 c = fp_usub256(a, b);
    bool is_negative = (c.man[0] >> 31) == 1;
    if (is_negative)
        return CMP_B_BIG;
    else
    {

// Here, we check if the difference is 0. If so, then both a and b are the same,
// but if not, then the difference must be positive, and thus a > b.

        for (int i = 0; i < 8; i++)
            if (c.man[i] != 0)
                return CMP_A_BIG;
        return CMP_SAME;
    }
}

struct fp256 fp_sadd256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO && b.sign == SIGN_ZERO)
        return a;
    if (b.sign == SIGN_ZERO)
        return a;
    if (a.sign == SIGN_ZERO)
        return b;
    if ((a.sign == SIGN_POS && b.sign == SIGN_POS) ||
        (a.sign == SIGN_NEG && b.sign == SIGN_NEG))
    {
        struct fp256 c = fp_uadd256(a, b);
        c.sign = a.sign;
        return c;
    }

    //assert((a.sign == SIGN_POS && b.sign == SIGN_NEG) ||
    //       (a.sign == SIGN_NEG && b.sign == SIGN_POS));

    enum cmp cmp = fp_ucmp256(a, b);
    if (cmp == CMP_SAME)
        return (struct fp256) { SIGN_ZERO, {0} };
    
    if (a.sign == SIGN_POS && b.sign == SIGN_NEG)
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_POS;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_NEG;
            return c;
        }
    }
    else
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_NEG;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_POS;
            return c;
        }
    }
}

struct fp256 fp_sinv256(struct fp256 a)
{
    if (a.sign == SIGN_POS) a.sign = SIGN_NEG;
    else if (a.sign == SIGN_NEG) a.sign = SIGN_POS;
    return a;
}

struct fp256 fp_ssub256(struct fp256 a, struct fp256 b)
{
    return fp_sadd256(a, fp_sinv256(b));
}

struct fp256 fp_smul256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO || b.sign == SIGN_ZERO)
        return (struct fp256) { SIGN_ZERO, {0} };

    enum sign sign;
    if (a.sign == SIGN_NEG && b.sign == SIGN_NEG)
        sign = SIGN_POS;
    else if (a.sign == SIGN_NEG || b.sign == SIGN_NEG)
        sign = SIGN_NEG;
    else
        sign = SIGN_POS;

    struct fp512 c = {0};
    for (int i = 7; i >= 0; i--) // a
    {
        for (int j = 7; j >= 0; j--) // b
        {
            int low_offset = 15 - (7 - i) - (7 - j); // = i + j + 1
            //assert(low_offset >= 1);
            int high_offset = low_offset - 1;

            CL_ULONG mult = (CL_ULONG)a.man[i] * (CL_ULONG)b.man[j];
            struct fp512 temp = {0};
            temp.man[low_offset] = (CL_UINT)mult;
            temp.man[high_offset] = mult >> 32;

            c = fp_uadd512(c, temp);
        }
    }

    struct fp256 c256;
    c256.sign = sign;
    for (int i = 1; i <= 8; i++)
        c256.man[i - 1] = c.man[i];

    return c256;
}

struct fp256 fp_ssqr256(struct fp256 a)
{
    return fp_smul256(a, a);
}

struct fp256 fp_asr256(struct fp256 a)
{
    for (int i = 7; i >= 1; i--)
    {
        a.man[i] >>= 1;
        a.man[i] |= (a.man[i - 1] & 0x1) << 31;
    }
    a.man[0] >>= 1;
    return a;
}

struct fp256 int_to_fp256(int a)
{
    if (a == 0)
        return (struct fp256){ SIGN_ZERO, {0} };
    
    struct fp256 b = {0};
    if (a < 0)
    {
        b.sign = SIGN_NEG;
        a = -a;
    }
    else
        b.sign = SIGN_POS;
    
    b.man[0] = (CL_UINT)a;
    return b;
}

That's quite surprising as I thought that Mandelbrot was an embarrassingly parallel problem.

It is, but the amount of work is not uniform across work-items in either dimension. Some work-items complete in only 5 iterations while others require 100. Those that complete early simply idle until the last pixel (200 iterations?) in the same tile is computed.

To optimize away the idle cycles spent waiting on busy (high-iteration) pixels, you can do this:

  • have 1 GPU work-item take an 8x8 tile of pixels (10k work-items for an 800x800 image)
  • "sample" 4 corners of the tile using the GPU workitem running the kernel使用运行 kernel 的 GPU 工作项“采样”瓷砖的 4 个角
  • if the sampled iteration values are relatively close to each other, spawn a mini-kernel from that work-item (using dynamic parallelism) and compute the 8x8 region with that mini-kernel on 64 work-items (GPU threads)
  • if the sampled iteration values differ widely, have the parent work-item compute the whole 8x8 region itself, with no idle cycles between pixel computations

Another option is to send those "divergent" tiles to the CPU and compute only the "steady" tiles on the GPU (see the sketch below). This combines the best of both worlds while wasting relatively few GPU cycles. You could also sort all tiles by their estimated average iteration count and run the highest ones only on the GPU and the lowest ones only on the CPU.

Sampling the corners of an 8x8 tile costs roughly 1/16 of the tile's total compute effort (4 of its 64 pixels), so this relatively cheap preprocessing lets you distribute the work fairly.
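
A minimal sketch of the corner-sampling dispatch in plain OpenCL C (OpenCL 2.0 device-side enqueue is not available on Apple's OpenCL 1.2 stack, so this version flags divergent tiles for the CPU instead). WIDTH, TILES_X, THRESHOLD, and iterate() are illustrative assumptions, not part of the original code; iterate() would run the fp256 Mandelbrot loop for one pixel and return its iteration count.

#define WIDTH     800
#define TILES_X   (WIDTH / 8)
#define THRESHOLD 8              // illustrative; tune on real images

uint iterate(int px, int py);    // assumed: fp256 escape-time loop for one pixel

__kernel void render_tiles(__global uint *out, __global uchar *needs_cpu)
{
    int tx = get_global_id(0) * 8; // tile origin in pixels
    int ty = get_global_id(1) * 8;

    // Sample the 4 corners: 4 of 64 pixels, ~1/16 of the tile's work.
    uint c0 = iterate(tx,     ty);
    uint c1 = iterate(tx + 7, ty);
    uint c2 = iterate(tx,     ty + 7);
    uint c3 = iterate(tx + 7, ty + 7);

    uint lo = min(min(c0, c1), min(c2, c3));
    uint hi = max(max(c0, c1), max(c2, c3));

    if (hi - lo > THRESHOLD)
    {
        // Divergent tile: flag it so the host can recompute it on the CPU.
        needs_cpu[get_global_id(1) * TILES_X + get_global_id(0)] = 1;
        return;
    }

    // Steady tile: all 64 pixels take similar iteration counts, so one
    // work-item can compute them back to back with little wasted work.
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            out[(ty + y) * WIDTH + (tx + x)] = iterate(tx + x, ty + y);
}

The corner samples could also be written to out so they are not recomputed; that is omitted here for brevity.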
