[英]Fastest way to multiply and sum/add two arrays (dot product) - unaligned surprisingly faster than FMA
嗨,我有以下代码:
/// <summary>
/// Benchmark fixture comparing several ways of computing the dot product of the
/// first <c>count</c> elements of two float arrays: a scalar loop,
/// <see cref="Vector{T}"/> over plain and over 32-byte-aligned storage, and
/// explicit 256-bit intrinsics with and without fused multiply-add.
/// </summary>
/// <remarks>
/// The aligned buffers are pinned with <see cref="GCHandle"/> for the lifetime of
/// the instance; call <see cref="Dispose"/> to release the pins. After disposal
/// the raw pointers held by this instance must no longer be dereferenced, so the
/// pointer-based benchmark methods must not be called any more.
/// </remarks>
public unsafe class MultiplyAndAdd : IDisposable
{
    /// <summary>Number of meaningful elements in every data array.</summary>
    private const int ElementCount = 1024;

    /// <summary>Required byte alignment of the aligned buffers (one 256-bit vector).</summary>
    private const int Alignment = 32;

    /// <summary>Maximum absolute difference tolerated by <see cref="Validate"/>.</summary>
    private const double Tolerance = 0.00001;

    private readonly float[] rawFirstData = new float[ElementCount];
    private readonly float[] rawSecondData = new float[ElementCount];

    // The aligned buffers are over-allocated by one alignment unit so that an
    // aligned window of ElementCount floats always fits inside them.
    private readonly float[] alignedFirstData = new float[ElementCount + Alignment / sizeof(float)];
    private readonly int alignedFirstDataOffset; // byte offset of the aligned window
    private GCHandle alignedFirstDataHandle;     // mutable struct: must not be readonly
    private readonly float* alignedFirstDataPointer;

    private readonly float[] alignedSecondData = new float[ElementCount + Alignment / sizeof(float)];
    private readonly int alignedSecondDataOffset; // byte offset of the aligned window
    private GCHandle alignedSecondDataHandle;     // mutable struct: must not be readonly
    private readonly float* alignedSecondDataPointer;

    /// <summary>Element counts fed to the benchmark methods, one <c>object[]</c> per run.</summary>
    public IEnumerable<object[]> Data { get; set; }

    /// <summary>
    /// Releases the pinned handles. Safe to call more than once: a handle is only
    /// freed while it is still allocated.
    /// </summary>
    public void Dispose()
    {
        if (this.alignedFirstDataHandle.IsAllocated)
        {
            this.alignedFirstDataHandle.Free();
        }

        if (this.alignedSecondDataHandle.IsAllocated)
        {
            this.alignedSecondDataHandle.Free();
        }
    }

    /// <summary>
    /// Calculates how many bytes must be skipped from the start of the pinned
    /// object so that the resulting address is a multiple of <see cref="Alignment"/>.
    /// </summary>
    /// <param name="handle">A pinned handle whose object address is inspected.</param>
    /// <returns>A byte offset in the range <c>[0, Alignment - 1]</c>.</returns>
    private static int CalculateAlignmentOffset(GCHandle handle)
    {
        long address = handle.AddrOfPinnedObject().ToInt64();

        // Round up to the next multiple of Alignment (a power of two).
        long alignedAddress = (address + Alignment - 1) & ~(long)(Alignment - 1);
        return (int)(alignedAddress - address);
    }

    /// <summary>
    /// Fills the raw arrays with reproducible pseudo-random values in [-2, 2),
    /// pins the aligned buffers, and copies the same values into their aligned windows.
    /// </summary>
    public MultiplyAndAdd()
    {
        // Fixed seed so every run operates on identical data.
        Random random = new Random(1055);
        for (var i = 0; i < ElementCount; i++)
        {
            rawFirstData[i] = (float)random.NextDouble() * 4f - 2f;
            rawSecondData[i] = (float)random.NextDouble() * 4f - 2f;
        }

        alignedFirstDataHandle = GCHandle.Alloc(alignedFirstData, GCHandleType.Pinned);
        alignedFirstDataOffset = CalculateAlignmentOffset(alignedFirstDataHandle);
        alignedFirstDataPointer = (float*)(alignedFirstDataHandle.AddrOfPinnedObject() + alignedFirstDataOffset);

        alignedSecondDataHandle = GCHandle.Alloc(alignedSecondData, GCHandleType.Pinned);
        alignedSecondDataOffset = CalculateAlignmentOffset(alignedSecondDataHandle);
        alignedSecondDataPointer = (float*)(alignedSecondDataHandle.AddrOfPinnedObject() + alignedSecondDataOffset);

        // The offsets are in bytes; array indices are in elements.
        Array.Copy(rawFirstData, 0, alignedFirstData, alignedFirstDataOffset / sizeof(float), ElementCount);
        Array.Copy(rawSecondData, 0, alignedSecondData, alignedSecondDataOffset / sizeof(float), ElementCount);

        // Other sizes that are currently disabled: 7, 11, 16, 30, 40.
        Data = new[] { 8, 20, 32, 50 }.Select(x => new object[] { x }).ToList();
    }

    /// <summary>
    /// Checks that the aligned copies match the raw data and that every
    /// implementation agrees with <see cref="Normal"/> for each count in <see cref="Data"/>.
    /// Implementations that need instruction sets the current CPU lacks are skipped.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// A copied element differs, or a result deviates from the scalar result by
    /// more than the tolerance (or is NaN).
    /// </exception>
    public void Validate()
    {
        int firstStart = alignedFirstDataOffset / sizeof(float);
        int secondStart = alignedSecondDataOffset / sizeof(float);

        for (var i = 0; i < ElementCount; i++)
        {
            // Exact comparison is intended here: the values are plain copies.
            if (rawFirstData[i] != alignedFirstData[i + firstStart])
            {
                throw new InvalidOperationException("Diff found!");
            }
            if (rawFirstData[i] != *(alignedFirstDataPointer + i))
            {
                throw new InvalidOperationException("Diff found!");
            }
            if (rawSecondData[i] != alignedSecondData[i + secondStart])
            {
                throw new InvalidOperationException("Diff found!");
            }
            if (rawSecondData[i] != *(alignedSecondDataPointer + i))
            {
                throw new InvalidOperationException("Diff found!");
            }
        }

        static void EnsureAlmostSame(string name, float normal, float other)
        {
            var diff = MathF.Abs(normal - other);

            // Written as a negated "<=" so that a NaN difference is rejected too.
            if (!(diff <= Tolerance))
            {
                throw new InvalidOperationException($"The difference between normal and {name} was {diff}");
            }
        }

        foreach (var count in Data.Select(x => (int)x[0]))
        {
            var normal = Normal(count);

            var vectorUnaligned = VectorUnaligned(count);
            EnsureAlmostSame(nameof(vectorUnaligned), normal, vectorUnaligned);

            var vectorAligned = VectorAligned(count);
            EnsureAlmostSame(nameof(vectorAligned), normal, vectorAligned);

            // The intrinsic paths throw on CPUs without the instruction set.
            if (Avx2.IsSupported)
            {
                var avx2Aligned = Avx2Aligned(count);
                EnsureAlmostSame(nameof(avx2Aligned), normal, avx2Aligned);
            }

            if (Fma.IsSupported)
            {
                var fmaAligned = FmaAligned(count);
                EnsureAlmostSame(nameof(fmaAligned), normal, fmaAligned);
            }
        }
    }

    /// <summary>Scalar reference implementation over the raw arrays.</summary>
    /// <param name="count">Number of leading elements to process; must not exceed the array length.</param>
    /// <returns>The sum of the element-wise products.</returns>
    //[Benchmark(Baseline = true)]
    [ArgumentsSource(nameof(Data))]
    public float Normal(int count)
    {
        var result = 0f;
        for (var i = 0; i < count; i++)
        {
            result += rawFirstData[i] * rawSecondData[i];
        }
        return result;
    }

    /// <summary>
    /// <see cref="Vector{T}"/> implementation reading directly from the raw
    /// (not explicitly aligned) arrays, with a scalar loop for the remainder.
    /// </summary>
    /// <param name="count">Number of leading elements to process; must not exceed the array length.</param>
    /// <returns>The sum of the element-wise products.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float VectorUnaligned(int count)
    {
        int vectorSize = Vector<float>.Count;
        var accVector = Vector<float>.Zero;
        int i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = new Vector<float>(rawFirstData, i);
            var secondVector = new Vector<float>(rawSecondData, i);
            var v = Vector.Multiply(firstVector, secondVector);
            accVector = Vector.Add(v, accVector);
        }
        float result = Vector.Sum(accVector);
        for (; i < count; i++)
        {
            result += rawFirstData[i] * rawSecondData[i];
        }
        return result;
    }

    /// <summary>
    /// <see cref="Vector{T}"/> implementation reading from the aligned windows of
    /// the pinned buffers, with a scalar loop for the remainder.
    /// </summary>
    /// <param name="count">Number of leading elements to process; must not exceed 1024.</param>
    /// <returns>The sum of the element-wise products.</returns>
    //[Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float VectorAligned(int count)
    {
        int vectorSize = Vector<float>.Count;
        var accVector = Vector<float>.Zero;

        // Loop-invariant element indices of the aligned windows.
        int firstStart = alignedFirstDataOffset / sizeof(float);
        int secondStart = alignedSecondDataOffset / sizeof(float);

        int i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = new Vector<float>(alignedFirstData, firstStart + i);
            var secondVector = new Vector<float>(alignedSecondData, secondStart + i);
            var v = Vector.Multiply(firstVector, secondVector);
            accVector = Vector.Add(v, accVector);
        }
        float result = Vector.Sum(accVector);

        // The remainder is taken from the aligned buffers as well, so the whole
        // method touches only the storage it is meant to measure.
        for (; i < count; i++)
        {
            result += alignedFirstData[firstStart + i] * alignedSecondData[secondStart + i];
        }
        return result;
    }

    /// <summary>
    /// 256-bit intrinsic implementation using aligned loads with separate multiply and add.
    /// </summary>
    /// <remarks>
    /// <paramref name="count"/> is not range-checked here; values above 1024 read
    /// past the pinned buffers through raw pointers.
    /// </remarks>
    /// <param name="count">Number of leading elements to process; must not exceed 1024.</param>
    /// <returns>The sum of the element-wise products.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float Avx2Aligned(int count)
    {
        int vectorSize = Vector256<float>.Count;
        var accumulationVector = Vector256<float>.Zero;
        var i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
            var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);
            var resultVector = Avx2.Multiply(firstVector, secondVector);
            accumulationVector = Avx2.Add(accumulationVector, resultVector);
        }

        // Horizontal sum: spill the accumulator and add its lanes one by one.
        var result = 0f;
        var temp = stackalloc float[vectorSize];
        Avx2.Store(temp, accumulationVector);
        for (int j = 0; j < vectorSize; j++)
        {
            result += temp[j];
        }

        // Scalar remainder.
        for (; i < count; i++)
        {
            result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
        }
        return result;
    }

    /// <summary>
    /// 256-bit intrinsic implementation using aligned loads and fused multiply-add.
    /// </summary>
    /// <remarks>
    /// <paramref name="count"/> is not range-checked here; values above 1024 read
    /// past the pinned buffers through raw pointers.
    /// </remarks>
    /// <param name="count">Number of leading elements to process; must not exceed 1024.</param>
    /// <returns>The sum of the element-wise products.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float FmaAligned(int count)
    {
        int vectorSize = Vector256<float>.Count;
        var accumulationVector = Vector256<float>.Zero;
        var i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
            var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);

            // accumulator = first * second + accumulator
            accumulationVector = Fma.MultiplyAdd(firstVector, secondVector, accumulationVector);
        }

        // Horizontal sum: spill the accumulator and add its lanes one by one.
        var result = 0f;
        var temp = stackalloc float[vectorSize];
        Avx2.Store(temp, accumulationVector);
        for (int j = 0; j < vectorSize; j++)
        {
            result += temp[j];
        }

        // Scalar remainder.
        for (; i < count; i++)
        {
            result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
        }
        return result;
    }
}
如果我在 Zen3 CPU 上运行这个基准测试,我会得到以下结果:
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19042.1586 (20H2/October2020Update)
AMD Ryzen 5 5600X, 1 CPU, 12 logical and 6 physical cores
.NET SDK=6.0.200
[Host] : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT
DefaultJob : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT
| Method | count | Mean | Error | StdDev |
|---------------- |------ |---------:|----------:|----------:|
| VectorUnaligned | 8 | 1.231 ns | 0.0093 ns | 0.0082 ns |
| Avx2Aligned | 8 | 3.576 ns | 0.0208 ns | 0.0195 ns |
| FmaAligned | 8 | 3.408 ns | 0.0259 ns | 0.0243 ns |
| VectorUnaligned | 20 | 4.428 ns | 0.0146 ns | 0.0122 ns |
| Avx2Aligned | 20 | 6.321 ns | 0.0578 ns | 0.0541 ns |
| FmaAligned | 20 | 5.845 ns | 0.0121 ns | 0.0113 ns |
| VectorUnaligned | 32 | 4.022 ns | 0.0098 ns | 0.0087 ns |
| Avx2Aligned | 32 | 5.205 ns | 0.0161 ns | 0.0150 ns |
| FmaAligned | 32 | 4.776 ns | 0.0265 ns | 0.0221 ns |
| VectorUnaligned | 50 | 6.901 ns | 0.0337 ns | 0.0315 ns |
| Avx2Aligned | 50 | 7.207 ns | 0.0476 ns | 0.0422 ns |
| FmaAligned | 50 | 7.246 ns | 0.0169 ns | 0.0158 ns |
为什么VectorUnaligned
比更优化的AVX2
和Fma
代码快得多?
如果我启用VectorAligned
它也比VectorUnaligned
慢。
这不算是答案,而是关于“最快的相乘方式”的一点提示。
抱歉,我不清楚该如何处理对齐问题,但您漏掉了对数组做类型转换(cast)这一选项。这可能比在循环中从源数组逐个取出浮点数更快。
// Dot product of the first `count` elements of rawFirstData and rawSecondData,
// reading the arrays as spans of Vector<float> instead of building a vector
// from the array on every iteration.
int vectorSize = Vector<float>.Count;
var accVector = Vector<float>.Zero;
// Slice to `count` BEFORE casting: casting the whole arrays would make the
// vector loop sum every element regardless of `count`. AsSpan throws
// ArgumentOutOfRangeException when `count` is negative or exceeds the array length.
// Cast rounds the length down, so a remainder of fewer than vectorSize elements
// is left for the scalar loop below.
Span<Vector<float>> firstVectors = MemoryMarshal.Cast<float, Vector<float>>(rawFirstData.AsSpan(0, count));
Span<Vector<float>> secondVectors = MemoryMarshal.Cast<float, Vector<float>>(rawSecondData.AsSpan(0, count));
for (int i = 0; i < firstVectors.Length; i++)
{
    accVector += Vector.Multiply(firstVectors[i], secondVectors[i]);
}
float result = Vector.Sum(accVector);
// Scalar tail: elements that did not fill a whole vector.
for (int i = firstVectors.Length * vectorSize; i < count; i++)
{
    result += rawFirstData[i] * rawSecondData[i];
}
它比VectorUnaligned
方法生成更多的 JIT 汇编代码,但第一个循环看起来短了一半,因为它只包含一次越界检查,而不是 4 次。不妨试一试,用不同的向量类型和对齐方式进行测试。
这个循环:
; JIT-generated loop body (presumably for the MemoryMarshal.Cast variant - confirm
; against the full listing): one 256-bit multiply and one accumulate per iteration.
L0080: movsxd rsi, r11d ; sign-extend the 32-bit loop index in r11d to 64 bits
L0083: shl rsi, 5 ; index * 32 = byte offset of the vector (a ymm register is 32 bytes)
L0087: vmovupd ymm1, [r8+rsi] ; unaligned 32-byte load from base r8
L008d: cmp r11d, r9d ; unsigned range check of the index against r9d (presumably the second span's length - confirm)
L0090: jae short L00ff ; throw out-of-range
L0092: vmovupd ymm2, [r10+rsi] ; unaligned 32-byte load from base r10
L0098: vmulps ymm1, ymm1, ymm2 ; packed single-precision multiply
L009c: vaddps ymm0, ymm0, ymm1 ; add the products into the accumulator ymm0
L00a0: inc r11d ; next vector index
L00a3: cmp r11d, edx ; compare the index with the loop bound in edx
L00a6: jl short L0080 ; repeat while index < bound (signed)
VectorUnaligned
循环,看起来 JIT 未能优化它
; JIT-generated loop body with four range checks per iteration (two per vector load).
L0020: mov r8, rdx ; r8 = rdx (presumably the first array reference - confirm)
L0023: cmp eax, [r8+8] ; range check: index eax against the dword at [r8+8] (presumably the array length)
L0027: jae L00c3 ; throw out-of-range
L002d: lea r9d, [rax+7] ; r9d = index + 7, the last element covered by an 8-float load
L0031: cmp r9d, [r8+8] ; range check for that last element
L0035: jae L00c3 ; throw out-of-range
L003b: vmovupd ymm1, [r8+rax*4+0x10] ; unaligned 32-byte load; index scaled by 4 bytes, data starts at offset 0x10
L0042: mov r8, [rcx+0x10] ; r8 = qword at [rcx+0x10] (presumably the second array, read from a field of `this` - confirm)
L0046: cmp eax, [r8+8] ; range check of the index against the second length
L004a: jae L00c3 ; throw out-of-range
L0050: cmp r9d, [r8+8] ; range check of index + 7 against the second length
L0054: jae short L00c3 ; throw out-of-range
L0056: vmovupd ymm2, [r8+rax*4+0x10] ; unaligned 32-byte load from the second base
L005d: vmulps ymm1, ymm1, ymm2 ; packed single-precision multiply
L0061: vaddps ymm0, ymm1, ymm0 ; add the products into the accumulator ymm0
L0065: add eax, 8 ; advance the index by 8 floats
L0068: mov r8d, [rdx+8] ; reload the dword at [rdx+8] on every iteration
L006c: sub r8d, 8 ; loop bound = that value - 8
L0070: cmp r8d, eax ; compare the bound with the index
L0073: jge short L0020 ; repeat while bound >= index (signed)
编译代码来自https://sharplab.io/ 。 实际生成的代码可能因 CPU 不同而异,因为Vector<T>.Count
在某些 CPU 上可能会有所不同。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.