簡體   English   中英

64 位整數上的 C++ 與 C# 按位運算 - 性能

[英]C++ vs C# bitwise operations on 64-bit ints - performance

我將一個 2D 位字段存儲在一個包含 5 個無符號長整型(64 位)的數組中。 我想獲得最佳性能。 我在 C# 中工作,但我試圖通過在 C++ 中實現我的類來建立一個基准。

這里的問題是 C# 實現需要大約 10 秒才能完成,而 C++ 只需要大約 1 秒,也就是快了 10 倍。 C++ 是 VS2015 中的 x64 構建。 C# 是 VS2015 中的 x64 構建,使用 .NET 4.6。 當然,兩者都是 Release 構建。

編輯:稍微優化 C# 代碼后,與 C++ 1.3 秒相比,它仍然需要 7 到 8 秒。

注意: x86 中的 C++ 大約需要 6 秒才能完成。 我在 64 位機器上運行代碼。

問題:是什么讓 C++ 更快? 有沒有辦法將 C# 代碼優化為至少同樣快? (也許是一些不安全的魔法?)

讓我感到困惑的是,我們只是在談論遍歷數組和按位運算。 它不應該與 C++ 幾乎相同嗎?

示例代碼: 實現中有兩個簡單的函數。 Left() 和 Right() 分別將整個字段向左、向右移動 1 位,並在相鄰的 64 位整數之間正確地傳遞進位。

C++

#include <chrono>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace std::chrono;

/// Bit field of 5 x 64 = 320 bits that can be shifted by one bit at a time.
///
/// Cells_l[0] is the least significant word: Left() moves the top bit of
/// cell i into the bottom bit of cell i + 1, and Right() moves the bottom bit
/// of cell i into the top bit of cell i - 1.
class BitField
{
private:
    // Compile-time constants shared by every instance. As plain data members
    // they were mutable state stored (and initialized) in each object.
    static constexpr unsigned long long LEFTMOST_BIT = 0x8000000000000000ULL;
    static constexpr unsigned long long RIGHTMOST_BIT = 1ULL;
    static constexpr int CELL_COUNT = 5;

public:
    unsigned long long Cells_l[CELL_COUNT];

    /// Fills every cell with a value taken from rand().
    BitField()
    {
        for (int i = 0; i < CELL_COUNT; i++)
        {
            Cells_l[i] = static_cast<unsigned long long>(std::rand()); // Random initialization
        }
    }

    /// Shifts the whole field left by one bit; the top bit of the last cell
    /// is discarded and the bottom bit of the first cell becomes 0.
    void Left()
    {
        unsigned long long carry = 0;
        for (int i = 0; i < CELL_COUNT; i++)
        {
            // Save the outgoing top bit before the shift destroys it.
            const unsigned long long nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
            Cells_l[i] = (Cells_l[i] << 1) | carry;
            carry = nextCarry;
        }
    }

    /// Shifts the whole field right by one bit; the bottom bit of the first
    /// cell is discarded and the top bit of the last cell becomes 0.
    void Right()
    {
        unsigned long long carry = 0;
        for (int i = CELL_COUNT - 1; i >= 0; i--)
        {
            // Save the outgoing bottom bit, already positioned as a top bit.
            const unsigned long long nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
            Cells_l[i] = (Cells_l[i] >> 1) | carry;
            carry = nextCarry;
        }
    }
};

/// Benchmark driver: applies a fixed pattern of Left()/Right() shifts
/// 100000000 times to one BitField, then prints the elapsed milliseconds
/// followed by the five resulting cells.
int main()
{
    BitField bf;

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by wall-clock adjustments; high_resolution_clock gives no such guarantee.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 100000000; i++)
    {
        bf.Left();
        bf.Left();
        bf.Left();
        bf.Right();
        bf.Right();
        bf.Left();
        bf.Right();
        bf.Right();
    }
    const auto stop = std::chrono::steady_clock::now();

    const auto elapsedMs =
        std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();

    std::cout << "Time: " << elapsedMs << "\n\n";
    // Print to avoid compiler optimizations
    for (const unsigned long long cell : bf.Cells_l)
    {
        std::cout << cell << '\n';
    }

    return 0;
}

C#

using System;
using System.Diagnostics;

namespace TestCS
{
    /// <summary>
    /// Bit field held in a managed array of five 64-bit words that can be
    /// shifted one bit at a time. Word 0 is the least significant: a left
    /// shift carries the top bit of word i into the bottom bit of word i + 1.
    /// </summary>
    class BitField
    {
        const ulong LEFTMOST_BIT = 0x8000000000000000;
        const ulong RIGHTMOST_BIT = 1;

        static Random rnd = new Random();

        ulong[] Cells;

        /// <summary>Allocates the five words and fills each from <c>rnd.Next()</c>.</summary>
        public BitField()
        {
            Cells = new ulong[5];
            int index = 0;
            while (index < 5)
            {
                Cells[index] = (ulong)rnd.Next(); // Random initialization
                index++;
            }
        }

        /// <summary>
        /// Shifts the whole field left by one bit. The top bit of the last
        /// word is dropped; the bottom bit of the first word becomes 0.
        /// </summary>
        public void Left()
        {
            ulong incoming = 0;
            for (int cell = 0; cell < 5; cell++)
            {
                ulong current = Cells[cell];
                Cells[cell] = (current << 1) | incoming;
                // The bit pushed out at the top feeds the next word's bottom bit.
                incoming = (current & LEFTMOST_BIT) >> 63;
            }
        }

        /// <summary>
        /// Shifts the whole field right by one bit. The bottom bit of the
        /// first word is dropped; the top bit of the last word becomes 0.
        /// </summary>
        public void Right()
        {
            ulong incoming = 0;
            for (int cell = 4; cell >= 0; cell--)
            {
                ulong current = Cells[cell];
                Cells[cell] = (current >> 1) | incoming;
                // The bit pushed out at the bottom feeds the previous word's top bit.
                incoming = (current & RIGHTMOST_BIT) << 63;
            }
        }
    }

    /// <summary>
    /// Benchmark driver for <see cref="BitField"/>: times 100000000 rounds of
    /// a fixed Left()/Right() pattern and prints the elapsed milliseconds.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            const int Rounds = 100000000;
            BitField bf = new BitField();

            // Call to remove the compilation time from measurements
            bf.Left();
            bf.Right();

            Stopwatch sw = Stopwatch.StartNew();
            int round = 0;
            while (round < Rounds)
            {
                bf.Left();
                bf.Left();
                bf.Left();
                bf.Right();
                bf.Right();
                bf.Left();
                bf.Right();
                bf.Right();
                round++;
            }
            sw.Stop();

            Console.WriteLine($"Done in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
        }
    }
}

編輯:修復了示例代碼中的“nextCarry”拼寫錯誤。

部分差異可能來自兩個版本之間的代碼差異 - 您沒有在 C++ 的 Left 和 C# 的 Right 中給 nextCarry 賦值,但這些可能只是示例中的拼寫錯誤。

您可能想查看兩者的反匯編以了解差異,但這主要是由於 C++ 編譯器有更多時間花在優化代碼上。 在這種情況下,它展開循環,內聯所有函數調用(包括構造函數),並將 Cells_l 的所有內容放入寄存器中。 所以最終是一個只使用寄存器、完全不訪問內存的大循環。

我沒有看過 C# 編譯的輸出,但我懷疑它是否有任何接近的結果。

此外,如評論中所述,將 C# 代碼中的所有Cells.Length調用替換為 5(就像在 C++ 代碼中一樣)。

我從評論和@AntoninLejsek 刪除的答案中獲得了足夠的信息,我可以自己回答這個問題。

TL;DR C++ 編譯器在優化方面做得更好,並且在循環中完成 C# 托管數組訪問成本很高。 然而,不安全的代碼和固定訪問不足以匹配 C++。

看來我們需要手動優化 C# 代碼才能獲得與 C++ 相當的性能。

  1. 展開循環
  2. 使用不安全代碼進行固定數組訪問
  3. 不要重復訪問數組 - 而是將項目存儲到局部變量中。

以下 C# 代碼的運行速度與 C++ 代碼一樣快(實際上快了大約 100 毫秒)。 在 .NET 4.6 VS 2015 Release x64 上編譯。

/// <summary>
/// Bit field stored inline as a fixed-size buffer of five 64-bit words.
/// Word 0 is the least significant: a left shift carries the top bit of
/// word i into the bottom bit of word i + 1.
/// </summary>
unsafe struct BitField
{
    static Random rnd = new Random();
    public fixed ulong Cells[5];

    /// <summary>Fills the five words with values from <c>rnd.Next()</c>.</summary>
    /// <param name="nothing">
    /// Unused. Presumably present only so the struct can declare an explicit
    /// constructor - confirm against the targeted C# version.
    /// </param>
    public BitField(int nothing)
    {
        fixed (ulong* p = Cells)
        {
            for (int i = 0; i < 5; i++)
            {
                p[i] = (ulong)rnd.Next(); // Just some random number
            }
        }
    }

    /// <summary>
    /// Runs the benchmark pattern (left, left, left, right, right, left,
    /// right, right) with every shift unrolled by hand. The five words are
    /// copied into locals first and written back once at the end, so the loop
    /// itself never touches the buffer.
    /// </summary>
    /// <param name="iterations">
    /// Number of times the eight-shift pattern is applied. Defaults to the
    /// original benchmark count of 100000000.
    /// </param>
    public void StuffUnrolledNonManaged(int iterations = 100000000)
    {
        ulong u0;
        ulong u1;
        ulong u2;
        ulong u3;
        ulong u4;
        fixed (ulong *p = Cells)
        {
            u0 = p[0];
            u1 = p[1];
            u2 = p[2];
            u3 = p[3];
            u4 = p[4];
        }
        ulong carry = 0;
        ulong nextCarry = 0;

        for (int i = 0; i < iterations; i++)
        {

            //left (the top bit of u4 is discarded)
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;

            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;

            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;

            //right (the bottom bit of u0 is discarded)
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;

            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;

            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;

            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;

            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;

        }

        // Write the locals back to the buffer once, after all iterations.
        fixed (ulong* p = Cells)
        {
            p[0] = u0;
            p[1] = u1;
            p[2] = u2;
            p[3] = u3;
            p[4] = u4;
        }
    }
}

測試代碼

/// <summary>
/// Test driver: runs the unrolled benchmark once as a warm-up, then times a
/// second run and prints the elapsed milliseconds.
/// </summary>
static void Main(string[] args)
{
    BitField bf = new BitField(0);

    // Call to remove the compilation time from measurements
    bf.StuffUnrolledNonManaged();

    Stopwatch sw = Stopwatch.StartNew();
    bf.StuffUnrolledNonManaged();
    sw.Stop();

    Console.WriteLine($"Non managed access unrolled in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
}

此代碼在大約1.1 秒內完成。

注意:僅靠固定數組訪問不足以匹配 C++ 的性能。 如果我們不使用局部變量(即把 u0 的每次出現都替換為 p[0],其余依此類推),時間約為 3.6 秒。

如果我們只對問題中的代碼使用固定訪問(仍在循環中調用 Left() 和 Right() 函數),時間約為 5.8 秒。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM