基数排序优化

Question

I was trying to optimize the Radix Sort code, because I felt there was room for it as traditional codes in books and on web seem a direct copy of one another and also they work very slow as they take an arbitrary number such as 10 for modulo operation. 我正在尝试优化Radix Sort代码，因为我觉得仍有空间，因为书本和网络上的传统代码似乎是彼此的直接复制，并且它们的工作速度非常慢，因为它们取任意数字（例如10）作为模数操作。 I have optimized the code as far as I could go, maybe I might have missed some optimization techniques. 我已经尽可能地优化了代码，也许我可能错过了一些优化技术。 In that case please enlighten me. 在这种情况下，请赐教。

Motivation for optimization: 优化动机：
http://codercorner.com/RadixSortRevisited.htm http://codercorner.com/RadixSortRevisited.htm
http://stereopsis.com/radix.html http://stereopsis.com/radix.html
I was unable to implement all the optimizations in the articles, mostly it was beyond my skills and understanding and lack of sufficient time, if you can feel free to implement them. 我无法实现文章中的所有优化，如果您可以随意实现它们，则主要是超出了我的技能和理解范围，并且没有足够的时间。

EDIT 4 编辑4
This Java version of Radix Sort calculates all histograms in 1 read and does not need to fill array Z with zeros after every LSB sort along with the usual ability to skip sorting and jump to next LSB sorting if all previous LSB's are same. 此Java版本的Radix Sort会计算1次读取中的所有直方图，并且不需要在每次LSB排序后用零填充数组Z，并且如果以前的所有LSB都相同，则通常可以跳过排序并跳到下一个LSB排序。 As usual this is only for 32-bit integers but a 64-bit version can be created from it. 通常，这仅适用于32位整数，但可以从中创建64位版本。

protected static int[] DSC(int A[])// Sorts in descending order
{
    int tmp[] = new int[A.length] ;
    int Z[] = new int[1024] ;
    int i, Jump, Jump2, Jump3, Jump4, swap[] ;

    Jump = A[0] & 255 ;
    Z[Jump] = 1 ;
    Jump2 = ((A[0] >> 8) & 255) + 256 ;
    Z[Jump2] = 1 ;
    Jump3 = ((A[0] >> 16) & 255) + 512 ;
    Z[Jump3] = 1 ;
    Jump4 = (A[0] >> 24) + 768 ;
    Z[Jump4] = 1 ;

    // Histograms creation
    for (i = 1 ; i < A.length; ++i)
    {
        ++Z[A[i] & 255] ;
        ++Z[((A[i] >> 8) & 255) + 256] ;
        ++Z[((A[i] >> 16) & 255) + 512] ;
        ++Z[(A[i] >> 24) + 768] ;
    }

    // 1st LSB Byte Sort
    if( Z[Jump] != A.length )
    {
        Z[0] = A.length - Z[0];
        for (i = 1; i < 256; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[A[i] & 255]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 2nd LSB Byte Sort
    if( Z[Jump2] != A.length )
    {
        Z[256] = A.length - Z[256];
        for (i = 257; i < 512; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[((A[i] >> 8) & 255) + 256]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 3rd LSB Byte Sort
    if( Z[Jump3] != A.length )
    {
        Z[512] = A.length - Z[512];
        for (i = 513; i < 768; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[((A[i] >> 16) & 255) + 512]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 4th LSB Byte Sort
    if( Z[Jump4] != A.length )
    {
        Z[768] = A.length - Z[768];
        for (i = 769; i < Z.length; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[(A[i] >> 24) + 768]++] = A[i];
        }
        return tmp ;
    }
    return A ;
}

The Java version ran faster with != sign than == sign Java版本使用！=符号比使用==符号运行得更快

if( Z[Jump] != A.length )
{
    // lines of code
}...

but in C the below version was on average, 25% faster (with equalto sign) than its counterpart with != sign. 但是在C语言中，以下版本的平均速度（带有等于号）比带有！=符号的版本快25％。 Your hardware might react differently. 您的硬件可能会有不同的反应。

if( Z[Jump] == A.length );
else
{
    // lines of code
}...

Below is the C code ( "long" on my machine is 32 bits ) 下面是C代码（我机器上的“ long”是32位）

long* Radix_2_ac_long(long *A, size_t N, long *Temp)// Sorts in ascending order
{
    size_t Z[1024] = {0};
    long *swp;
    size_t i, Jump, Jump2, Jump3, Jump4;

    // Sort-circuit set-up
    Jump = *A & 255;
    Z[Jump] = 1;
    Jump2 = ((*A >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((*A >> 16) & 255) + 512;
    Z[Jump3] = 1;
    Jump4 = (*A >> 24) + 768;
    Z[Jump4] = 1;

    // Histograms creation
    for(i = 1 ; i < N ; ++i)
    {
        ++Z[*(A+i) & 255];
        ++Z[((*(A+i) >> 8) & 255) + 256];
        ++Z[((*(A+i) >> 16) & 255) + 512];
        ++Z[(*(A+i) >> 24) + 768];
    }

    // 1st LSB byte sort
    if( Z[Jump] == N );
    else
    {
        for( i = 1 ; i < 256 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[*(A+i) & 255] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z[Jump2] == N );
    else
    {
        for( i = 257 ; i < 512 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z[Jump3] == N );
    else
    {
        for( i = 513 ; i < 768 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort
    if( Z[Jump4] == N );
    else
    {
        for( i = 769 ; i < 1024 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[(*(A+i) >> 24) + 768] + Temp) = *(A+i);
        }
        return Temp;
    }
    return A;
}

EDIT 5 编辑5
The sort now handles negative numbers too. 现在，排序也可以处理负数。 Only some minor/negligible tweaks to the code did it. 仅对代码进行了一些细微/微不足道的调整。 It runs a little slower as a result but the effect is not significant. 结果，它的运行速度稍慢，但效果不明显。 Coded in C, below ( "long" on my system is 32 bits ) 下面用C编码（我的系统上的“ long”是32位）

long* Radix_Sort(long *A, size_t N, long *Temp)
{
    size_t Z[1024] = {0};
    long *swp;
    size_t Jump, Jump2, Jump3, Jump4;
    long i;

    // Sort-circuit set-up
    Jump = *A & 255;
    Z[Jump] = 1;
    Jump2 = ((*A >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((*A >> 16) & 255) + 512;
    Z[Jump3] = 1;
    Jump4 = ((*A >> 24) & 255) + 768;
    Z[Jump4] = 1;

    // Histograms creation
    for(i = 1 ; i < N ; ++i)
    {
        ++Z[*(A+i) & 255];
        ++Z[((*(A+i) >> 8) & 255) + 256];
        ++Z[((*(A+i) >> 16) & 255) + 512];
        ++Z[((*(A+i) >> 24) & 255) + 768];
    }

    // 1st LSB byte sort
    if( Z[Jump] == N );
    else
    {
        for( i = 1 ; i < 256 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[*(A+i) & 255] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z[Jump2] == N );
    else
    {
        for( i = 257 ; i < 512 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z[Jump3] == N );
    else
    {
        for( i = 513 ; i < 768 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort and negative numbers sort
    if( Z[Jump4] == N );
    else
    {
        for( i = 897 ; i < 1024 ; ++i )// -ve values frequency starts after index 895, i.e at 896 ( 896 = 768 + 128 ), goes upto 1023
        {
            Z[i] = Z[i-1] + Z[i];
        }
        Z[768] = Z[768] + Z[1023];
        for( i = 769 ; i < 896 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 24) & 255) + 768] + Temp) = *(A+i);
        }
        return Temp;
    }
    return A;
}

EDIT 6 编辑6
Below is the pointer optimized version ( accesses array locations via pointers ) that takes on average, approximately 20% less time to sort than the one above. 下面是指针优化的版本（通过指针访问数组位置），平均花费的时间比上面的少20％。 It also uses 4 separate arrays for faster address calculation ( "long" on my system is 32 bits ). 它还使用4个单独的数组来加快地址计算（我的系统上的“ long”是32位）。

long* Radix_Sort(long *A, size_t N, long *Temp)
{
    long Z1[256] ;
    long Z2[256] ;
    long Z3[256] ;
    long Z4[256] ;
    long T = 0 ;
    while(T != 256)
    {
        *(Z1+T) = 0 ;
        *(Z2+T) = 0 ;
        *(Z3+T) = 0 ;
        *(Z4+T) = 0 ;
        ++T;
    }
    size_t Jump, Jump2, Jump3, Jump4;

    // Sort-circuit set-up
    Jump = *A & 255 ;
    Z1[Jump] = 1;
    Jump2 = (*A >> 8) & 255 ;
    Z2[Jump2] = 1;
    Jump3 = (*A >> 16) & 255 ;
    Z3[Jump3] = 1;
    Jump4 = (*A >> 24) & 255 ;
    Z4[Jump4] = 1;

    // Histograms creation
    long *swp = A + N;
    long *i = A + 1;
    for( ; i != swp ; ++i)
    {
        ++Z1[*i & 255];
        ++Z2[(*i >> 8) & 255];
        ++Z3[(*i >> 16) & 255];
        ++Z4[(*i >> 24) & 255];
    }

    // 1st LSB byte sort
    if( Z1[Jump] == N );
    else
    {
        swp = Z1+256 ;
        for( i = Z1+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z1[*i & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z2[Jump2] == N );
    else
    {
        swp = Z2+256 ;
        for( i = Z2+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z2[(*i >> 8) & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z3[Jump3] == N );
    else
    {
        swp = Z3 + 256 ;
        for( i = Z3+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z3[(*i >> 16) & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort and negative numbers sort
    if( Z4[Jump4] == N );
    else
    {
        swp = Z4 + 256 ;
        for( i = Z4+129 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        *Z4 = *Z4 + *(Z4+255) ;
        swp = Z4 + 128 ;
        for( i = Z4+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A - 1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z4[(*i >> 24) & 255] + Temp) = *i;
        }
        return Temp;
    }
    return A;
}

Answer 1

The edit 4 version is good enough if the original and temp arrays fit in cache. 如果原始数组和临时数组适合缓存，那么edit 4版本就足够了。 If the array size is much greater than cache size, most of the overhead is due to the random order writes to the arrays. 如果数组大小比缓存大小大得多，则大多数开销是由于对数组的随机顺序写入所致。 A hybrid msb/lsb radix sort can avoid this issue. 混合的MSB / LSB基数排序可以避免此问题。 For example split the array into 256 bins according to the most significant byte, then do a lsb radix sort on each of the 256 bins. 例如，根据最高有效字节将数组拆分为256个bin，然后对256个bin中的每一个进行lsb基数排序。 The idea here is that a pair (original and temp) of bins will fit within the cache, where random order writes are not an issue (for most cache implementations). 这里的想法是，一对（原始和临时）bin将适合高速缓存，其中随机顺序写入不成问题（对于大多数高速缓存实现而言）。

For a 8MB cache, the goal is for each of the bins to be < 4MB in size = 1 million 32 bit integers if the integers evenly distribute into the bins. 对于8MB高速缓存，目标是每个整数的大小小于4MB = 1百万个32位整数（如果整数均匀地分布到二进制数中）。 This strategy would work for array size up to 256 million 32 bit integers. 该策略适用于最大为2.56亿个32位整数的数组。 For larger arrays, the msb phase could split up the array into 1024 bins, for up to 1 billion 32 bit integers. 对于较大的阵列，msb阶段可以将阵列分成1024个bin，最多可容纳10亿个32位整数。 On my system, sorting 16,777,216 (2^24) 32 bit integers with a classic 8,8,8,8 lsb radix sort took 0.45 seconds, while the hybrid 8 msb : 8,8,8 lsb took 0.24 seconds. 在我的系统上，使用经典的8,8,8,8 lsb基数排序对16,777,216（2 ^ 24）个32位整数进行排序需要0.45秒，而混合8 msb：8,8,8 lsb则需要0.24秒。

// split array into 256 bins according to most significant byte
void RadixSort(uint32_t * a, size_t count)
{
size_t aIndex[260] = {0};               // count / array
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i;
    for(i = 0; i < count; i++)          // generate histogram
        aIndex[1+((size_t)(a[i] >> 24))]++;
    for(i = 2; i < 257; i++)            // convert to indices
        aIndex[i] += aIndex[i-1];
    for(i = 0; i < count; i++)          //  sort by msb
        b[aIndex[a[i]>>24]++] = a[i];
    for(i = 256; i; i--)                // restore aIndex
        aIndex[i] = aIndex[i-1];
    aIndex[0] = 0;
    for(i = 0; i < 256; i++)            // radix sort the 256 bins
        RadixSort3(&b[aIndex[i]], &a[aIndex[i]], aIndex[i+1]-aIndex[i]);
    delete[] b;
}

// sort a bin by 3 least significant bytes
void RadixSort3(uint32_t * a, uint32_t *b, size_t count)
{
size_t mIndex[3][256] = {0};            // count / matrix
size_t i,j,m,n;
uint32_t u;
    if(count == 0)
        return;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 3; j++){
            mIndex[j][(size_t)(u & 0xff)]++;
            u >>= 8;
        }       
    }
    for(j = 0; j < 3; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 256; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }       
    }
    for(j = 0; j < 3; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<3))&0xff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
}

Example code for classic lsb radix sorts: 经典lsb基数排序的示例代码：

Example C++ lsb radix sort using 8,8,8,8 bit fields: 使用8、8、8、8位字段的示例C ++ lsb基数排序：

typedef unsigned int uint32_t;

void RadixSort(uint32_t * a, size_t count)
{
size_t mIndex[4][256] = {0};            // count / index matrix
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i,j,m,n;
uint32_t u;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 4; j++){
            mIndex[j][(size_t)(u & 0xff)]++;
            u >>= 8;
        }       
    }
    for(j = 0; j < 4; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 256; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }       
    }
    for(j = 0; j < 4; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<3))&0xff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
    delete[] b;
}

Example C++ code using 16,16 bit fields: 使用16,16位字段的示例C ++代码：

typedef unsigned int uint32_t;

uint32_t * RadixSort(uint32_t * a, size_t count)
{
size_t mIndex[2][65536] = {0};          // count / index matrix
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i,j,m,n;
uint32_t u;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 2; j++){
            mIndex[j][(size_t)(u & 0xffff)]++;
            u >>= 16;
        }
    }
    for(j = 0; j < 2; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 65536; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }
    }       
    for(j = 0; j < 2; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<4))&0xffff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
    delete[] b;
    return(a);
}

Answer 2

N & 15 , N & 31 , N & 63 .... and so on , which of these bitwise operations takes least time? N＆15，N＆31，N＆63 ....等等，这些按位运算中哪一个花费的时间最少？

They are same. 他们是一样的。 Do not take it bad, but optimizing for speed without knowing how long things last may end up quite bad. 不要把它弄坏，但是在不知道事情持续多长时间的情况下进行速度优化可能会变得非常糟糕。 And even when you know the timing, hardware is very complicated nowadays and quite unpredictable. 而且，即使您知道时间安排，如今的硬件也非常复杂且不可预测。 You program in java, that is another layer of insanely complex system. 您使用Java编程，这是非常复杂的系统的另一层。 The same code may be faster today and slower tomorrow. 相同的代码今天可能更快，明天可能更慢。 Your say approximately 2.232891909840167 times faster . 您的approximately 2.232891909840167 times faster 。 In reality, you have measurement on one hardware and software configuration with one set of data and you can only hope the measurement is representative enough. 实际上，您可以使用一组数据在一种硬件和软件配置上进行测量，并且只能希望测量结果具有足够的代表性。 Unfortunately, it is not always the case. 不幸的是，并非总是如此。

I rewrote your function. 我改写了你的职务。 It is shorter and simpler, yet does not seem to be slower. 它更短更简单，但似乎并不慢。 Compilers tend to like code that is not too clever, as there are many optimizations for simple cases. 编译器倾向于喜欢不太聪明的代码，因为对于简单情况有许多优化。 The correction for negative numbers is not particulary nice, you can delete it if you do not like it. 负数的校正不是特别好，如果您不喜欢它，可以将其删除。 It seems to work best for 8 bits and 11 bits, probably due to cache sizes, have a look at comments of rcgldr. 似乎由于高速缓存的大小，它最适合8位和11位，请查看rcgldr的注释。

EDIT 编辑

@ytoamn you are right, if all is in the first bucket the loop should continue, not break. @ytoamn您是对的，如果所有内容都在第一个存储桶中，则循环应继续而不是中断。 That was a bug. 那是一个错误。 To the other changes, I would rather avoid the contract you have done now. 对于其他更改，我宁愿避免您现在已经签订的合同。 I think there are three natural contracts for sorting function. 我认为排序功能有三个自然契约。 First one is sorting the original array and returning null. 第一个是对原始数组进行排序并返回null。 Second is sorting the original array and return it. 其次是对原始数组进行排序并返回它。 The third is returning new sorted array and keeping the original array intact. 第三是返回新的排序数组并保持原始数组完整。 I like the first one, as its behaviour is unambiguous. 我喜欢第一个，因为它的行为是明确的。 The way you have it now you should add big warning to the documentation, that the original array has changed and is returned from the function is some cases and in other not. 您现在拥有的方式应该在文档中添加重大警告，在某些情况下，原始数组已更改并从函数返回，在其他情况下则没有。 Second thing I would avoid is the old C code style. 我要避免的第二件事是旧的C代码样式。 You should define loop variable in the loop if you need it only there. 如果仅在循环中需要，则应在循环中定义循环变量。 Defining it globally injects dependency that may lead to bugs. 全局定义它会注入依赖关系，这可能会导致错误。 And it has no advantages here, as properly defined loop variables would share the space in the end anyway. 它在这里没有优势，因为正确定义的循环变量无论如何最终都会共享空间。 Compiler is well aware of the scope, you should use the smallest scope you need. 编译器非常了解范围，您应该使用所需的最小范围。

EDIT2 EDIT2

Feel free to comment directly under my post :-) Local variables are just addresses on the stack. 随意在我的帖子下直接发表评论:-)局部变量只是堆栈上的地址。 You allocate memory when constructing object which is not the case here. 您在构造对象时分配内存，但情况并非如此。 As for the array, think about this code: 至于数组，请考虑以下代码：

public static void Tst(int[] A) {
    int[] tmp = new int[A.length];
    A[0] = 6;
    A = tmp; // changes what parameter A contains
    A[0] = 7;
}

public static void main(String[] args) {
    int[] A = new int[1];
    A[0] = 5;
    Tst(A);
    System.out.println(A[0]); //prints 6
}

It prints 6. Number 7 is written into tmp array only. 打印6。数字7只写到tmp数组中。 Array A in main is not affected. main中的数组A不受影响。

protected static void ASC2(int A[], int bits) {
    int[] origA = A;
    int[] tmp = new int[A.length];
    int[] Z = new int[1 << bits];
    int mask = (1 << bits) - 1;

    for (int shift = 0; shift < 32; shift += bits) {

        if (shift > 0) {
            Arrays.fill(Z, 0);
        }
        for (int i = 0; i < A.length; ++i) {
            Z[(A[i] >> shift) & mask]++;
        }

        if (Z[0] == A.length) {
            continue; // all in first bucket
        }

        Z[Z.length - 1] = A.length - Z[Z.length - 1];
        for (int i = Z.length - 2; i >= 0; --i) {
            Z[i] = Z[i + 1] - Z[i];
        }

        if (shift + bits > 31) { // negative numbers correction
            int halfLength = Z.length / 2;
            int positSum = Z[halfLength];
            int negSum = A.length - positSum;
            if (negSum > 0) {
                for (int i = 0; i < halfLength; ++i) {
                    Z[i] += negSum;
                }
                for (int i = halfLength; i < Z.length; ++i) {
                    Z[i] -= positSum;
                }
            }
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> shift) & mask]++] = A[i];
        }

        int[] swap = A;
        A = tmp;
        tmp = swap;
    }

    if (A != origA) {
        System.arraycopy(A, 0, origA, 0, A.length);
    }
}

EDIT3 EDIT3

Loop unroll is a valid technique, improving short circuiting is really nice. 循环展开是一种有效的技术，改善短路确实很棒。 But with using array lengths as constants you definitely start to be too clever. 但是，使用数组长度作为常量时，您肯定会变得太聪明了。 If you hard coded the base size, why not hard code it all like this: 如果您对基本大小进行了硬编码，那么为什么不这样对它们进行硬编码：

protected static int[] DSC2(int A[])// sorts in descending order
{
    int tmp[] = new int[A.length];
    int Z[] = new int[256];
    int sample, swap[];

    // 1st LSB byte extraction
    sample = A[0] & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[A[i] & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[A[i] & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0;
    }

    // 2nd LSB byte extraction
    sample = (A[0] >> 8) & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[(A[i] >> 8) & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> 8) & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0;
    }

    // 3rd LSB byte extraction
    sample = (A[0] >> 16) & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[(A[i] >> 16) & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> 16) & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0;
    }

    // 4th LSB byte extraction
    sample = (A[0] >> 24) & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[(A[i] >> 24) & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> 24) & 255]++] = A[i];
        }

        A = tmp;
    }

    return A;
}

基数排序优化

问题描述

2 个解决方案

解决方案1
2 2017-06-28 02:32:04

解决方案2
1 2017-04-30 23:57:07

基数排序优化

问题描述

2 个解决方案

解决方案1 2 2017-06-28 02:32:04

解决方案2 1 2017-04-30 23:57:07

解决方案1
2 2017-06-28 02:32:04

解决方案2
1 2017-04-30 23:57:07