I use the uint64_t Radix sort provided by Louis Ricci (answered Aug 24 '15 at 18:00) Radix_Sort_Uint64 . Amazingly fast.
I have a data structure which contains 2, uint32_t items and want to sort a large array (20+ million) looking only at the first or last 32 bits, but I want the sort routine to move the entire 64 bit package as a unit.
Is there a C language uint64 Radix sort which orders based on a subset of the entire 64 bit quanta as if the data were masked with 0X1111111100000000?
Example C code. It uses fewer local variables than the example linked to in the original post, allowing a C compiler to use registers for those variables. This program takes less than 0.5 second to sort 20 million (20*1024*1024 = 20971520) 64 bit unsigned integers by the upper 32 bits on my system (Intel 3770K 3.5ghz cpu, Windows 7 Pro 64 bit).
/* radix sort via upper 32 bits of 64 bit unsigned integers */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
typedef unsigned long long uint64_t;
void RadixSort(uint64_t * pData, uint64_t * pTemp, size_t count)
{
size_t mIndex[4][256] = { 0 }; /* index matrix */
size_t * pmIndex; /* ptr to row of matrix */
size_t i, j, m, n;
uint64_t u;
for (i = 0; i < count; i++) { /* generate histograms */
u = pData[i];
mIndex[3][(u >> 32) & 0xff]++;
mIndex[2][(u >> 40) & 0xff]++;
mIndex[1][(u >> 48) & 0xff]++;
mIndex[0][(u >> 56) & 0xff]++;
}
for (j = 0; j < 4; j++) { /* convert to indices */
pmIndex = mIndex[j];
n = 0;
for (i = 0; i < 256; i++) {
m = pmIndex[i];
pmIndex[i] = n;
n += m;
}
}
for (i = 0; i < count; i++) { /* radix sort */
u = pData[i];
pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pTemp[i];
pData[mIndex[2][(u >> 40) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pData[i];
pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pTemp[i];
pData[mIndex[0][(u >> 56) & 0xff]++] = u;
}
}
#define COUNT (20*1024*1024) /* number of elements */
static clock_t dwTimeStart; /* clock values */
static clock_t dwTimeStop;
int main( )
{
uint64_t * pData;
uint64_t * pTemp;
uint64_t r;
size_t i;
/* allocate memory */
pData = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
if(pData == NULL){
return 0;
}
pTemp = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
if(pTemp == NULL){
free(pData);
return 0;
}
for(i = 0; i < COUNT; i++){ /* generate test data */
r = (((uint64_t)((rand()>>4) & 0xff))<< 0);
r |= (((uint64_t)((rand()>>4) & 0xff))<< 8);
r |= (((uint64_t)((rand()>>4) & 0xff))<<16);
r |= (((uint64_t)((rand()>>4) & 0xff))<<24);
r |= (((uint64_t)((rand()>>4) & 0xff))<<32);
r |= (((uint64_t)((rand()>>4) & 0xff))<<40);
r |= (((uint64_t)((rand()>>4) & 0xff))<<48);
r |= (((uint64_t)((rand()>>4) & 0xff))<<56);
pData[i] = r;
}
dwTimeStart = clock();
RadixSort(pData, pTemp, COUNT); /* sort array */
dwTimeStop = clock();
printf("Number of ticks %d\n", dwTimeStop-dwTimeStart);
for(i = 1; i < COUNT; i++){ /* check sort */
if((pData[i-1]>>32) > (pData[i]>>32)){
break;
}
}
if(i != COUNT)
printf("sort error\n");
free(pData);
return(0);
}
// =============================================================================
// Create 2 identical uint64[]s, mirror the first 32 bits in the last, sort one
// with qsort and the other with RadixSort11110000() and compare the results
int test_uint64_32_sort(EV_TIME_STR *tsa) {
uint64_t *la1, *la2;
int ii, lcount=3615232, zmem=0, debug=0, ucount=1, ri, er12=0, ep=63;
time_t tt; // This Time!
float rssec=2.0f; // RadixSort_SEConds elapsed time from Radixsort()
time_event(E_GENERATE_RANDOM_ARRAY, tsa, E_TIME_EVENT, 1, 0);
srand((unsigned) time(&tt));
la1=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_1", debug);
la2=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_2", debug);
for(ii=0; ii < lcount; ii++) { // Look only SHORT range
ri = rand(); // Random int
la1[ii] = ri << 32 + (0XFFFFFFFF - ri); // Reflect val in lower 32
}
// Make identical copies in the other []
time_event(E_MEMCPY_DOUBLE_ARRAY, tsa, E_TIME_EVENT, 1, 0);
memcpy((void *) la2, (void *) la1, lcount * sizeof(uint64_t));
time_event(E_QSORT_UINT64_ARRAY, tsa, E_TIME_EVENT, 1, 0);
qsort(la1, lcount, sizeof(uint64_t), comp_uint6411110000);
time_event(E_RADIX_SORT64_11110000, tsa, E_TIME_EVENT, 1, 0);
radixSort11110000(la2, lcount, &rssec, &ucount);
time_event(E_SPLIT_RGBJ_MEM, tsa, E_TIME_EVENT, 1, 0);
for(ii=er12=0; ii < lcount; ii++) {
if(la1[ii] != la2[ii]) {
er12++;
if((--ep) > 0) {
printf("II %d) Er%d, l1=0X%016llX, l2=0X%016llX\n",
ii, ep, la1[ii], la2[ii]); FF_SO; }
} // Count Error Mismatches
if(!(ii%100000)) {
printf("II %d) l1=0X%016llX, l2=0X%016llX\n",
ii, la1[ii], la2[ii]); FF_SO; }
}
printf("T63S: Er1/2 = %d \n", er12); FF_SO;
if(ucount) {
printf("T63S: RadixSort time = %.3f ms, unique=%d = %.3f%%\n",
rssec*1.0E3f, ucount, (100.0f * ucount / lcount)); FF_SO;
}
free_bb(la1); free_bb(la2);
}
// =============================================================================
// Based on original code from https://ideone.com/JHI0d9
// and suggestions from Ian Abbott
// https://stackoverflow.com/questions/47080353/radix-sort-c-code-to-sort-uint64-t-looking-only-at-32-msb-bits
// Hacked code may be unsuitable for any use and should not be used by anyone
// Sort uint64[] by looking ONLY at the
uint64_t *radixSort11110000(uint64_t *arrayA, uint32_t size) {
register uint64_t *array=arrayA; // Slam arg into Register!
register uint64_t std; // STanDard to compare others for uniqueness
register int dist=0; // Distinct, unique values found if *UCount arg is TRUE
register int ii; // Loop control
int64_t rtime, mtns; // Time in NanoSeconds!!!
rscounts4_t counts;
memset(&counts, 0, 256 * 4 * sizeof(uint32_t));
uint64_t * cpy = (uint64_t *)malloc(size * sizeof(uint64_t));
uint32_t o4=0, o3=0, o2=0, o1=0;
uint32_t t4, t3, t2, t1;
register uint32_t x;
// calculate counts
for(x = 0; x < size; x++) {
t4 = (array[x] >> 32) & 0xff;
t3 = (array[x] >> 40) & 0xff;
t2 = (array[x] >> 48) & 0xff;
t1 = (array[x] >> 56) & 0xff;
counts.c4[t4]++;
counts.c3[t3]++;
counts.c2[t2]++;
counts.c1[t1]++;
}
// convert counts to offsets
for(x = 0; x < 256; x++) {
t4 = o4 + counts.c4[x];
t3 = o3 + counts.c3[x];
t2 = o2 + counts.c2[x];
t1 = o1 + counts.c1[x];
counts.c4[x] = o4;
counts.c3[x] = o3;
counts.c2[x] = o2;
counts.c1[x] = o1;
o4 = t4;
o3 = t3;
o2 = t2;
o1 = t1;
}
// radix
for(x = 0; x < size; x++) {
t4 = (array[x] >> 32) & 0xff;
cpy[counts.c4[t4]] = array[x];
counts.c4[t4]++; }
for(x = 0; x < size; x++) {
t3 = (cpy[x] >> 40) & 0xff;
array[counts.c3[t3]] = cpy[x];
counts.c3[t3]++; }
for(x = 0; x < size; x++) {
t2 = (array[x] >> 48) & 0xff;
cpy[counts.c2[t2]] = array[x];
counts.c2[t2]++; }
for(x = 0; x < size; x++) {
t1 = (cpy[x] >> 56) & 0xff;
array[counts.c1[t1]] = cpy[x];
counts.c1[t1]++; }
free(cpy);
return array;
} // End radixSort_11110000().
// #############################################################################
// From: http://ideone.com/JHI0d9
// RadixSort---
typedef union {
struct {
uint32_t c4[256];
uint32_t c3[256];
uint32_t c2[256];
uint32_t c1[256];
};
uint32_t counts[256 * 4];
} rscounts4_t;
// =============================================================================
// Compare only the MSB 4 bytes of a uint64 by masking each with
// 0XFFFFFFFF00000000 before comparison
int comp_uint6411110000(const void *a, const void *b) {
return (
(
( ( *( (uint64_t *)a ) ) & 0XFFFFFFFF00000000ULL ) >
( ( *( (uint64_t *)b ) ) & 0XFFFFFFFF00000000ULL )
)
-
(
( ( *( (uint64_t *)a ) ) & 0XFFFFFFFF00000000ULL ) <
( ( *( (uint64_t *)b ) ) & 0XFFFFFFFF00000000ULL )
)
);
} // End Comp_Uint64_11110000().
// Both sorted arrays were identical.
// T63S: Er1/2 = 0
// TE: Top 90% events in desc order (7/312):
// Time= 0.157 sec = 71.282%, QSORT_UINT64_ARRAY , hits=1
// Time= 0.029 sec = 13.119%, RADIX_SORT64_11110000, hits=1
// Time= 0.026 sec = 11.872%, GENERATE_RANDOM_ARRAY, hits=1
// mult 71.282 /13.119 -> 5.433493
// The RadixSort is over 5x faster that the qsort
Perhaps the QSort comparitor could be handled more efficiently
The register optimized code from RGCLGR ["RADIX_SORT_UINT64_32R()"] is significantly faster than the original code sorting on only the 32 most significant bits:
Time= 1.851 sec = 15.648%, RADIX_SORT_FFFF0000 , hits=4, 0.463 sec each << OLD
Time= 1.552 sec = 13.120%, RADIX_SORT_UINT64_32R, hits=4, 0.388 sec each << NEW
It takes an average of 83.8% as long to run averaged over 4 runs with an array with 36.2 million values, 100% of which are distinct. After each run, the results of each sort are bit shifted >> 32
, compared and are identical.
Comparing to QSort and the previous version of Radix_Sort() on all 64 bits:
Time= 6.125 sec = 62.359%, QSORT_UINT64_ARRAY , hits=1
Time= 0.832 sec = 8.468%, RADIX_SORT_64 , hits=1 << OLD
Time= 0.770 sec = 7.842%, RADIX_SORT_UINT64_REG, hits=1 << NEW
The QSort takes almost 8 times as long as this new version. The original 'RADIX_SORT_64()' function is 8% slower.
Generalizing the code from RGCLGR to 48 and 64 bits and comparing sorting a uint64_t[32M] on 64, 48 and 32 bits with the RGCLGR method:
Time= 3.130 sec = 20.342%, RADIX_SORT_UINT64_REG, hits=4, 0.782 sec each
Time= 2.336 sec = 15.180%, RADIX_SORT_UINT64_48R, hits=4, 0.584 sec each
Time= 1.540 sec = 10.007%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
Note the expected linearity between 20%, 15% and 10% program time for 64, 48 and 32 bits considered for sort.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.