[英]The fastest way to generate a random permutation
我需要以最快的方式在0
和N-1
之間置換N
個數字(在 CPU 上,沒有多線程,但可能使用 SIMD)。 N
不大,我想大多數情況下, N<=12
,所以N!
可以放進有符號的 32 位整數。
目前我嘗試過的大致如下(省略了一些優化;我的原始代碼是用 Java 寫的,但除了偽代碼之外,這裡都以 C++ 來討論性能):
#include <random>
#include <cstdint>
#include <iostream>
// Rotates the 64-bit value |x| left by |k| bit positions.
//
// The rotation count is reduced modulo 64 before shifting, so k == 0 (or any
// multiple of 64) returns |x| unchanged instead of evaluating `x >> 64`,
// which is an undefined shift on a 64-bit operand.
static inline uint64_t rotl(const uint64_t x, int k) {
  const unsigned left = static_cast<unsigned>(k) & 63u;
  // The complementary shift is masked as well so that left == 0 maps to 0.
  const unsigned right = (64u - left) & 63u;
  return (x << left) | (x >> right);
}
// Two-word generator state; main() fills it from std::random_device before
// the first call to Next().
static uint64_t s[2];

// Produces the next 64-bit pseudo-random value and advances the state in |s|.
// The returned value is computed from the state as it was on entry.
// NOTE(review): the constants (17 / 49 / 21 / 28) look like the xoroshiro128++
// update — confirm against the reference implementation.
uint64_t Next(void) {
  const uint64_t word0 = s[0];
  const uint64_t word1 = s[1];
  const uint64_t mixed = word1 ^ word0;

  s[0] = rotl(word0, 49) ^ mixed ^ (mixed << 21);  // a, b
  s[1] = rotl(mixed, 28);                          // c

  return rotl(word0 + word1, 17) + word0;
}
// Writes a pseudo-random permutation of 0..N-1 into dest[0..N-1].
//
// Assume the array |dest| must have enough space for N items.
// For N <= 1 (including negative N) no shuffling is performed.
//
// The shuffle walks the array front to back and swaps position i with a
// position chosen from [i, N). Choices are peeled off a single 64-bit draw
// with modulo/division. Once that draw can no longer cover the current ring
// size (this first happens for N > 20), a fresh value is pulled from Next()
// so the tail of the array still gets shuffled instead of being left in
// place. For N <= 20 exactly one Next() call is made.
void GenPerm(int* dest, const int N) {
  for (int i = 0; i < N; i++) {
    dest[i] = i;
  }

  uint64_t random = Next();
  // Largest value |random| can still hold after the divisions done so far.
  uint64_t range = ~static_cast<uint64_t>(0);

  for (int i = 0; i + 1 < N; i++) {
    const uint64_t ring = static_cast<uint64_t>(N - i);
    if (range < ring) {
      // The current draw is used up; start over with a new one.
      random = Next();
      range = ~static_cast<uint64_t>(0);
    }
    // I hope the compiler optimizes acquisition
    // of the quotient and modulo for the same
    // dividend and divisor pair into a single
    // CPU instruction, at least in Java it does
    const int pos = static_cast<int>(random % ring) + i;
    random /= ring;
    range /= ring;

    const int t = dest[pos];
    dest[pos] = dest[i];
    dest[i] = t;
  }
}
// Demo driver: seeds the generator state from std::random_device, then prints
// ten random permutations of 0..11, one per line.
int main() {
  std::random_device rd;
  // Build each 64-bit state word from two 32-bit draws with plain arithmetic.
  // Writing through a uint32_t* into the uint64_t array would violate the
  // aliasing rules; the first draw lands in the low half, which is what the
  // pointer cast produced on little-endian targets.
  for (int i = 0; i < 2; i++) {
    const uint64_t low = rd();
    const uint64_t high = rd();
    s[i] = (low & 0xFFFFFFFFu) | (high << 32);
  }

  int dest[20];
  for (int i = 0; i < 10; i++) {
    GenPerm(dest, 12);
    for (int j = 0; j < 12; j++) {
      std::cout << dest[j] << ' ';
    }
    // '\n' avoids flushing the stream on every line.
    std::cout << '\n';
  }
  return 0;
}
上面的速度很慢,因為 CPU 的模運算 ( %
) 很慢。 我可以考慮生成一個介於0
和N!-1
(含)之間的隨機數; 這將減少模運算和Next()
調用的次數,但我不知道如何進行。 另一種方法是用乘以整數倒數的乘法來代替除法運算,代價是生成的模數有小偏差,但我沒有這些整數倒數,而且乘法可能也不會快多少(按位運算和移位應該更快)。
有更具體的想法嗎?
更新:有人問我為什么它是實際應用程序中的瓶頸。 所以我剛剛發布了一個其他人可能感興趣的任務。 生產中的真正任務是:
// Element handed out by PickItem(). Only the availability flag is shown in
// this excerpt; the remaining members are elided.
struct Item {
uint8_t is_free_; // 0 or 1; PickItem() treats a non-zero value as "free"
// ... other members ...
};
// Returns a randomly chosen free item from the first array (in order) that
// contains at least one free item. If no array has a free item, returns an
// item (busy) from the last non-empty array, or nullptr when there are no
// items at all.
//
// Each array is partially shuffled in place while it is scanned: candidates
// are drawn at random from the not-yet-inspected suffix and swapped to the
// front, so the scan stops at the first free item it meets.
// NOTE(review): a single 64-bit draw is consumed per array, so arrays much
// longer than ~20 entries exhaust it — confirm sizes stay small.
Item* PickItem(const int time) {
  typedef std::vector<Item*>::size_type Index;

  // hash-map lookup, non-empty arrays
  std::vector<std::vector<Item*>>& arrays = GetArrays(time);
  Item* busy = nullptr;

  for (std::vector<std::vector<Item*>>::size_type i = 0; i < arrays.size(); i++) {
    std::vector<Item*>& group = arrays[i];
    const Index count = group.size();
    if (count == 0) {
      // Nothing to pick here; also keeps group[count - 1] below in range.
      continue;
    }

    uint64_t random = Next();
    for (Index j = 0; j + 1 < count; j++) {
      const Index ring = count - j;
      const Index pos = static_cast<Index>(random % ring) + j;
      random /= ring;

      Item* cur = group[pos];
      if (cur->is_free_) {
        // Return a random free item from the first array
        // where there is at least one free item
        return cur;
      }
      group[pos] = group[j];
      group[j] = cur;
    }

    // Only one uninspected item is left in this array.
    Item* cur = group[count - 1];
    if (cur->is_free_) {
      return cur;
    }
    // Return the busy item in the last array if no free
    // items are found
    busy = cur;
  }
  return busy;
}
我在 C++ 中提出了以下解決方案(雖然不能很好地移植到 Java,因為 Java 不允許以常量作為泛型參數——在 Java 中我不得不使用多態,並產生大量重複代碼):
#include <random>
#include <cstdint>
#include <iostream>
// Rotates the 64-bit value |x| left by |k| bit positions.
//
// The rotation count is reduced modulo 64 before shifting, so k == 0 (or any
// multiple of 64) returns |x| unchanged instead of evaluating `x >> 64`,
// which is an undefined shift on a 64-bit operand.
static inline uint64_t rotl(const uint64_t x, int k) {
  const unsigned left = static_cast<unsigned>(k) & 63u;
  // The complementary shift is masked as well so that left == 0 maps to 0.
  const unsigned right = (64u - left) & 63u;
  return (x << left) | (x >> right);
}
// Generator state (two 64-bit words); seeded in main() before use.
static uint64_t s[2];

// Returns a 64-bit pseudo-random value and steps the state held in |s|.
// The result depends only on the state at the time of the call.
// NOTE(review): rotation/shift constants 17, 49, 21, 28 appear to match
// xoroshiro128++ — confirm against the reference implementation.
uint64_t Next(void) {
  const uint64_t first = s[0];
  const uint64_t second = s[1];
  const uint64_t output = rotl(first + second, 17) + first;

  const uint64_t folded = first ^ second;
  s[1] = rotl(folded, 28);                          // c
  s[0] = rotl(first, 49) ^ folded ^ (folded << 21); // a, b

  return output;
}
// Shuffles dest[0..N-1] in place, where N is a compile-time constant.
// |random| supplies every choice: at each level the remainder modulo the
// current length picks the element swapped to the front, and the quotient is
// handed to the next level, which works on the remaining N-1 elements.
template<int N> void GenPermInner(int* dest, const uint64_t random);

// Lengths 0 and 1 have nothing to shuffle; these end the recursion.
template<> void GenPermInner<0>(int*, const uint64_t) {}
template<> void GenPermInner<1>(int*, const uint64_t) {}

template<int N> void GenPermInner(int* dest, const uint64_t random) {
  // Because N is a constant, the compiler can optimize the division
  // by N with more lightweight operations like shifts and additions
  const uint64_t leftover = random / N;
  const int chosen = static_cast<int>(random % N);

  int& front = dest[0];
  int& picked = dest[chosen];
  const int held = picked;
  picked = front;
  front = held;

  GenPermInner<N - 1>(dest + 1, leftover);
}
// Shuffles dest[0..N-1] in place; N is only known at run time.
//
// Assume the array |dest| must have enough space for N items.
// N <= 1 (including negative values) is a no-op: without this guard a negative
// N would reach the general path, index outside the array and never terminate.
//
// For N > 12 the leading elements are placed one at a time, each with its own
// Next() draw, until 12 elements remain. The remainder is then dispatched to
// the compile-time specialised GenPermInner<K>, which needs a single draw.
void GenPerm(int* dest, const int N) {
  if (N <= 1) {
    return;
  }

  int* head = dest;
  int remaining = N;
  // You can continue with larger numbers, so long as (N!-1) fits 64 bits
  while (remaining > 12) {
    const uint64_t random = Next();
    const int pos =
        static_cast<int>(random % static_cast<uint64_t>(remaining));
    const int t = head[pos];
    head[pos] = head[0];
    head[0] = t;
    ++head;
    --remaining;
  }

  switch (remaining) {
    case 2:  GenPermInner<2>(head, Next());  break;
    case 3:  GenPermInner<3>(head, Next());  break;
    case 4:  GenPermInner<4>(head, Next());  break;
    case 5:  GenPermInner<5>(head, Next());  break;
    case 6:  GenPermInner<6>(head, Next());  break;
    case 7:  GenPermInner<7>(head, Next());  break;
    case 8:  GenPermInner<8>(head, Next());  break;
    case 9:  GenPermInner<9>(head, Next());  break;
    case 10: GenPermInner<10>(head, Next()); break;
    case 11: GenPermInner<11>(head, Next()); break;
    case 12: GenPermInner<12>(head, Next()); break;
    default: break;  // not reachable: remaining is in [2, 12] here
  }
}
// Demo driver: seeds the generator state from std::random_device, fills the
// array with 0..N-1 once, then reshuffles and prints it ten times.
int main() {
  std::random_device rd;
  // Build each 64-bit state word from two 32-bit draws with plain arithmetic.
  // Writing through a uint32_t* into the uint64_t array would violate the
  // aliasing rules; the first draw lands in the low half, which is what the
  // pointer cast produced on little-endian targets.
  for (int i = 0; i < 2; i++) {
    const uint64_t low = rd();
    const uint64_t high = rd();
    s[i] = (low & 0xFFFFFFFFu) | (high << 32);
  }

  int dest[20];
  const int N = 12;
  // No need to init again and again
  for (int j = 0; j < N; j++) {
    dest[j] = j;
  }
  for (int i = 0; i < 10; i++) {
    GenPerm(dest, N);
    // Or, if you know N at compile-time, call directly
    // GenPermInner<N>(dest, Next());
    for (int j = 0; j < N; j++) {
      std::cout << dest[j] << ' ';
    }
    // '\n' avoids flushing the stream on every line.
    std::cout << '\n';
  }
  return 0;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.