简体   繁体   中英

Sampling a random subset from an array

What is a clean way of taking a random sample, without replacement from an array in javascript? So suppose there is an array

// Example input: the array to draw a random subset from.
x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

and I want to randomly sample 5 unique values; ie generate a random subset of length 5. To generate one random sample one could do something like:

// Picks one element at a random index. Each evaluation draws independently,
// so repeating it can return the same entry more than once.
x[Math.floor(Math.random()*x.length)];

But if this is done multiple times, there is a risk of grabbing the same entry multiple times.

I suggest shuffling a copy of the array using the Fisher-Yates shuffle and taking a slice:

/**
 * Return up to `size` elements of `arr`, chosen at random without replacement.
 *
 * A copy of `arr` is shuffled from the last position down to the first
 * (Fisher-Yates) and the first `size` entries of the shuffled copy are
 * returned. `arr` itself is not modified.
 *
 * @param {Array} arr - Source array.
 * @param {number} size - Number of elements to return.
 * @returns {Array} The first `size` elements of the shuffled copy.
 */
function getRandomSubarray(arr, size) {
    const copy = arr.slice(0);
    for (let last = arr.length - 1; last >= 0; last -= 1) {
        // Choose a partner anywhere in [0, last] and swap it into `last`.
        const pick = Math.floor((last + 1) * Math.random());
        const held = copy[pick];
        copy[pick] = copy[last];
        copy[last] = held;
    }
    return copy.slice(0, size);
}

// Example: ask getRandomSubarray for five members of x.
var x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];
var fiveRandomMembers = getRandomSubarray(x, 5);

Note that this will not be the most efficient method for getting a small random subset of a large array because it shuffles the whole array unnecessarily. For better performance you could do a partial shuffle instead:

/**
 * Return `size` elements of `arr`, chosen at random without replacement,
 * by shuffling only the tail of a copy (partial Fisher-Yates).
 *
 * `arr` is not modified. A `size` larger than `arr.length` returns every
 * element (in random order); a `size` of zero or less returns `[]`.
 *
 * @param {Array} arr - Source array.
 * @param {number} size - Number of elements to return.
 * @returns {Array} The shuffled tail of the copy.
 */
function getRandomSubarray(arr, size) {
    var shuffled = arr.slice(0), i = arr.length, temp, index;
    // Clamp the request to [0, arr.length]. Without this, an oversized
    // `size` makes `min` negative, the loop walks past index 0 (touching
    // shuffled[-1]) and slice(min) returns the wrong number of elements.
    var count = Math.max(0, Math.min(size, i));
    var min = i - count;
    while (i-- > min) {
        // Swap position i with a random position in [0, i].
        index = Math.floor((i + 1) * Math.random());
        temp = shuffled[index];
        shuffled[index] = shuffled[i];
        shuffled[i] = temp;
    }
    return shuffled.slice(min);
}

A little late to the party but this could be solved with underscore's new sample method (underscore 1.5.2 - Sept 2013):

var x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];

// `_` is assumed to be underscore (1.5.2 or later, per the surrounding text).
var randomFiveNumbers = _.sample(x, 5);

Or... if you use underscore.js...

// Declared explicitly: a bare assignment creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
const _und = require('underscore');

...

/**
 * Return the first `n` items of a shuffled version of `a`, using the
 * `shuffle` and `take` functions of `_und`.
 *
 * @param {Array} a - Collection to sample from.
 * @param {number} n - Number of items to take.
 * @returns {Array} Result of `_und.take` applied to the shuffled collection.
 */
function sample(a, n) {
    const reordered = _und.shuffle(a);
    return _und.take(reordered, n);
}

Simple enough.

In my opinion, shuffling the entire deck is not necessary. You just need to make sure your sample is random, not your deck. What you can do is select `size` items from the front, then swap each one with the item at another random position in the sampling array. So, if you allow replacement (the array is modified in place), it gets more and more shuffled.

/**
 * Scale Math.random() by `length` and round down.
 *
 * @param {number} length - Multiplier applied to the random value.
 * @returns {number} Math.floor(Math.random() * length).
 */
function getRandom(length) {
    const scaled = Math.random() * length;
    return Math.floor(scaled);
}

/**
 * Return `size` randomly chosen elements of `array`.
 *
 * Each of the first `size` positions is swapped with a random position,
 * then the first `size` entries are returned. NOTE: the swaps are done on
 * `array` itself, so the caller's array is reordered by every call.
 *
 * @param {Array} array - Array to sample from (reordered in place).
 * @param {number} size - Number of elements wanted; clamped to
 *     [0, array.length] and rounded down.
 * @returns {Array} A new array holding the sampled elements.
 */
function getRandomSample(array, size) {
    const length = array.length;
    // Clamp and floor the request. An oversized `size` would swap with
    // positions past the end (growing the array with undefined entries),
    // and a negative or fractional `size` would never reach 0 in the
    // countdown loop below.
    const count = Math.max(0, Math.min(Math.floor(size), length));

    for (let i = count; i--;) {
        const index = getRandom(length);
        const temp = array[index];
        array[index] = array[i];
        array[i] = temp;
    }

    return array.slice(0, count);
}

This algorithm is only 2*size steps, if you include the slice method, to select the random sample.


More Random

To make the sample more random, we can randomly select the starting point of the sample. But it is a little more expensive to get the sample.

/**
 * Return `size` randomly chosen elements of `array`, taken from a window
 * that starts at a random position and wraps around the end.
 *
 * Every position in the window is swapped with a random position, then the
 * window is returned. NOTE: the swaps are done on `array` itself, so the
 * caller's array is reordered by every call.
 *
 * @param {Array} array - Array to sample from (reordered in place).
 * @param {number} size - Number of elements wanted; clamped to
 *     [0, array.length] and rounded down.
 * @returns {Array} A new array holding the sampled elements.
 */
function getRandomSample(array, size) {
    const length = array.length;
    const start = getRandom(length);
    // Clamp and floor the request. A window longer than the array would
    // wrap onto itself and return duplicates, an empty array would index
    // with NaN (x % 0), and a negative or fractional `size` would never
    // reach 0 in the countdown loop below.
    const count = Math.max(0, Math.min(Math.floor(size), length));

    for (let i = count; i--;) {
        const index = (start + i) % length;
        const rindex = getRandom(length);
        const temp = array[rindex];
        array[rindex] = array[index];
        array[index] = temp;
    }

    const end = start + count;
    let sample = array.slice(start, end);
    if (end > length) {
        // The window ran off the end: append the wrapped-around head.
        sample = sample.concat(array.slice(0, end - length));
    }
    return sample;
}

What makes this more random is that when you always shuffle only the front items, you tend not to get them very often in the sample if the sampling array is large and the sample is small. This would not be a problem if the array was not supposed to always be the same. So, what this method does is change the position where the shuffled region starts.


No Replacement

To avoid copying the sampling array and still not worry about replacement, you can do the following, but it takes 3*size steps instead of 2*size.

/**
 * Return `size` randomly chosen elements of `array` without copying it and
 * without leaving it reordered.
 *
 * The first `size` positions are swapped with random positions, the sample
 * is sliced off, and the recorded swaps are then undone in reverse order so
 * `array` ends up exactly as it was passed in.
 *
 * @param {Array} array - Array to sample from (temporarily reordered).
 * @param {number} size - Number of elements wanted; clamped to
 *     [0, array.length] and rounded down.
 * @returns {Array} A new array holding the sampled elements.
 */
function getRandomSample(array, size) {
    const length = array.length;
    const swaps = [];
    // Clamp and floor the request. An oversized `size` would write past the
    // end (leaving the array permanently longer), and a negative or
    // fractional `size` would never reach 0 in the countdown loop below.
    const count = Math.max(0, Math.min(Math.floor(size), length));
    let i = count;
    let temp;

    while (i--) {
        const rindex = getRandom(length);
        temp = array[rindex];
        array[rindex] = array[i];
        array[i] = temp;
        swaps.push({ from: i, to: rindex });
    }

    const sample = array.slice(0, count);

    // Put everything back: replay the swaps newest-first.
    while (swaps.length > 0) {
        const pop = swaps.pop();
        temp = array[pop.from];
        array[pop.from] = array[pop.to];
        array[pop.to] = temp;
    }

    return sample;
}

No Replacement and More Random

To apply the algorithm that gave a little bit more random samples to the no replacement function:

/**
 * Return `size` randomly chosen elements of `array`, taken from a window
 * that starts at a random position and wraps around the end, without
 * leaving `array` reordered.
 *
 * Every position in the window is swapped with a random position, the
 * window is copied out, and the recorded swaps are then undone in reverse
 * order so `array` ends up exactly as it was passed in.
 *
 * @param {Array} array - Array to sample from (temporarily reordered).
 * @param {number} size - Number of elements wanted; clamped to
 *     [0, array.length] and rounded down.
 * @returns {Array} A new array holding the sampled elements.
 */
function getRandomSample(array, size) {
    const length = array.length;
    const start = getRandom(length);
    const swaps = [];
    // Clamp and floor the request. A window longer than the array would
    // wrap onto itself and return duplicates, an empty array would index
    // with NaN (x % 0), and a negative or fractional `size` would never
    // reach 0 in the countdown loop below.
    const count = Math.max(0, Math.min(Math.floor(size), length));
    let i = count;
    let temp;

    while (i--) {
        const index = (start + i) % length;
        const rindex = getRandom(length);
        temp = array[rindex];
        array[rindex] = array[index];
        array[index] = temp;
        swaps.push({ from: index, to: rindex });
    }

    const end = start + count;
    let sample = array.slice(start, end);
    if (end > length) {
        // The window ran off the end: append the wrapped-around head.
        sample = sample.concat(array.slice(0, end - length));
    }

    // Put everything back: replay the swaps newest-first.
    while (swaps.length > 0) {
        const pop = swaps.pop();
        temp = array[pop.from];
        array[pop.from] = array[pop.to];
        array[pop.to] = temp;
    }

    return sample;
}

Faster...

Like all of these posts, this uses the Fisher-Yates shuffle. But I removed the overhead of copying the array.

/**
 * Return `size` randomly chosen elements of `array` using a partial
 * Fisher-Yates shuffle of the tail, without copying the whole array and
 * without leaving it reordered.
 *
 * The last `size` positions are each swapped with a random earlier-or-equal
 * position, the tail is sliced off as the sample, and the recorded swaps
 * are undone in reverse order. The swap log lives on
 * `getRandomSample.swaps` so it is reused between calls.
 *
 * @param {Array} array - Array to sample from (temporarily reordered).
 * @param {number} size - Number of elements wanted; clamped to
 *     [0, array.length] and rounded down.
 * @returns {Array} A new array holding the sampled elements.
 */
function getRandomSample(array, size) {
    const length = array.length;
    // Clamp and floor the request. An oversized `size` would push the loop
    // below index 0 and make slice(end) return the wrong elements; a
    // negative or fractional `size` would make the undo countdown endless.
    let count = Math.max(0, Math.min(Math.floor(size), length));
    const end = length - count;
    const swaps = getRandomSample.swaps;
    let i = length;
    let r;
    let temp;

    while (i-- > end) {
        r = getRandom(i + 1);
        temp = array[r];
        array[r] = array[i];
        array[i] = temp;
        swaps.push(i);
        swaps.push(r);
    }

    const sample = array.slice(end);

    // Undo the swaps newest-first; each swap was logged as the pair (i, r).
    while (count--) {
        r = swaps.pop();
        i = swaps.pop();
        temp = array[i];
        array[i] = array[r];
        array[r] = temp;
    }

    return sample;
}
getRandomSample.swaps = [];

While I strongly support using the Fisher-Yates Shuffle, as suggested by Tim Down , here's a very short method for achieving a random subset as requested, mathematically correct, including the empty set, and the given set itself.

Note solution depends on lodash / underscore :

Lodash v4

// The package is published as "lodash"; "loadsh" was a misspelling.
const _ = require('lodash');

/**
 * Return a random-sized random selection of elements from `arr` (lodash v4).
 *
 * The size comes from `_.random(arr.length)` and the elements from
 * `_.sampleSize`; per the surrounding text the result can be anything from
 * the empty set to the whole set.
 *
 * @param {Array} arr - Collection to draw from.
 * @returns {Array} The value returned by `_.sampleSize`.
 */
function subset(arr) {
    return _.sampleSize(arr, _.random(arr.length));
}

Lodash v3

// The package is published as "lodash"; "loadsh" was a misspelling.
const _ = require('lodash');

/**
 * Return a random-sized random selection of elements from `arr` (lodash v3).
 *
 * The size comes from `_.random(arr.length)`; in lodash v3 the two-argument
 * form of `_.sample` is used to draw that many elements.
 *
 * @param {Array} arr - Collection to draw from.
 * @returns {Array} The value returned by `_.sample`.
 */
function subset(arr) {
    return _.sample(arr, _.random(arr.length));
}

If you're using lodash the API changed in 4.x:

// lodash 4.x: _.sample is called with the collection only (one item),
// while _.sampleSize is called with the collection plus a count n.
const oneItem = _.sample(arr);
const nItems = _.sampleSize(arr, n);

https://lodash.com/docs#sampleSize

You can get a 5 elements sample by this way:

// Pair every value with a random key, order by key, keep the first five,
// then strip the keys again.
var sample = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    .map(a => [a, Math.random()])
    // Numeric difference returns 0 for equal keys, as a sort comparator must.
    .sort((a, b) => a[1] - b[1])
    .slice(0, 5)
    .map(a => a[0]);

You can define it as a function to use in your code:

/**
 * Return `num` elements of `arr` chosen at random without replacement.
 *
 * Every element is paired with a random key, the pairs are ordered by key,
 * and the first `num` elements are returned. `arr` is not modified.
 *
 * @param {Array} arr - Source array.
 * @param {number} num - Number of elements to return.
 * @returns {Array} A new array with the sampled elements.
 */
var randomSample = function(arr, num) {
    return arr
        .map(a => [a, Math.random()])
        // Numeric difference returns 0 for equal keys, as a sort comparator must.
        .sort((a, b) => a[1] - b[1])
        .slice(0, num)
        .map(a => a[0]);
};

Or add it to the Array object itself:

    /**
     * Return `num` elements of this array chosen at random without
     * replacement. The array itself is not modified.
     *
     * NOTE(review): this adds an enumerable property to Array.prototype,
     * which shows up in `for...in` loops over any array.
     *
     * @param {number} num - Number of elements to return.
     * @returns {Array} A new array with the sampled elements.
     */
    Array.prototype.sample = function(num) {
        return this
            .map(a => [a, Math.random()])
            // Numeric difference returns 0 for equal keys, as a sort comparator must.
            .sort((a, b) => a[1] - b[1])
            .slice(0, num)
            .map(a => a[0]);
    };

If you want, you can separate the code to get two functionalities (shuffle and sample):

    /**
     * Return a randomly reordered copy of this array. Every element is
     * paired with a random key and the pairs are ordered by key; the array
     * itself is not modified.
     *
     * NOTE(review): this adds an enumerable property to Array.prototype,
     * which shows up in `for...in` loops over any array.
     *
     * @returns {Array} A new, shuffled array.
     */
    Array.prototype.shuffle = function() {
        return this
            .map(a => [a, Math.random()])
            // Numeric difference returns 0 for equal keys, as a sort comparator must.
            .sort((a, b) => a[1] - b[1])
            .map(a => a[0]);
    };

    /**
     * Return the first `num` elements of a shuffled copy of this array.
     *
     * @param {number} num - Number of elements to return.
     * @returns {Array} A new array with the sampled elements.
     */
    Array.prototype.sample = function(num) {
        return this.shuffle().slice(0, num);
    };

Perhaps I am missing something, but it seems there is a solution that does not require the complexity or potential overhead of a shuffle:

/**
 * Collect elements of `array` at distinct random indexes until `size`
 * elements have been gathered or every index has been used.
 *
 * Indexes are drawn at random; an index that was already used is simply
 * drawn again. `array` is not modified.
 *
 * @param {Array} array - Source array.
 * @param {number} size - Number of elements wanted.
 * @returns {Array} The gathered elements, in the order they were drawn.
 */
function sample(array,size) {
  const picked = [];
  const usedIndexes = new Set();
  const needsMore = () => picked.length < size && picked.length < array.length;
  while (needsMore()) {
    const candidate = Math.floor(Math.random() * array.length);
    if (usedIndexes.has(candidate)) {
      continue;
    }
    usedIndexes.add(candidate);
    picked.push(array[candidate]);
  }
  return picked;
}

You can remove the elements from a copy of the array as you select them. Performance is probably not ideal, but it might be OK for what you need:

/**
 * Return `size` elements of `arr` chosen at random without replacement, by
 * repeatedly removing a random element from a copy of `arr`.
 *
 * `arr` is not modified. If `size` exceeds `arr.length`, every element is
 * returned (in random order).
 *
 * @param {Array} arr - Source array.
 * @param {number} size - Number of elements wanted.
 * @returns {Array} The removed elements, in the order they were drawn.
 */
function getRandom(arr, size) {
  var copy = arr.slice(0), rand = [];
  // Fix the number of draws up front. Comparing `i` with copy.length inside
  // the loop stops early because `copy` shrinks on every splice (asking for
  // 4 of 4 elements used to return only 2).
  var count = Math.min(size, arr.length);
  for (var i = 0; i < count; i++) {
    var index = Math.floor(Math.random() * copy.length);
    rand.push(copy.splice(index, 1)[0]);
  }
  return rand;
}

Here is another implementation based on Fisher-Yates Shuffle. But this one is optimized for the case where the sample size is significantly smaller than the array length. This implementation doesn't scan the entire array nor allocates arrays as large as the original array. It uses sparse arrays to reduce memory allocation.

/**
 * Return `count` elements of `array` chosen at random without replacement,
 * using a Fisher-Yates shuffle that is simulated on a sparse index map
 * instead of on a copy of the array.
 *
 * `array` is not modified. `indices[k]`, when set, records which original
 * index currently "sits" at virtual position k; unset positions still hold
 * their own index.
 *
 * @param {Array} array - Source array.
 * @param {number} count - Number of elements wanted; clamped to
 *     [0, array.length].
 * @returns {Array} A new array with the sampled elements.
 */
function getRandomSample(array, count) {
    // Clamp the request: past array.length the random range
    // (array.length - i) turns negative and the result fills up with
    // undefined entries.
    var total = Math.max(0, Math.min(count, array.length));
    var indices = [];
    var result = new Array(total);
    for (let i = 0; i < total; i++ ) {
        // Pick a virtual position in [i, array.length).
        let j = Math.floor(Math.random() * (array.length - i) + i);
        result[i] = array[indices[j] === undefined ? j : indices[j]];
        // Move whatever sits at position i into the slot just consumed.
        indices[j] = indices[i] === undefined ? i : indices[i];
    }
    return result;
}

A lot of these answers talk about cloning, shuffling, slicing the original array. I was curious why this helps from an entropy/distribution perspective.

I'm no expert, but I did write a sample function that uses the indexes to avoid any array mutations — it does add to a Set though. I also don't know how good the random distribution of this is, but the code was simple enough that I think it warrants an answer here.

 /**
  * Return `size` elements of `array` at distinct random indexes, without
  * modifying `array`.
  *
  * Random indexes are drawn until an unused one turns up, so the number of
  * indexes requested is capped at array.length; otherwise the redraw loop
  * could never finish once every index had been used.
  *
  * @param {Array} array - Source array.
  * @param {number} [size=1] - Number of elements wanted.
  * @returns {Array} The sampled elements, in the order they were drawn.
  */
 function sample(array, size = 1) {
   const { floor, random } = Math;
   const wanted = Math.min(size, array.length);
   let sampleSet = new Set();
   for (let i = 0; i < wanted; i++) {
     let index;
     do {
       index = floor(random() * array.length);
     } while (sampleSet.has(index));
     sampleSet.add(index);
   }
   return [...sampleSet].map(i => array[i]);
 }

 const words = [
   'confused', 'astonishing', 'mint', 'engine', 'team', 'cowardly',
   'cooperative', 'repair', 'unwritten', 'detailed', 'fortunate', 'value',
   'dogs', 'air', 'found', 'crooked', 'useless', 'treatment', 'surprise',
   'hill', 'finger', 'pet', 'adjustment', 'alleged', 'income'
 ];

 console.log(sample(words, 4));

For very large arrays, it's more efficient to work with indexes rather than the members of the array.

This is what I ended up with after not finding anything I liked on this page.

/**
 * Get a random subset of an array. The subset keeps the relative order of
 * the original array because the chosen indexes are sorted ascending.
 * @param {Array} arr - Array to take a sample of.
 * @param {Number} sample_size - Size of sample to pull.
 * @param {Boolean} return_indexes - If true, return indexes rather than members
 * @returns {Array|Boolean} - An array containing a random subset of the members or indexes,
 *     or `false` when sample_size is larger than arr.length.
 */
function getArraySample(arr, sample_size, return_indexes = false) {
    if(sample_size > arr.length) return false;
    // A Set gives constant-time duplicate detection; adding an index that is
    // already present is a no-op, so the loop just draws again.
    const chosen = new Set();
    const randomIndex = () => Math.floor(Math.random() * arr.length);
    while(sample_size > chosen.size){
        chosen.add(randomIndex());
    }
    // Numeric difference returns 0 for equal values, as a sort comparator must.
    const sample_idxs = [...chosen].sort((a, b) => a - b);
    if(return_indexes) return sample_idxs;
    return sample_idxs.map(i => arr[i]);
}

My approach to this is to create a getRandomIndexes method that you can use to create an array of the indexes that you will pull from the main array. In this case, I added some simple logic to avoid the same index appearing twice in the sample. This is how it works:

/**
 * Return `size` distinct random indexes, each in [0, length).
 *
 * Indexes are drawn at random and duplicates are drawn again, so the number
 * requested is capped at `length`; there are only `length` distinct indexes
 * and asking for more could never finish.
 *
 * @param {number} length - Length of the array the indexes are meant for.
 * @param {number} size - Number of indexes wanted.
 * @returns {number[]} The indexes, in the order they were drawn.
 */
const getRandomIndexes = (length, size) => {
  const indexes = [];
  const created = {};
  const wanted = Math.min(size, length);

  while (indexes.length < wanted) {
    const random = Math.floor(Math.random() * length);
    if (!created[random]) {
      indexes.push(random);
      created[random] = true;
    }
  }
  return indexes;
};

Independently of whatever you have, this function gives you an array of indexes that you can use to pull the values from your array of length `length`, so it could be sampled by:

const myArray = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

// Ask for 3 random indexes, then map each index to its element of myArray.
getRandomIndexes(myArray.length, 3).map(i => myArray[i])

Every time you call the method you are going to get a different sample of myArray. At this point, this solution is cool, but it could be even better to sample different sizes. If you want to do that, you can use

// Math.ceil(Math.random() * 6) chooses the sample size, so it varies between
// calls (1 to 6; it would be 0 only if Math.random() returned exactly 0).
getRandomIndexes(myArray.length, Math.ceil(Math.random() * 6)).map(i => myArray[i])

will give you a different sample size from 1-6 every time you call it.

I hope this has helped :D

D3-array's shuffle uses the Fisher-Yates shuffle algorithm to randomly re-order arrays. It is a mutating function - meaning that the original array is re-ordered in place, which is good for performance.

D3 is for the browser - it is more complicated to use with node.

https://github.com/d3/d3-array#shuffle

npm install d3-array

 //import {shuffle} from "d3-array"
 // `d3` is expected to be the global provided by the d3 script tag.
 let x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];
 d3.shuffle(x);
 console.log(x); // it is shuffled
 <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/5.0.0/d3.min.js"></script>

If you don't want to mutate the original array

 // `d3` is expected to be the global provided by the d3 script tag.
 let x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];
 // Calling slice with no parameters returns a copy of the original array.
 let shuffled_x = d3.shuffle(x.slice());
 console.log(x); // not shuffled
 console.log(shuffled_x);
 <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/5.0.0/d3.min.js"></script>

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM