purrr; sample from multiple columns with probability list

Question

Say I want to take a sample of values of variable length from an arbitrary number of different probability distributions, and with a weighted probability of sampling from each distribution.

Seems like I should be able to do this using purrr 's map functions, but am struggling...

library(tidyverse)

set.seed(20171127)

# sample from 5 different probability distributions
dists <- tibble(
  samp_distA = round(rnorm(n=1000, mean=17, sd=4)),
  samp_distB = round(rnorm(n=1000, mean=13, sd=4)),
  samp_distC = round(rnorm(n=1000, mean=13, sd=4)),
  samp_distD = round(rbeta(n=1000, 2,8)*10),
  samp_distE = round(rnorm(n=1000, mean=8, sd=3))
  )

# define number of samples to be drawn for each group
n.times <- c(20,15,35,8,6) 

# define weights to be used for sampling from dists 
probs <- tibble(A = c(0.80, 0.05, 0.05, 0.05, 0.05),
                B = c(0.05, 0.80, 0.05, 0.05, 0.05),
                C = c(0.05, 0.05, 0.80, 0.05, 0.05),
                D = c(0.05, 0.05, 0.05, 0.80, 0.80),
                E = c(0.05, 0.05, 0.05, 0.05, 0.80)
                )

# sample from dists, n.times, and using probs as weights...
output <- map2(sample, size=n.times, weight=probs, tbl=dists)

#...doesn't work

Any suggestions gratefully received.

Answer 1

set.seed(123)
map2(
  n.times, 
  map(probs, rep, each = nrow(dists)),
  sample, x = flatten_dbl(dists), replace = TRUE
)

# [[1]]
#  [1] 15 13 18  6 15 15 12  8  9 12  7 17 14 12 15 10 18 19 24 24
# 
# [[2]]
#  [1] 12  2 15 16 14 17 11 11 10 12  6 19 13 12 13
# 
# [[3]]
#  [1] 10  9 16 12 13 11 10 18 14 19 16 16 12 19  4 15 19 19 13 14 15 10 14 12 10
# [26]  8 18 19  7  8 21  8 19 10  9
# 
# [[4]]
# [1]  3  3  2 15  1  4 14  2
# 
# [[5]]
# [1]  9 14 10  6 12  8

NB: I'm dubious about your answer to MrFlick's comment: "an 80% chance of selecting all values from samp_distA". To me it is much more intuitive to go the other route: "an 80% chance for each of the 10 values to come from samp_distA"... so that's what I did. Do you confirm you want the former?

Base R equivalent:

set.seed(123)
mapply(
  sample,
  n.times, 
  lapply(probs, rep, each = nrow(dists)),
  MoreArgs = list(x = unlist(dists, use.names = FALSE), replace = TRUE)
)

Edit

Re your follow-up question in a comment ("run the function multiple times for each person, eg so that as output, person A had 10 lists of randomly-sampled values, each of length 20 (and similar for persons B, C, D, and E, perhaps with each person having a predefined different number of lists)"):

n.reps <- c(A = 10, B = 1, C = 3, D = 2, E = 1)
set.seed(123)
pmap(
  list(n.reps, n.times, map(probs, rep, each = nrow(dists))),
  function(.x, .y, .z) replicate(
    .x,
    sample(flatten_dbl(dists), .y, replace = TRUE, .z),
    simplify = FALSE
  )
)

# $A
# $A[[1]]
# [1] 15 20 16 20 16 14 17 20 21 22 18 19 15 14 18 19 16 20  9 16
# 
# $A[[2]]
# [1] 13  9 11 19 25 19 11 18 16 19 16 21 15 12 11 11  9 13 20  1
# 
# $A[[3]]
# [1] 15 20 13 20 13 11 16 16 14 19 18 10 21 11 12 16 18 10 20 14
# 
# $A[[4]]
# [1] 16 19 14 11 17  9 20 11 19 13 11 16  8 11 10 18 27 22 20  4
# 
# $A[[5]]
# [1] 12 18 16 19 13 13 23 19 21 14 22  8  9 19 16 19  9 14 13 20
# 
# $A[[6]]
# [1] 18 26 16 15 21 17 15 19 14 18 19 25  5 16  7 19 21 15 23 16
# 
# $A[[7]]
# [1] 12 26 20 12  7  5 13 14 19  7 16 12 11 27 22 18 11 17 11 16
# 
# $A[[8]]
# [1] 21 18 24 22 18  0 15  3  9 16 16 11 16 20 22 18 18 20 16 21
# 
# $A[[9]]
# [1] 15 20 11 16 16 21 12 20 17  9 18 10 22 17 12  0 18 16 23 20
# 
# $A[[10]]
# [1] 16 22 15  4  7 19 18 13 15  1  7 18 21  1 20 21 15 12 20 15
# 
# 
# $B
# $B[[1]]
# [1]  9  5  8 17  9 10  7 13 12 11  9 21 10 15 12
# 
# 
# $C
# $C[[1]]
# [1] 15 15 16 13 19 14 16 15 11 15 19 16 19 12  6 12 10 12  1 18  9 10 18 11 19
# [26]  9  6 19 18 12  9 18 14 12  7
# 
# $C[[2]]
# [1]  5 14 16 10  8 13  8 18 22 18 14 12 13 10 19 12 15 10 16 13 16  9 15  6 15
# [26] 14  4  9 11 11  3 15 18 10 14
# 
# $C[[3]]
# [1] 13  8 12  9  6  9  2  7  8 12  2 11 20 10  1 14 14 11 11  1 13 13 18 14 12
# [26] 21 11  3  7  7 13 13 11  7 14
# 
# 
# $D
# $D[[1]]
# [1] 11  1  1  7 12  6  0  8
# 
# $D[[2]]
# [1]  4  1  7 15  2  2  8  9
# 
# 
# $E
# $E[[1]]
# [1]  7  8  6 11 10  6

Answer 2

This seems doable with purrr , but it takes a bit of set up, particularly because there's not a sample2 function (that I'm aware of) that samples a distribution based on a vector of probabilities, and then grabs a random sample from that subset.

To do that with purrr , we have to loop twice: the outside loops through each person using a simple numerical index; inside that loop, we loop through the n.times to get random samples from the appropriate distribution.

# prep data ---------------------------------------------------------------

# pull all the controls into a single data frame 
controldf  <- tibble(
  cols = c(1:5), n.times
  ) %>% 
  bind_cols(probs %>% 
              t %>% 
              as.tibble %>% 
              setNames(c("distA", "distB", "distC", "distD", "distE")) 
  )

# turn the distrubtions into long form
longdists  <- dists %>% 
  gather(dist, val)

distnames  <- c("A", "B", "C", "D", "E")


# function to do the work  ---------------------------------------------------------------

getdist  <- function(i) {

  # get the probabilities as a numeric vector
  myprobs  <- controldf[i,3:7] %>% as.numeric

  # how many samples do we need
  myn      <- controldf[[i,2]]

  # use our probabilties to decide what distribution to grab from
  samplestoget  <- sample(distnames, myn, prob = myprobs, replace = T) %>% 
    paste0("samp_dist", .)

  # loop through our list of distributions to grab from 
  map_dbl(samplestoget, ~filter(
    # filter on distribution key
    longdists, dist == .x
    ) %>% 
    # from that distribution, select a single value at random
    sample_n(1) %>% 
    # extract the numeric value
    pluck('val') )

}

# get the values by running the function over our indexes  -------------------------

results  <- map(controldf$cols, ~ getdist(.x))

purrr; sample from multiple columns with probability list

Question

2 answers

solution1
2 ACCPTED 2017-11-28 11:53:22

solution2
1 2017-11-27 19:36:34

purrr; sample from multiple columns with probability list

Question

2 answers

solution1 2 ACCPTED 2017-11-28 11:53:22

solution2 1 2017-11-27 19:36:34

solution1
2 ACCPTED 2017-11-28 11:53:22

solution2
1 2017-11-27 19:36:34