简体   繁体   中英

Randomly assign sample into groups in R

I have a big dataset consisting of some demographic information for each individual from different cities. I want to create a variable (e.g. class) which assigns individuals of the same age group within a city into groups of around 20 (~15-25) people. Here is some R code to generate an example of my data:

    # Reproducible example data: 10,000 individuals, each with a city and an
    # age group drawn from a uniform variable cut at the given breaks.
    set.seed(10)
    ID <- seq_len(10000)
    df <- data.frame(ID = ID)
    # Uneven break widths give the cities unequal shares of the sample.
    df$City <- cut(
      runif(10000, 0, 100),
      breaks = c(0, 7, 20, 35, 47, 55, 61, 74, 85, 91, 100),
      labels = paste("City", 1:10),
      include.lowest = TRUE,
      right = FALSE
    )
    # Ten-year age bands; the last band is closed at 101 and labelled "90+".
    df$Age_Group <- cut(
      runif(10000, 0, 100),
      breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101),
      labels = c("0-9", "10-19", "20-29", "30-39", "40-49",
                 "50-59", "60-69", "70-79", "80-89", "90+"),
      include.lowest = TRUE,
      right = FALSE
    )
    # Cross-tabulate age group by city to see the cell sizes.
    table(df$Age_Group, df$City)

I want df$class to group individuals of the same age group and city. The values of class need to continue across all age groups and cities. How can I do that?

Thanks

Using toString .

# Build one label per row by joining its City and Age_Group values with ", ".
# apply() works on a matrix, so the two factor columns are converted to a
# character matrix first; toString() then collapses each row.
df$class <- factor(
  apply(as.matrix(df[c("City", "Age_Group")]), MARGIN = 1, FUN = toString)
)
# List the distinct city / age-group combinations.
levels(df$class)
# [1] "City 1, 0-9"    "City 1, 10-19"  "City 1, 20-29"  "City 1, 30-39" 
# [5] "City 1, 40-49"  "City 1, 50-59"  "City 1, 60-69"  "City 1, 70-79" 
# [9] "City 1, 80-89"  "City 1, 90+"    "City 10, 0-9"   "City 10, 10-19"
# [13] "City 10, 20-29" [...]

To get random samples, you could split the data set by "class" into subsets, say s , and calculate how many groups you get when you divide nrow(s) by 20 (individuals per group). This number, say x , is probably not a whole number, so use its ceiling and then exploit the recycling properties of R: bind 1:ceiling(x) to s$class using cbind and let it recycle to nrow(s) , where we may safely use suppressWarnings . Of course we now want to use sample to shuffle the order, and we just want column [,2] . Finally, use do.call(rbind, .) to unsplit the data set, and delete the row names if we want.

set.seed(1)  ## for sake of reproducibility
# For each class: make group labels 1..k with k = ceiling(n / 20), repeat them
# to the class size n, and shuffle so that membership is random.
# rep_len() performs the recycling explicitly; it yields the same label vector
# as recycling through cbind() would, without needing suppressWarnings().
df <- do.call(rbind, by(df, df$class, function(s) {
  n_groups <- ceiling(nrow(s) / 20)
  s$SAMP <- sample(rep_len(seq_len(n_groups), nrow(s)))
  s
}))
# rbind() on the by() result produces composite row names; reset them.
rownames(df) <- NULL

Result:

This yields a "SAMP" column with approximately equal-sized groups of ~20 members within each "class".

df[60:70, ]  ## print rows 60 to 70 as an example
#      ID    City Age_Group          class SAMP
# 60 8766 City 01       0-9   City 01, 0-9    4
# 61 8775 City 01       0-9   City 01, 0-9    1
# 62 9021 City 01       0-9   City 01, 0-9    3
# 63 9041 City 01       0-9   City 01, 0-9    3
# 64 9482 City 01       0-9   City 01, 0-9    1
# 65 9622 City 01       0-9   City 01, 0-9    1
# 66   47 City 01     10-19 City 01, 10-19    4
# 67  698 City 01     10-19 City 01, 10-19    3
# 68  833 City 01     10-19 City 01, 10-19    1
# 69 1166 City 01     10-19 City 01, 10-19    1
# 70 1221 City 01     10-19 City 01, 10-19    2   

Check the tables of SAMP group sizes for the first ten classes:

# Tabulate SAMP within each class and keep the tables for the first ten classes.
by(df$SAMP, df$class, table)[1:10]
# $`City 01, 0-9`
# 
# 1  2  3  4 
# 17 16 16 16 
# 
# $`City 01, 10-19`
# 
# 1  2  3  4 
# 18 17 17 17 
# 
# $`City 01, 20-29`
# 
# 1  2  3  4 
# 18 18 17 17 
# 
# $`City 01, 30-39`
# 
# 1  2  3  4 
# 19 19 19 19 
# 
# $`City 01, 40-49`
# 
# 1  2  3  4 
# 19 19 19 18 
# 
# $`City 01, 50-59`
# 
# 1  2  3  4  5 
# 18 17 17 17 17 
# 
# $`City 01, 60-69`
# 
# 1  2  3  4 
# 16 16 16 16 
# 
# $`City 01, 70-79`
# 
# 1  2  3  4 
# 19 19 19 19 
# 
# $`City 01, 80-89`
# 
# 1  2  3  4 
# 20 19 19 19 
# 
# $`City 01, 90+`
# 
# 1  2  3  4 
# 18 17 17 17 

In case you want the numbering by class rather than altogether, just paste "class" (as numeric) and "SAMP" together.

# Prefix each SAMP number with the integer code of its class, separated by a
# dot, e.g. "1.3" for class code 1 and SAMP 3.
df <- transform(
  df,
  SAMP2 = paste(as.integer(class), SAMP, sep = ".")
)
# Show the first rows, including the new SAMP2 column.
head(df)
#    ID    City Age_Group        class SAMP SAMP2
# 1 193 City 01       0-9 City 01, 0-9    3   1.3
# 2 480 City 01       0-9 City 01, 0-9    1   1.1
# 3 742 City 01       0-9 City 01, 0-9    2   1.2
# 4 757 City 01       0-9 City 01, 0-9    1   1.1
# 5 811 City 01       0-9 City 01, 0-9    3   1.3
# 6 870 City 01       0-9 City 01, 0-9    3   1.3

The caret package can help you with this. It will attempt to create n partitions while respecting categories like Age and City; given the unbalanced nature of the input, it won't be perfect (note that the call below balances the folds on Age_Group only). But you can choose the number of partitions (aka folds) and see what suits your needs; I went for 5.

# library() stops with an error if caret is not installed, whereas require()
# would only return FALSE and let the script fail later.
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
# Reproducible example data: 10,000 individuals, each with a city and an
# age group drawn from a uniform variable cut at the given breaks.
set.seed(10)
ID <- seq_len(10000)
df <- data.frame(ID = ID)
# Uneven break widths give the cities unequal shares of the sample.
df$City <- cut(
  runif(10000, 0, 100),
  breaks = c(0, 7, 20, 35, 47, 55, 61, 74, 85, 91, 100),
  labels = paste("City", 1:10),
  include.lowest = TRUE,
  right = FALSE
)
# Ten-year age bands; the last band is closed at 101 and labelled "90+".
df$Age_Group <- cut(
  runif(10000, 0, 100),
  breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101),
  labels = c("0-9", "10-19", "20-29", "30-39", "40-49",
             "50-59", "60-69", "70-79", "80-89", "90+"),
  include.lowest = TRUE,
  right = FALSE
)
# table(df$Age_Group, df$City)
# Assign every row to one of 5 folds. Only Age_Group is passed to
# createFolds(), so City plays no part in how the folds are drawn.
# list = FALSE asks for a vector of fold numbers rather than a list.
df$class <- caret::createFolds(df$Age_Group, k = 5, list = FALSE)
# Cross-tabulate fold by city, with one sub-table per age group.
table(df$class, df$City, df$Age_Group)
#> , ,  = 0-9
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     18     27     28     29     15      8     22     21      9      21
#>   2     16     29     31     27      9     10     19     23     12      22
#>   3     12     20     26     26     20     11     30     22     12      18
#>   4      9     27     24     28     13     12     24     31     12      17
#>   5     10     22     36     31     13     13     23     24     11      15
#> 
#> , ,  = 10-19
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     13     22     13     22     11      9     38     18     22      23
#>   2     12     23     34     21     13      7     26     22     16      16
#>   3     14     25     30     25     13      7     30     23     11      12
#>   4     13     29     31     19     22     17     23     16      9      11
#>   5     17     22     24     23     18     20     22     15      9      20
#> 
#> , ,  = 20-29
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     14     28     31     24     12     10     35     22     12      14
#>   2      9     32     22     29     15      9     30     19     18      19
#>   3     18     35     25     17     14     13     22     18     19      21
#>   4     15     26     33     25     11     15     37     20      1      19
#>   5     14     20     31     32     12     14     23     16     18      21
#> 
#> , ,  = 30-39
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     13     28     29     22     24     14     24     19     18      21
#>   2     15     28     31     32     19     14     21     25     16      12
#>   3     17     30     28     22     20      9     22     29     14      21
#>   4     18     26     33     23     10     16     23     24     13      26
#>   5     13     26     40     24     12      8     25     21     20      23
#> 
#> , ,  = 40-49
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     16     26     41     16     19     13     19     18     16      22
#>   2     18     23     36     32      8     12     28     15     16      18
#>   3     19     27     29     23     11     16     33     13     15      21
#>   4     13     21     30     29     18     18     26     19      9      23
#>   5      9     34     27     27     17      9     27     22     11      23
#> 
#> , ,  = 50-59
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     21     28     28     21     15     10     25     26     21       8
#>   2     12     17     24     25     20     20     25     32     14      13
#>   3     19     27     35     30     10      8     19     24     13      17
#>   4     19     23     30     23     19     11     19     25     16      18
#>   5     15     37     38     18     10     15     23     25      9      13
#> 
#> , ,  = 60-69
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     12     29     31     25     14     15     12     27     11      20
#>   2     12     22     29     25     18     14     22     20     11      24
#>   3     11     27     30     21     15     16     22     23     15      16
#>   4     17     21     32     20     12     12     24     28     11      19
#>   5     12     27     37     31     11     11     17     16     17      18
#> 
#> , ,  = 70-79
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     10     23     27     36     13      7     29     20     13      17
#>   2     25     19     27     27     18      8     25     17     10      20
#>   3     12     17     27     26     13      5     34     24     14      23
#>   4     12     28     34     22     15      8     28     21     14      13
#>   5     17     30     40     23     13     11     21     17      7      16
#> 
#> , ,  = 80-89
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     10     27     26     34     17     16     23     19      8      16
#>   2     17     19     33     16     19     19     16     31     12      14
#>   3     14     24     27     23     14     10     25     23     12      23
#>   4     12     25     30     33     14     16     19     14     12      20
#>   5     24     24     25     26     20      6     18     20     13      20
#> 
#> , ,  = 90+
#> 
#>    
#>     City 1 City 2 City 3 City 4 City 5 City 6 City 7 City 8 City 9 City 10
#>   1     16     21     30     25     20     15     31     23     10      11
#>   2     15     25     34     28     16     13     25     19     10      17
#>   3     12     23     30     26     19     14     24     23     13      18
#>   4     13     30     30     24     15     10     23     25     14      18
#>   5     13     16     24     24     23     17     30     23     18      15

Created on 2020-05-08 by the reprex package (v0.3.0)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM