简体   繁体   中英

Subset dataframe and apply function that counts frequency of each factor level

I have a df:

 # Example data: one row per observed interaction, tagged by region and plot.
 df <- data.frame(
   region = rep(c("1", "2"), times = c(8, 2)),
   plot = rep(c("1", "2", "3"), times = c(3, 3, 4)),
   interact = c(
     "A_B", "C_D", "C_D",
     "E_F", "C_D", "C_D",
     "D_E", "D_E", "C_B", "A_B"
   )
 )

I would like to subset the data by plot . For each plot subset I would like to count the frequency of each unique interact type. The output should look like:

# Desired intermediate result: one row per (plot, interact) pair together
# with the number of times that pair occurs (`freq`).
df <- data.frame(
  region = rep(c("1", "2"), times = c(4, 3)),
  plot = c("1", "1", "2", "2", "3", "3", "3"),
  interact = c("A_B", "C_D", "E_F", "C_D", "D_E", "C_B", "A_B"),
  freq = c(1, 2, 1, 2, 2, 1, 1)
)

Then I would like to make a function that calculates the following for each plot subset of the df:

 # Inverse Simpson diversity for one plot subset of `df`. Expects `df$freq`
 # to hold the count of each interaction type within that subset.
 sum <- sum(df$freq) # Total number of interactions in the subset
 # Proportion of each interaction type relative to the total. `freq` must not
 # be passed through unique(): two types with the same count would collapse
 # into a single value and drop out of the index.
 prop <- df$freq / sum
 prop2 <- prop^2 # Square each proportion
 D <- sum(prop2) # Sum of the squared proportions
 simp <- 1 / D # Simpson's diversity as the reciprocal of D

The function I want to use is similar to the one explained on the following page: http://rfunctions.blogspot.com.ng/2012/02/diversity-indices-simpsons-diversity.html . However, that referenced version is performed on a wide dataset, and my dataset will be long.

In the end I would have a df of values for each plot:

  result<- 
         Plot    div
          1      1.8
          2      1.8
          3      2.6

I used dplyr; however, my result for plot 3 is different and I don't know why. Could you provide your results for each calculation, or check mine and let me know where the mistake is?

Also, if you are interested in calculating diversity indices, you can get familiar with the vegan package and especially its diversity() function.

# Input data: one row per observed interaction, tagged by region and plot.
df <- data.frame(
  region = rep(c("1", "2"), times = c(8, 2)),
  plot = rep(c("1", "2", "3"), times = c(3, 3, 4)),
  interact = c(
    "A_B", "C_D", "C_D",
    "E_F", "C_D", "C_D",
    "D_E", "D_E", "C_B", "A_B"
  )
)

library(dplyr)

# Count how often each interaction type occurs in every region/plot pair.
df1 <- df %>%
  group_by(region, plot, interact) %>%
  summarise(freq = n())

# Within each plot: total number of interactions (`sum`), the share of each
# interaction type (`prop`) and that share squared (`prop2`).
df2 <- df1 %>%
  group_by(plot) %>%
  mutate(
    sum = sum(freq),
    prop = freq / sum,
    prop2 = prop^2
  )
df2

# A tibble: 7 x 7
# Groups:   plot [3]
  region   plot interact  freq   sum      prop     prop2
  <fctr> <fctr>   <fctr> <int> <int>     <dbl>     <dbl>
1      1      1      A_B     1     3 0.3333333 0.1111111
2      1      1      C_D     2     3 0.6666667 0.4444444
3      1      2      C_D     2     3 0.6666667 0.4444444
4      1      2      E_F     1     3 0.3333333 0.1111111
5      1      3      D_E     2     4 0.5000000 0.2500000
6      2      3      A_B     1     4 0.2500000 0.0625000
7      2      3      C_B     1     4 0.2500000 0.0625000


# Per plot: add up the squared proportions (D) and take the reciprocal to
# obtain the diversity value (simp).
df2 %>%
  group_by(plot) %>%
  summarise(
    D = sum(prop2),
    simp = 1 / D
  )

# A tibble: 3 x 3
    plot         D     simp
  <fctr>     <dbl>    <dbl>
1      1 0.5555556 1.800000
2      2 0.5555556 1.800000
3      3 0.3750000 2.666667

And here is the approach using diversity() function from vegan package.

First you need to use spread() to create a "matrix" with all your interactions as separate columns:

library(vegan)
library(tidyr)
library(dplyr)

# Count each interaction type per plot (long format).
df5 <- df %>%
  group_by(plot, interact) %>%
  summarise(freq = n())

# Widen to one row per plot and one column per interaction type; a type that
# does not occur in a plot gets a count of 0.
df6 <- spread(df5, key = interact, value = freq, fill = 0)
df6

# A tibble: 3 x 6
# Groups:   plot [3]
    plot   A_B   C_B   C_D   D_E   E_F
* <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
1      1     1     0     2     0     0
2      2     0     0     2     0     1
3      3     1     1     0     2     0

Then you calculate the diversity, passing df6 without its first column (plot) as the data matrix. At the end you can add the calculated diversity as a column to df6.

# Inverse Simpson diversity for every row of the wide table; the first
# column (`plot`) is dropped so that only the count columns are passed in.
simp <- diversity(df6[, -1], index = "invsimpson")

# Attach the result to the wide table as a new column and print it.
df6$simp <- simp
df6

# A tibble: 3 x 7
# Groups:   plot [3]
    plot   A_B   C_B   C_D   D_E   E_F     simp
* <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
1      1     1     0     2     0     0 1.800000
2      2     0     0     2     0     1 1.800000
3      3     1     1     0     2     0 2.666667

Or even shorter with do() and tidy() from broom package

# Count each interaction type per plot (long format).
df5 <- df %>%
  group_by(plot, interact) %>%
  summarise(freq = n())

library(broom)

# Widen the counts, then compute the inverse Simpson diversity on the count
# columns (first column dropped) inside do(), tidying the result.
df5 %>%
  spread(key = interact, value = freq, fill = 0) %>%
  do(tidy(diversity(x = .[, -1], index = "invsimpson")))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM