简体   繁体   中英

Vectorize weighted mean by list of names in each row of a data.frame

I have two dataframes. I would like to calculate a weighted mean for each row of my results dataframe from the values in my values dataframe. Each row of results has two columns with lists. Every possible combination of the lists is a row in the values dataframe. I am doing this with the code below (two options), which is probably clearer than me trying to explain it. What I would like to know is if and how I can vectorize this (my original results dataframe is very large).

library(dplyr)

a = c('a, b, c', 'a, b', 'c') 
f = c('p, q', 'r', 's, t') 
results <- data.frame(a, f)
# > results
# a    f
# 1 a, b, c p, q
# 2    a, b    r
# 3       c s, t

av = c('a','b','c') 
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av, fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)

# > values
# Var1 Var2          w        x
# 1     a    p 0.10710168 62.58004
# 2     b    p 0.89175147 20.26853
# 3     c    p 0.31489520 85.90532
# 4     a    q 0.07263807 89.02293
# 5     b    q 0.87090293 72.17195
# 6     c    q 0.88818599 48.65717
# 7     a    r 0.54076274 39.46479
# 8     b    r 0.08678314 57.99200
# 9     c    r 0.86298554 77.00845
# 10    a    s 0.41778402 23.35626
# 11    b    s 0.70227865 82.76310
# 12    c    s 0.84415123 65.26321
# 13    a    t 0.50651689 75.52230
# 14    b    t 0.37850063 87.41811
# 15    c    t 0.58515251 96.74228

# Option 1 with apply
calc_wa <- function(as, fs){
  as <- unlist(strsplit(as, ", "))
  fs <- unlist(strsplit(fs, ", "))
  valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
  wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
  return(wa_res)
}

results$res <- apply(results, 1, function(y) calc_wa(y['a'], y['f'])) 

# Option 2 with mutate
calc_wa2 <- function(as, fs){
  as <- unlist(strsplit(as.character(as), ", "))
  fs <- unlist(strsplit(as.character(fs), ", "))
  valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
  wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
  return(wa_res)
}
results <- results %>% rowwise() %>% mutate(res2= calc_wa2(a, f))
# > results
# Source: local data frame [3 x 4]
# Groups: <by row>
#   
#   # A tibble: 3 x 4
#   a       f       res  res2
# <fct>   <fct> <dbl> <dbl>
#   1 a, b, c p, q   52.3  52.3
# 2 a, b    r      42.0  42.0
# 3 c       s, t   78.2  78.2

(I am afraid I am missing some basic command, I also have no idea how to title/tag the question - suggestions welcome)

Using data.table instead:

Setup Data (made some slight variations):

library(data.table)

set.seed(1) # added for reproducability
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.table(a, f) #slight change
# > results
# a    f
# 1 a, b, c p, q
# 2    a, b    r
# 3       c s, t

av = c('a','b','c') 
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av = av, fv = fv) #slight change
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)

Code:

results[, rowID := 1:.N] # add ID
results_expand <- results[, expand.grid(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ","))), stringsAsFactors = FALSE), by = .(rowID)] # expand results
# Alternate: results_expand <- results[, CJ(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ",")))), by = .(rowID)] # expand results

results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv")) # merge to value table
results_expand <- results_expand[, .(wm = weighted.mean(x, w)), by = rowID] # calculate weight

results <- merge(results, results_expand, by = "rowID")

results
   rowID       a    f       wm
1:     1 a, b, c p, q 74.56427
2:     2    a, b    r 45.37445
3:     3       c s, t 35.14175

This uses merge and grouping functions in data.table, so should be faster than any looping option.

The same procedure suggested by @Chris but using data.frame instead of data.table

library(dplyr);library(tidyr)

set.seed(1) # added for reproducability

a = c('a, b, c', 'a, b', 'c') 
f = c('p, q', 'r', 's, t') 
results <- data.frame(a, f)

av = c('a','b','c') 
fv = c('p', 'q', 'r', 's', 't')

values <- expand.grid(av=av, fv=fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)

results$ID <- seq.int(nrow(results))

results_expand<- results %>%
  group_by(ID) %>%
  expand(as=trimws(unlist(strsplit(as.character(a), ","))), fs=trimws(unlist(strsplit(as.character(f), ","))))

results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv"))
results_expand <- results_expand %>% group_by(ID) %>% mutate(wm = weighted.mean(x, w))
results <- merge(results, results_expand, by = "ID")
results <- results  %>%  group_by(ID) %>% select(ID, a, f, wm)
results <- distinct(results)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM