简体   繁体   中英

R replace multiple variables in a string using a lookup table

I'm working with 2 dataframes, as a sample:

DF1 is the main table with a column containing equations which usually have variables

DF1 <- read.table(text = 
"Unit     Year  Equation
 1        2020  'x+2*y'
 1        2021  'x+2*y'
 1        2022  'x+2*y'
 2        2020  'x'
 3        2020  'max(y^2, y+2*z)'
 3        2021  'max(y^2, y+2*z)'
 4        2020  '5'
 5        2020  '(x/y)+z'",
header = TRUE, stringsAsFactors = FALSE)

DF2 is the reference or lookup table which assigns Values to the Variables given certain Years

DF2 <- read.table(text = 
"Year  Variable  Value
 2020  x         10
 2021  x         15.5
 2022  x         50
 2020  y         1
 2021  y         2
 2022  y         3.5
 2020  z         20
 2021  z         34
 2022  z         11",
header = TRUE, stringsAsFactors = FALSE)

The goal is to match the variables and the years between the 2 dataframes so that the following table could be derived after applying eval(parse(text=Equation)) or anything similar:

Unit     Year  Equation
 1       2020  12
 1       2021  19.5
 1       2022  57
 2       2020  10
 3       2020  41
 3       2021  70
 4       2020  5
 5       2020  30

Currently I'm using a for and if-else loop to match the Years and replace the Variables row-by-row. It works okay, but running it has become very slow since DF1 could contain thousands of rows with several variables. Are there other functions I could use to achieve the same output?

Edit - Adding in the loop mentioned to help with comparison:

library(dplyr)
library(reshape2)
DF2 = dcast(DF2, Year~Variable, value.var='Value')

  #Adding in this line to avoid replacing "x" in "max":
  DF1$Equation = gsub("max","placeholder",DF1$Equation)

for(i in 1:nrow(DF1)) {
  for (j in 1:nrow(DF2)) {
    if (DF1[i,]$Year==DF2[j,]$Year) {
      #Every variable would be declared here:
      DF1[i,]$Equation = gsub("x",DF2[j,]$x,DF1[i,]$Equation)
      DF1[i,]$Equation = gsub("y",DF2[j,]$y,DF1[i,]$Equation)
      DF1[i,]$Equation = gsub("z",DF2[j,]$z,DF1[i,]$Equation)
    }
  }
}
  #Returning the function:
  DF1$Equation = gsub("placeholder","max",DF1$Equation)

Results_DF1 = DF1 %>% rowwise() %>%
              mutate(Equation = eval(parse(text=Equation)))

You could do:

left_join(DF1, DF2, 'Year') %>%
  pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
  rowwise() %>%
  mutate(a = eval(parse(text = Equation)))

   Unit  Year Equation            x     y     z     a
  <int> <int> <chr>           <dbl> <dbl> <dbl> <dbl>
1     1  2020 x+2*y            10     1      20  12  
2     1  2021 x+2*y            15.5   2      34  19.5
3     1  2022 x+2*y            50     3.5    11  57  
4     2  2020 x                10     1      20  10  
5     3  2020 max(y^2, y+2*z)  10     1      20  41  
6     3  2021 max(y^2, y+2*z)  15.5   2      34  70  
7     4  2020 5                10     1      20   5  
8     5  2020 (x/y)+z          10     1      20  30  

I just noticed you have edited your DF1 so I used it instead and there is no need for edit from my part:

library(dplyr)
library(rlang)

DF1 %>%
  left_join(DF2 %>%
              pivot_wider(names_from = Variable, values_from = Value), 
            by = "Year") %>%
  rowwise() %>%
  mutate(Result = eval(parse_expr(Equation)))

# A tibble: 8 x 7
# Rowwise: 
   Unit  Year Equation            x     y     z Result
  <int> <int> <chr>           <dbl> <dbl> <dbl>  <dbl>
1     1  2020 x+2*y            10     1      20   12  
2     1  2021 x+2*y            15.5   2      34   19.5
3     1  2022 x+2*y            50     3.5    11   57  
4     2  2020 x                10     1      20   10  
5     3  2020 max(y^2, y+2*z)  10     1      20   41  
6     3  2021 max(y^2, y+2*z)  15.5   2      34   70  
7     4  2020 5                10     1      20    5  
8     5  2020 (x/y)+z          10     1      20   30  

To give a base R solution, you could do the following to get your desired resulting dataframe:

  df1YearRows <- split(1:NROW(DF1),DF1$Year)
  df2YearRows <- split(1:NROW(DF2), DF2$Year)
  
  equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
    fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
    fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
    equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
    sapply(equationToEvaluate, function(eqTxt) {
      eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
    })
  }),names(df1YearRows))
  
  equationValueColumn <- Reduce(
    f = function(oldColumn,newYearId) {
      oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
      oldColumn
    }, 
    x = names(equationValues), 
    init = numeric(NROW(DF1))
  )
  
  resultDf <- data.frame(Year = DF1$Year, 
                         Equation = DF1$Equation, 
                         Equationvalue = equationValueColumn)
  
  resultDf

resulting in

  Year        Equation Equationvalue
1 2020           x+2*y          12.0
2 2021           x+2*y          19.5
3 2022           x+2*y          57.0
4 2020               x          10.0
5 2020 max(y^2, y+2*z)          41.0
6 2021 max(y^2, y+2*z)          70.0
7 2020               5           5.0
8 2020         (x/y)+z          30.0

A possibility in base . First convert DF2 into a named list per year using setNames and split . Then eval the Equations with the Values for the specific year using mapply .

L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
            DF1$Equation, match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)
#  Unit Year Equation
#1    1 2020     12.0
#2    1 2021     19.5
#3    1 2022     57.0
#4    2 2020     10.0
#5    3 2020     41.0
#6    3 2021     70.0
#7    4 2020      5.0
#8    5 2020     30.0

In case there are many same equations it could bring some improvements to parse only unique ones.

L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
E <- unique(DF1$Equation)
ES <- lapply(E, str2lang)
x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
            match(DF1$Equation, E), match(DF1$Year, names(L2)))
cbind(DF1[1:2], Equation=x)

And using the original equations without * between number and variable using gsub :

DF1a <- read.table(text = 
"Unit     Year  Equation
 1        2020  'x+2y'
 1        2021  'x+2y'
 1        2022  'x+2y'
 2        2020  'x'
 3        2020  'max(y^2, y+2z)'
 3        2021  'max(y^2, y+2z)'
 4        2020  '5'
 5        2020  '(x/y)+z'",
header = TRUE, stringsAsFactors = FALSE)

x <- mapply(function(e, y) {eval(parse(text=e), L2[[y]])},
       gsub("(\\d)([a-z])", "\\1*\\2", DF1a$Equation), as.character(DF1a$Year))

Compare timings and memory usage.

years <- 20
units <- 1000
DF1 <- expand.grid( stringsAsFactors = FALSE, Unit = 1:units, Year = 1:years,
           Equation = c("x+2*y", "x", "max(y^2, y+2*z)", "5", "(x/y)+z"))
DF2 <- expand.grid(Year = 1:years, Variable = c("x", "y", "z"))
set.seed(42)
DF2$Value <- rnorm(nrow(DF2))

library(dplyr)
library(rlang)
library(tidyr)

onyambu <- function() {
  left_join(DF1, DF2, 'Year') %>%
  pivot_wider(c(Unit,Year,Equation),Variable, values_from = Value) %>%
  rowwise() %>%
    mutate(a = eval(parse(text = Equation)))
}

anoushiravan <- function() {
  DF1 %>%
  left_join(DF2 %>%
              pivot_wider(names_from = Variable, values_from = Value), 
            by = "Year") %>%
  rowwise() %>%
  mutate(Result = eval(parse_expr(Equation)))
}

jonas <- function() {
  df1YearRows <- split(1:NROW(DF1),DF1$Year)
  df2YearRows <- split(1:NROW(DF2), DF2$Year)
  
  equationValues <- setNames(lapply(names(df1YearRows), function(yearId) {
    fittingVariables <- DF2[df2YearRows[[yearId]],c("Variable","Value")]
    fittingVariablesAndvaluesList <- as.list(setNames(fittingVariables$Value,fittingVariables$Variable))
    equationToEvaluate <- DF1$Equation[df1YearRows[[yearId]]]
    sapply(equationToEvaluate, function(eqTxt) {
      eval(parse(text = eqTxt), fittingVariablesAndvaluesList)
    })
  }),names(df1YearRows))
  
  equationValueColumn <- Reduce(
    f = function(oldColumn,newYearId) {
      oldColumn[df1YearRows[[newYearId]]] <- equationValues[[newYearId]]
      oldColumn
    }, 
    x = names(equationValues), 
    init = numeric(NROW(DF1))
  )
  
  resultDf <- data.frame(Year = DF1$Year, 
                         Equation = DF1$Equation, 
                         Equationvalue = equationValueColumn)
  
  resultDf
}

GKi <- function() {
  L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
  x <- mapply(function(e, y) {eval(str2lang(e), L2[[y]])},
              DF1$Equation, match(DF1$Year, names(L2)))
  cbind(DF1[1:2], Equation=x)
}

GKi2 <- function() {
  L2 <- split(setNames(as.list(DF2$Value), DF2$Variable), DF2$Year)
  E <- unique(DF1$Equation)
  ES <- lapply(E, str2lang)
  x <- mapply(function(e, y) {eval(ES[[e]], L2[[y]])},
              match(DF1$Equation, E), match(DF1$Year, names(L2)))
  cbind(DF1[1:2], Equation=x)
}
bench::mark(check = FALSE
          , onyambu()
          , anoushiravan()
          , jonas()
          , GKi()
          , GKi2()
            )
#  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 onyambu()         4.29s    4.29s     0.233  138.04MB     6.06     1    26
#2 anoushiravan()    7.12s    7.12s     0.140  100.23MB     3.09     1    22
#3 jonas()           3.49s    3.49s     0.287   89.98MB     3.44     1    12
#4 GKi()          863.95ms 863.95ms     1.16      6.6MB     3.47     1     3
#5 GKi2()         324.69ms 325.58ms     3.07     7.49MB     4.61     2     3

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM