R: dplyr solution for for-loop with initial conditions set

Question

I have a data which has 40 days of the year and some data

set.seed(123)
df <- data.frame(day = 1:40,rain = runif(40,min = 0, max = 3), petc = runif(40, min = 0.3, max = 8),swc = runif(40, min = 27.01, max = 117.43))

I want to calculate another variable called aetc for each day which is calculated as follows:

SW.ini <- 2 # setting some initial values 
SW.max <- 5
SW.min <- 0

For day 1,

1) Determine a variable called PAW(day1) = SW.ini + rain(day1)

2) If PAW(day1) >= SWC(day1), aetc(day1) = petc(day1) ;

If `PAW(day1) < SWC(day1), aetc(day1) = PAW(day1)/SWC(day1) * petc(day1)`

3) Check if aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1) aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1)

4) Update SW(day1) = SW.ini + rain(day1) - aetc(day1)

5) If SW(day1) > SW.max, SW(day1) = SW.max. Similarly if SW(day1) > SW.max, SW(day1) = SW.max. Similarly if SW(day1) < SW.min, SW(day1) = SW.min`

Repeat for day 2

1) Determine PAW(day2) = SW(day1) + rain(day2)
2) If PAW(day2) >= SWC(day2), aetc(day2) = petc(day2) ; If PAW(day2) < SWC(day2), aetc(day2) = PAW(day2)/SWC(day2) * petc(day2)

3) Check if aetc(day2) > PAW(day2) . If yes, aetc(day2) = paw(day2)

4) Update SW(day2) = SW(day1) + rain(day2) - aetc(day2)

5) If SW(day2) > SW.max, SW(day2) = SW.max. Similarly if SW(day2) > SW.max, SW(day2) = SW.max. Similarly if SW(day2) < SW.min, SW(day2) = SW.min`

Here's my elegant for loop to do this:

      df$PAW <- NA
      df$aetc <- NA
      df$SW <- NA

      df$PAW[1] <- SW.ini + df$rain[1]

      df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$petc[1],(df$PAW[1]/df$swc[1])*df$petc[1])
      df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
      df$SW[1] <- SW.ini + df$rain[1] -  df$aetc[1]
      df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))

      for (day in 2:nrow(df)){

        df$PAW[day] <- df$SW[day - 1] + df$rain[day]
        df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$petc[day], (df$PAW[day]/df$swc[day]) * df$petc[day])
        df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
        df$SW[day] <- df$SW[day - 1] + df$rain[day] -  df$aetc[day]
        df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
      }

My problem is that this is just one year of data and I want run it for multiple years.

      set.seed(123)
      df <- data.frame(year = 1980:2015, day = rep(1:40, each = 36),rain = 
      runif(40*36,min = 0, max = 3), petc = runif(40*36, min = 0.3, max = 8),swc = runif(40*36, min = 27.01, max = 117.43))

So I wanted to do something like

                df %>% group_by(year) # and then run the above function for each year.

Is there a dplyr or any other solution to this?

Thank you

Answer 1

Note: I originally posted this answer on your follow up question, R: for loop within a foreach loop , but after seeing this one, it seems this answer is far more relevant here. (I don't address anything related to parallelizing in my answer, which was the topic of your follow up) .

Using `Rcpp` and `data.table`

Compiling the logic with C++ and applying it by group using data.table grouping operations gives a ~2,000x speed-up from your baseline, far greater than you might hope to get by parallelizing.

On your original example, which had 39,420,000 rows , this executes on my machine in 1.883 seconds ; and on the revised one with 28,800 rows , this executes in 0.004 seconds

library(data.table)
library(Rcpp)

Define and compile a C++ function, CalcSW() inline in the R script:

One note: counting in C / C++ starts at 0 , unlike R , which starts at 1 -- that's why the indices are different here

Rcpp::cppFunction('
List CalcSW(NumericVector SW_ini,
            NumericVector SW_max,
            NumericVector rain,
            NumericVector swc,
            NumericVector PETc) {

  int n = SW_ini.length();
  NumericVector SW(n);
  NumericVector PAW(n);
  NumericVector aetc(n);

  double SW_ini_glob = SW_ini[0];
  double SW_max_glob = SW_max[0];

  SW[0] = SW_ini_glob;
  PAW[0] = SW[0] + rain[0];

  if (PAW[0] > swc[0]){
    aetc[0] = PETc[0];
  } else {
    aetc[0] = PAW[0]/swc[0]*PETc[0];
  }

  if (aetc[0] > PAW[0]){
    aetc[0] = PAW[0];
  }

  SW[0] = SW[0] + rain[0] - aetc[0];

  if(SW[0] > SW_max_glob){
    SW[0] = SW_max_glob;
  }

  if(SW[0] < 0){
    SW[0] = 0;
  }

  for (int i = 1; i < n; i++) {

    PAW[i] = SW[i-1] + rain[0];

    if (PAW[i] > swc[i]){
      aetc[i] = PETc[i];
    } else {
      aetc[i] = PAW[i]/swc[i]*PETc[i];
    }

    if (aetc[i] > PAW[i]){
      aetc[i] = PAW[i];
    }

    SW[i] = SW[i-1] + rain[i] - aetc[i];

    if(SW[i] > SW_max_glob){
      SW[i] = SW_max_glob;
    }

    if(SW[i] < 0){
     SW[i] = 0;
    }
  }
  return Rcpp::List::create(Rcpp::Named("SW") = SW,
                            Rcpp::Named("PAW") = PAW,
                            Rcpp::Named("aetc") = aetc);
}')

Create data.table

df <- data.table(loc.id = rep(1:10, each = 80*36), 
                 year = rep(rep(1980:2015, each = 80), times = 10),
                 day = rep(rep(1:80, times = 36),times = 10),
                 rain = runif(10*36*80, min = 0 , max = 5),
                 swc = runif(10*36*80,min = 0, max = 50),
                 SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
                 SW_ini = runif(10*36*80),
                 PETc = runif(10*36*80, min = 0 , max = 1.3),
                 SW = as.numeric(NA),
                 PAW = as.numeric(NA), 
                 aetc = as.numeric(NA))

setkey(df, loc.id, year, day)

Execute the function CalcSW() on the df for each combination of loc.id and year , assign returned values to the three columns simultaneously:

system.time({
  df[,  c("SW","PAW","aetc") := CalcSW(SW_ini,
                                       SW_max,
                                       rain,
                                       swc,
                                       PETc), keyby = .(loc.id, year)]
})

...

   user  system elapsed 
  0.004   0.000   0.004

Results:

head(df)

...

   loc.id year day       rain       swc   SW_max     SW_ini      PETc       SW      PAW       aetc
1:      1 1980   1 0.35813251 28.360715 177.3943 0.69116310 0.2870478 1.038675 1.049296 0.01062025
2:      1 1980   2 1.10331116 37.013022 177.3943 0.02742273 0.4412420 2.125335 1.396808 0.01665171
3:      1 1980   3 1.76680011 32.509970 177.3943 0.66273062 1.1071233 3.807561 2.483467 0.08457420
4:      1 1980   4 3.20966558  8.252797 177.3943 0.12220454 0.3496968 6.840713 4.165693 0.17651342
5:      1 1980   5 1.32498191 14.784203 177.3943 0.66381497 1.2168838 7.573160 7.198845 0.59253503
6:      1 1980   6 0.02547458 47.903637 177.3943 0.21871598 1.0864713 7.418750 7.931292 0.17988449

I'm not 100% positive I implemented your logic perfectly, but the logic should be pretty straightforward to tweak where I may have missed something, I implemented it in a very similar manner to how you laid it out.

One other note: It's way easier to write C++ with auto-indenting and code highlighting (whether you're using RStudio or Emacs) you get if you create a separate file, named something like TestCode.cpp formatted like below.

Then, you can either use Rcpp::sourceCpp("TestCode.cpp") to compile your function in your R Script, or you can copy and paste everything except for the first three lines as a character string into as an argument of Rcpp::cppFunction() like I did above.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
List CalcSW(NumericVector SW_ini,
                     NumericVector SW_max,
                     NumericVector rain,
                     NumericVector swc,
                     NumericVector PETc) {

  int n = SW_ini.length();
  NumericVector SW(n);
  NumericVector PAW(n);
  NumericVector aetc(n);

  double SW_ini_glob = SW_ini[0];
  double SW_max_glob = SW_max[0];

  SW[0] = SW_ini_glob;
  PAW[0] = SW[0] + rain[0];

  if (PAW[0] > swc[0]){
    aetc[0] = PETc[0];
  } else {
    aetc[0] = PAW[0]/swc[0]*PETc[0];
  }

  if (aetc[0] > PAW[0]){
    aetc[0] = PAW[0];
  }

  SW[0] = SW[0] + rain[0] - aetc[0];

  if(SW[0] > SW_max_glob){
    SW[0] = SW_max_glob;
  }

  if(SW[0] < 0){
    SW[0] = 0;
  }

  for (int i = 1; i < n; i++) {

    PAW[i] = SW[i-1] + rain[0];

    if (PAW[i] > swc[i]){
      aetc[i] = PETc[i];
    } else {
      aetc[i] = PAW[i]/swc[i]*PETc[i];
    }

    if (aetc[i] > PAW[i]){
      aetc[i] = PAW[i];
    }

    SW[i] = SW[i-1] + rain[i] - aetc[i];

    if(SW[i] > SW_max_glob){
      SW[i] = SW_max_glob;
    }

    if(SW[i] < 0){
      SW[i] = 0;
    }
  }
  return Rcpp::List::create(Rcpp::Named("SW") = SW,
                            Rcpp::Named("PAW") = PAW,
                            Rcpp::Named("aetc") = aetc);
}

Answer 2

You could wrap your code in another for loop and save each years df in a list:

library(tidyverse)
lst <- vector("list", length(unique(df$year)))
for (i in seq_along(unique(df$year))) {
    df_year <- df %>% filter(year == unique(df$year)[[i]])

    # rest of code with df_year replacing df

    lst[[i]] <- df_year
}
final_df <- bind_rows(lst)

Answer 3

The data.table illustration from Matt is a very good illustration of how fast data.table can be because it does the calculations in place with no copies and moving around of data.

However, to answer the crux of your question about using pipes, you can use group_by along with do to accomplish what you are after (albeit much slower than data.table )

Below I set up the same dummy data Matt did. Then I use your function (but with the case fixed on PETc ). It's not fast, but it's pretty easy to follow.

df <- data.frame(loc.id = rep(1:10, each = 80*36), 
                 year = rep(rep(1980:2015, each = 80), times = 10),
                 day = rep(rep(1:80, times = 36),times = 10),
                 rain = runif(10*36*80, min = 0 , max = 5),
                 swc = runif(10*36*80,min = 0, max = 50),
                 SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
                 SW_ini = runif(10*36*80),
                 PETc = runif(10*36*80, min = 0 , max = 1.3) 
                 )

my_fun <- function(df){
  SW.ini <- 2 # setting some initial values 
  SW.max <- 5
  SW.min <- 0

  df$PAW <- NA
  df$aetc <- NA
  df$SW <- NA

  df$PAW[1] <- SW.ini + df$rain[1]

  df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$PETc[1],(df$PAW[1]/df$swc[1])*df$PETc[1])
  df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
  df$SW[1] <- SW.ini + df$rain[1] -  df$aetc[1]
  df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))

  for (day in 2:nrow(df)){

    df$PAW[day] <- df$SW[day - 1] + df$rain[day]
    df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$PETc[day], (df$PAW[day]/df$swc[day]) * df$PETc[day])
    df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
    df$SW[day] <- df$SW[day - 1] + df$rain[day] -  df$aetc[day]
    df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
  }
  return(df)
}


library(tictoc)
library(tidyverse)


tic()
df  %>% 
  group_by(year) %>%
  do(my_fun(.)) -> 
  out
toc()
#> 5.075 sec elapsed

R: dplyr solution for for-loop with initial conditions set

Question

3 answers

solution1
5 ACCPTED 2018-03-06 22:51:47

Using `Rcpp` and `data.table`

solution2
1 2018-03-05 21:13:06

solution3
1 2018-05-21 20:27:31

R: dplyr solution for for-loop with initial conditions set

Question

3 answers

solution1 5 ACCPTED 2018-03-06 22:51:47

Using Rcpp and data.table

solution2 1 2018-03-05 21:13:06

solution3 1 2018-05-21 20:27:31

solution1
5 ACCPTED 2018-03-06 22:51:47

Using `Rcpp` and `data.table`

solution2
1 2018-03-05 21:13:06

solution3
1 2018-05-21 20:27:31