I have a data which has 40 days of the year and some data
set.seed(123)
df <- data.frame(day = 1:40,rain = runif(40,min = 0, max = 3), petc = runif(40, min = 0.3, max = 8),swc = runif(40, min = 27.01, max = 117.43))
I want to calculate another variable called aetc for each day which is calculated as follows:
SW.ini <- 2 # setting some initial values
SW.max <- 5
SW.min <- 0
For day 1,
1) Determine a variable called PAW(day1) = SW.ini + rain(day1)
2) If PAW(day1) >= SWC(day1), aetc(day1) = petc(day1)
;
If `PAW(day1) < SWC(day1), aetc(day1) = PAW(day1)/SWC(day1) * petc(day1)`
3) Check if aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1)
aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1)
4) Update SW(day1) = SW.ini + rain(day1) - aetc(day1)
5) If SW(day1) > SW.max, SW(day1) = SW.max. Similarly if
SW(day1) > SW.max, SW(day1) = SW.max. Similarly if
SW(day1) < SW.min, SW(day1) = SW.min`
Repeat for day 2
1) Determine PAW(day2) = SW(day1) + rain(day2)
2) If PAW(day2) >= SWC(day2), aetc(day2) = petc(day2)
; If PAW(day2) < SWC(day2), aetc(day2) = PAW(day2)/SWC(day2) * petc(day2)
3) Check if aetc(day2) > PAW(day2)
. If yes, aetc(day2) = paw(day2)
4) Update SW(day2) = SW(day1) + rain(day2) - aetc(day2)
5) If SW(day2) > SW.max, SW(day2) = SW.max. Similarly if
SW(day2) > SW.max, SW(day2) = SW.max. Similarly if
SW(day2) < SW.min, SW(day2) = SW.min`
Here's my elegant for loop to do this:
df$PAW <- NA
df$aetc <- NA
df$SW <- NA
df$PAW[1] <- SW.ini + df$rain[1]
df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$petc[1],(df$PAW[1]/df$swc[1])*df$petc[1])
df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
df$SW[1] <- SW.ini + df$rain[1] - df$aetc[1]
df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))
for (day in 2:nrow(df)){
df$PAW[day] <- df$SW[day - 1] + df$rain[day]
df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$petc[day], (df$PAW[day]/df$swc[day]) * df$petc[day])
df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
df$SW[day] <- df$SW[day - 1] + df$rain[day] - df$aetc[day]
df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
}
My problem is that this is just one year of data and I want run it for multiple years.
set.seed(123)
df <- data.frame(year = 1980:2015, day = rep(1:40, each = 36),rain =
runif(40*36,min = 0, max = 3), petc = runif(40*36, min = 0.3, max = 8),swc = runif(40*36, min = 27.01, max = 117.43))
So I wanted to do something like
df %>% group_by(year) # and then run the above function for each year.
Is there a dplyr or any other solution to this?
Thank you
Note: I originally posted this answer on your follow up question, R: for loop within a foreach loop , but after seeing this one, it seems this answer is far more relevant here. (I don't address anything related to parallelizing in my answer, which was the topic of your follow up) .
Rcpp
and data.table
Compiling the logic with C++ and applying it by group using data.table grouping operations gives a ~2,000x speed-up from your baseline, far greater than you might hope to get by parallelizing.
On your original example, which had 39,420,000 rows , this executes on my machine in 1.883 seconds ; and on the revised one with 28,800 rows , this executes in 0.004 seconds
library(data.table)
library(Rcpp)
Define and compile a C++
function, CalcSW()
inline in the R script:
One note: counting in C
/ C++
starts at 0
, unlike R
, which starts at 1
-- that's why the indices are different here
Rcpp::cppFunction('
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}')
Create data.table
df <- data.table(loc.id = rep(1:10, each = 80*36),
year = rep(rep(1980:2015, each = 80), times = 10),
day = rep(rep(1:80, times = 36),times = 10),
rain = runif(10*36*80, min = 0 , max = 5),
swc = runif(10*36*80,min = 0, max = 50),
SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
SW_ini = runif(10*36*80),
PETc = runif(10*36*80, min = 0 , max = 1.3),
SW = as.numeric(NA),
PAW = as.numeric(NA),
aetc = as.numeric(NA))
setkey(df, loc.id, year, day)
Execute the function CalcSW()
on the df
for each combination of loc.id
and year
, assign returned values to the three columns simultaneously:
system.time({
df[, c("SW","PAW","aetc") := CalcSW(SW_ini,
SW_max,
rain,
swc,
PETc), keyby = .(loc.id, year)]
})
...
user system elapsed
0.004 0.000 0.004
Results:
head(df)
...
loc.id year day rain swc SW_max SW_ini PETc SW PAW aetc
1: 1 1980 1 0.35813251 28.360715 177.3943 0.69116310 0.2870478 1.038675 1.049296 0.01062025
2: 1 1980 2 1.10331116 37.013022 177.3943 0.02742273 0.4412420 2.125335 1.396808 0.01665171
3: 1 1980 3 1.76680011 32.509970 177.3943 0.66273062 1.1071233 3.807561 2.483467 0.08457420
4: 1 1980 4 3.20966558 8.252797 177.3943 0.12220454 0.3496968 6.840713 4.165693 0.17651342
5: 1 1980 5 1.32498191 14.784203 177.3943 0.66381497 1.2168838 7.573160 7.198845 0.59253503
6: 1 1980 6 0.02547458 47.903637 177.3943 0.21871598 1.0864713 7.418750 7.931292 0.17988449
I'm not 100% positive I implemented your logic perfectly, but the logic should be pretty straightforward to tweak where I may have missed something, I implemented it in a very similar manner to how you laid it out.
One other note: It's way easier to write C++
with auto-indenting and code highlighting (whether you're using RStudio or Emacs) you get if you create a separate file, named something like TestCode.cpp
formatted like below.
Then, you can either use Rcpp::sourceCpp("TestCode.cpp")
to compile your function in your R Script, or you can copy and paste everything except for the first three lines as a character string into as an argument of Rcpp::cppFunction()
like I did above.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}
You could wrap your code in another for loop and save each years df in a list:
library(tidyverse)
lst <- vector("list", length(unique(df$year)))
for (i in seq_along(unique(df$year))) {
df_year <- df %>% filter(year == unique(df$year)[[i]])
# rest of code with df_year replacing df
lst[[i]] <- df_year
}
final_df <- bind_rows(lst)
The data.table
illustration from Matt is a very good illustration of how fast data.table
can be because it does the calculations in place with no copies and moving around of data.
However, to answer the crux of your question about using pipes, you can use group_by
along with do
to accomplish what you are after (albeit much slower than data.table
)
Below I set up the same dummy data Matt did. Then I use your function (but with the case fixed on PETc
). It's not fast, but it's pretty easy to follow.
df <- data.frame(loc.id = rep(1:10, each = 80*36),
year = rep(rep(1980:2015, each = 80), times = 10),
day = rep(rep(1:80, times = 36),times = 10),
rain = runif(10*36*80, min = 0 , max = 5),
swc = runif(10*36*80,min = 0, max = 50),
SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
SW_ini = runif(10*36*80),
PETc = runif(10*36*80, min = 0 , max = 1.3)
)
my_fun <- function(df){
SW.ini <- 2 # setting some initial values
SW.max <- 5
SW.min <- 0
df$PAW <- NA
df$aetc <- NA
df$SW <- NA
df$PAW[1] <- SW.ini + df$rain[1]
df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$PETc[1],(df$PAW[1]/df$swc[1])*df$PETc[1])
df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
df$SW[1] <- SW.ini + df$rain[1] - df$aetc[1]
df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))
for (day in 2:nrow(df)){
df$PAW[day] <- df$SW[day - 1] + df$rain[day]
df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$PETc[day], (df$PAW[day]/df$swc[day]) * df$PETc[day])
df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
df$SW[day] <- df$SW[day - 1] + df$rain[day] - df$aetc[day]
df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
}
return(df)
}
library(tictoc)
library(tidyverse)
tic()
df %>%
group_by(year) %>%
do(my_fun(.)) ->
out
toc()
#> 5.075 sec elapsed
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.