简体   繁体   中英

Replacing for loop with apply function in R

I am trying to calculate the average number of goals home and away team has scored, but "todays" game.

The data can be found here : http://www.football-data.co.uk/mmz4281/1415/E0.csv

My code

pl <- pl[,2:6]
pl$Date <- as.Date(pl$Date, "%d/%m/%y")

pl$HomeTeam <- as.character(pl$HomeTeam)
pl$AwayTeam <- as.character(pl$AwayTeam)

pl.func <- function(tf){
  tf$avg.ht <- rep(NA,nrow(tf))
  tf$avg.at <- rep(NA,nrow(tf))

  for(i in 1:nrow(tf)){
     tf$avg.ht[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$HomeTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i])
     tf$avg.at[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$AwayTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i])
  }
  return(tf)
}

pl <- pl.func(pl)

I need to "match" on team, and a earlier date. The above code works, but is slow as I want to calculate several hundreds of calculation. Can anyone hint or show how I can do this with some kind of apply function? I could not succed as I dont know to to replace the [i] argument from the loop on a correct way.

What you actually need are running conditional averages. Recently, I answered a similiar question where the OP needed running averages every 15 minutes by group where you need running averages for every past game played by team.

So consider the following sapply() approach which using sample data and running your code alongside, returns equivalent output. Possibly performance may be more enhanced for your needs:

pl$runavgHT <- sapply(1:nrow(pl),
                    function(i) {
                      (sum(((pl[1:i, c("Date")] < (pl$Date[i]))
                           & (pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
                          *  pl[1:i,]$FTHG) +
                       sum(((pl[1:i, c("Date")] < (pl$Date[i]))
                           & (pl[1:i, c("AwayTeam")] == pl$HomeTeam[i]))
                          *  pl[1:i,]$FTAG)) /

                       sum(((pl[1:i, c("Date")] < (pl$Date[i])) & 
                            (pl[1:i, c("HomeTeam")] == pl$HomeTeam[i])) 
                          |((pl[1:i, c("Date")] < (pl$Date[i])) & 
                            (pl[1:i, c("AwayTeam")] == pl$HomeTeam[i])))
                    }
             )

pl$runavgAT <- sapply(1:nrow(pl),
                    function(i) {
                      (sum(((pl[1:i, c("Date")] < (pl$Date[i]))
                            & (pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
                           *  pl[1:i,]$FTHG) +
                       sum(((pl[1:i, c("Date")] < (pl$Date[i]))
                            & (pl[1:i, c("AwayTeam")] == pl$AwayTeam[i]))
                           *  pl[1:i,]$FTAG)) /

                       sum(((pl[1:i, c("Date")] < (pl$Date[i])) & 
                            (pl[1:i, c("HomeTeam")] == pl$AwayTeam[i])) 
                          |((pl[1:i, c("Date")] < (pl$Date[i])) & 
                            (pl[1:i, c("AwayTeam")] == pl$AwayTeam[i])))
                    }
)

Here are some possible improvements (and a final benchmark) :

1) this is a modified version of your function with just some improvements in the loop :

pl.func2 <- function(DF){
  DF$avg.ht <- rep(NA,nrow(DF))
  DF$avg.at <- rep(NA,nrow(DF))

  for(i in 1:nrow(DF)){
     currDate <- DF$Date[i]
     currHT <- DF$HomeTeam[i]
     currAT <- DF$AwayTeam[i]

     prevHT.eq.HT <- which(DF$HomeTeam == currHT & DF$Date < currDate)
     prevHT.eq.AT <- which(DF$HomeTeam == currAT & DF$Date < currDate)
     prevAT.eq.HT <- which(DF$AwayTeam == currHT & DF$Date < currDate)
     prevAT.eq.AT <- which(DF$AwayTeam == currAT & DF$Date < currDate)

     DF$avg.ht[i] <- (sum(DF$FTHG[prevHT.eq.HT]) + sum(tf$FTAG[prevAT.eq.HT])) / (length(prevHT.eq.HT) + length(prevAT.eq.HT))
     DF$avg.at[i] <- (sum(DF$FTHG[prevHT.eq.AT]) + sum(tf$FTAG[prevAT.eq.AT])) / (length(prevHT.eq.AT) + length(prevAT.eq.AT))

  }
  return(DF)
}

2) this is another modified version of your function which uses cumulated infos to avoid subsetting and sum all the previous days (NB this requires the data.frame to be ordered by Date):

pl.func3 <- function(DF){
  DF$avg.ht <- rep(NA,nrow(DF))
  DF$avg.at <- rep(NA,nrow(DF))

  teams <- unique(c(DF$HomeTeam,DF$AwayTeam))
  cumul.info <- t(sapply(teams,FUN=function(team) c(cumulFTG=0,cumulMatches=0)))

  # store column indexes to reuse them
  cumulFTG <- 1
  cumulMatches <- 2

  for(i in 1:nrow(DF)){
     currHT <- DF$HomeTeam[i]
     currAT <- DF$AwayTeam[i]

     DF$avg.ht[i] <- cumul.info[currHT,cumulFTG] / cumul.info[currHT,cumulMatches]
     DF$avg.at[i] <- cumul.info[currAT,cumulFTG] / cumul.info[currAT,cumulMatches]

     cumul.info[currHT,cumulFTG] = cumul.info[currHT,cumulFTG] + DF$FTHG[i]
     cumul.info[currHT,cumulMatches] = cumul.info[currHT,cumulMatches] + 1

     cumul.info[currAT,cumulFTG] = cumul.info[currAT,cumulFTG] + DF$FTAG[i]
     cumul.info[currAT,cumulMatches] = cumul.info[currAT,cumulMatches] + 1

  }
  return(DF)
}

Check and benchmark :

# this is necessary for pl.func3
pl <- pl[order(pl$Date),] 

# are the results identical ? -> TRUE
identical(pl.func(pl),pl.func2(pl)) && identical(pl.func(pl),pl.func3(pl))

# benchmark
library(microbenchmark)
microbenchmark(pl.func(pl),pl.func2(pl),pl.func3(pl))

Unit: milliseconds
         expr       min        lq      mean    median        uq      max neval cld
  pl.func(pl) 184.36644 186.10643 188.38130 187.16322 188.80065 255.2101   100   c
 pl.func2(pl)  84.95047  85.80966  89.27945  87.41589  88.33845 159.6284   100  b 
 pl.func3(pl)  30.72683  31.05515  32.02944  31.41211  33.22858  35.8644   100 a 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM