简体   繁体   中英

group_by and apply a rolling regression based on window using dplyr

I have some data which looks like:

# A tibble: 6,618 x 8
    Open  High   Low Close   Volumn Adjusted stock dates     
   <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl> <chr> <date>    
 1  232.  237.  230.  233. 15470700     233. 1     2007-01-03
 2  234.  241.  233.  241. 15834200     241. 1     2007-01-04
 3  240.  243.  238.  243. 13795600     243. 1     2007-01-05
 4  243.  244.  240.  241.  9544400     241. 1     2007-01-08

I would like to calculate a 30 day rolling regression. What I have currently is:

df %>%
  group_by(stock) %>% 
  rollapply(
    width = 30,
    FUN = function(x){
      LinearModel = lm(formula = Close ~ date, data = as.data.frame(x))
      return(LinearModel$coef)
    })

This doesn't work, but I would like to have new columns in the df where I have the 30 day slopes and intercepts. I have tried wrapping the above function into a mutate without luck. I am trying to do this for each group in the stock column.

Data:

library(quantmod)
library(dplyr)
library(stats)
getSymbols(c("GOOG", "MSFT"), from = "2010-01-01", to = "2010-06-01")

names_for_column <- c("Open", "High", "Low", "Close", "Volumn", "Adjusted")
colnames(GOOG) <- names_for_column
colnames(MSFT) <- names_for_column

df <- bind_rows(data.frame(GOOG), data.frame(MSFT), .id = "stock") %>% 
  mutate(dates = c(time(GOOG), time(MSFT))) %>% 
  tibble()

Expected output:

    Open  High   Low Close   Volumn Adjusted stock dates          intercept   slope
   <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl> <chr> <date>          
 1  232.  237.  230.  233. 15470700     233. 1     2007-01-03        NA (for 30 obs)
...
 30  234.  241.  233.  241. 15834200     241. 1     2007-01-04      -0.324     0.284

EDIT:

I would like the output to be similar to a rolling version of:

df %>%
  filter(stock == 1) %>%
  condense(LinearModel = lm(Close ~ dates, data = .)) %>%
  tidy(LinearModel) %>%
  pivot_wider(names_from = term, values_from = estimate:p.value)

Which gives:

# A tibble: 1 x 8
  `estimate_(Inte… estimate_dates `std.error_(Int… std.error_dates `statistic_(Int…
             <dbl>          <dbl>            <dbl>           <dbl>            <dbl>
1           -3123.          0.231             25.5         0.00159            -123.
# … with 3 more variables: statistic_dates <dbl>, `p.value_(Intercept)` <dbl>,
#   p.value_dates <dbl>

So I am hoping to bind this to the original data.

When I run:

df %>%
  filter(stock == 1) %>%
  condense(out = lm(Close ~ dates, data =.) %>% 
             tidy)

I get:

# A tibble: 1 x 1
# Rowwise: 
  out             
  <list>          
1 <tibble [2 × 5]>

Adding unnest()

df %>%
  filter(stock == 1) %>%
  condense(out = lm(Close ~ dates, data =.) %>% 
             tidy) %>% 
  unnest(out)

I get the same result (without the pivot_wider part) as before:

# A tibble: 2 x 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept) -3123.     25.5         -123.       0
2 dates           0.231   0.00159      145.       0

I want to flatten this data and link it up to the corresponding dates in the original data (with the first 30 rows containing NA). I am mostly interested in the values -3123 and 0.231 from the estimate column.

#

EDIT -

We can do a group_split and map over the list elements and then apply the rollapply

library(zoo)
library(dplyr)
library(purrr)
out <- df %>% 
        group_split(stock) %>%
        map(~ rollapply(.x,
           width = 30,
           FUN =  function(dat) {
           LinearModel = lm(formula = Close ~ dates,  as.data.frame(dat))
           LinearModel$coef
           }, by.column = FALSE, fill = NA_real_,  align = "right"))


length(out)
#[1] 2

If we want to update the original dataset with more columns

out <-  df %>% 
       group_split(stock) %>%
       map_dfr(~ {
           subdat <- .x
           rollapply(subdat,
           width = 30,
           FUN =  function(dat) {
           LinearModel = lm(formula = Close ~ dates,  as.data.frame(dat))
           LinearModel$coef
           }, by.column = FALSE, fill = NA_real_,  align = "right") %>% 
               as.data.frame %>%
               bind_cols(subdat, .)

           }

           )

ncol(out)
#[1] 38

ncol(df)
#[1] 8

In the devel version of dplyr , we can also do

out1 <- df %>% 
           group_by(stock) %>%
          condense(out =rollapply(cur_data(), width = 30,
           FUN = function(dat) lm(Close ~ dates, as.data.frame(dat))$coef,
           by.column = FALSE, fill = NA_real_, align = "right") %>% 
           as.data.frame %>%
           bind_cols(cur_data(), .))
out1
# A tibble: 2 x 2
# Rowwise:  stock
#  stock out                  
#  <chr> <list>               
#1 1     <tibble [3,309 × 37]>
#2 2     <tibble [3,309 × 37]>

The list column can be unnest ed when it is required

out1 %>% 
    unnest(c(out)) %>%
    head(3)
# A tibble: 3 x 38
#  stock  Open  High   Low Close Volumn Adjusted dates      `(Intercept)` `dates2007-01-0…
#  <chr> <dbl> <dbl> <dbl> <dbl>  <dbl>    <dbl> <date>             <dbl>            <dbl>
#1 1      232.  237.  230.  233. 1.55e7     233. 2007-01-03            NA               NA
#2 1      234.  241.  233.  241. 1.58e7     241. 2007-01-04            NA               NA
#3 1      240.  243.  238.  243. 1.38e7     243. 2007-01-05            NA               NA
# … with 28 more variables: `dates2007-01-05` <dbl>, `dates2007-01-08` <dbl>,
#   `dates2007-01-09` <dbl>, `dates2007-01-10` <dbl>, `dates2007-01-11` <dbl>,
#   `dates2007-01-12` <dbl>, `dates2007-01-16` <dbl>, `dates2007-01-17` <dbl>,
#   `dates2007-01-18` <dbl>, `dates2007-01-19` <dbl>, `dates2007-01-22` <dbl>,
#   `dates2007-01-23` <dbl>, `dates2007-01-24` <dbl>, `dates2007-01-25` <dbl>,
#   `dates2007-01-26` <dbl>, `dates2007-01-29` <dbl>, `dates2007-01-30` <dbl>,
#   `dates2007-01-31` <dbl>, `dates2007-02-01` <dbl>, `dates2007-02-02` <dbl>,
#   `dates2007-02-05` <dbl>, `dates2007-02-06` <dbl>, `dates2007-02-07` <dbl>,
#   `dates2007-02-08` <dbl>, `dates2007-02-09` <dbl>, `dates2007-02-12` <dbl>,
#   `dates2007-02-13` <dbl>, `dates2007-02-14` <dbl>

We can apply the tidy within the condense

library(broom)

out3 <-  df %>% 
   group_split(stock) %>%
   map_dfr(~ {
       subdat <- .x
       rollapply(subdat,
       width = 30,
       FUN =  function(dat) {
       LinearModel = lm(formula = Close ~ dates,  as.data.frame(dat))
       tidy(LinearModel)
       }, by.column = FALSE, fill = NA_real_,  align = "right") %>% 
           as.data.frame %>%
           bind_cols(subdat, .)

       }

       )



dim(out3)
#[1] 6618   13
names(out3)
# [1] "Open"      "High"      "Low"       "Close"     "Volumn"    "Adjusted"  "stock"    
# [8] "dates"     "term"      "estimate"  "std.error" "statistic" "p.value"  

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM