Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

Question

I'm trying to predict real estate prices in R with Tidymodels. I'm following this tutorial. All goes well until the very end, when I try to run prediction on my test data.

Please see the below code example and the error at the very end.

I looked at two similar questions (here and here), but it seems that I have already defined variable roles and provided an unprepared recipe to my workflow.

    # libraries ---------------------------------------------------------------
    library(tidymodels)
    #> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
    #> ✓ broom     0.7.3      ✓ recipes   0.1.15
    #> ✓ dials     0.0.9      ✓ rsample   0.0.8 
    #> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
    #> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
    #> ✓ infer     0.5.4      ✓ tune      0.1.2 
    #> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
    #> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
    #> ✓ purrr     0.3.4
    #> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
    #> x purrr::discard() masks scales::discard()
    #> x dplyr::filter()  masks stats::filter()
    #> x dplyr::lag()     masks stats::lag()
    #> x recipes::step()  masks stats::step()
    library(data.table)
    
    library(purrr)
    
    
    # data --------------------------------------------------------------------
    # 're' means real estate
    # I'm using data.table in general. Using tribble below for cleaner data definition.
    # Columns: re_id is a character identifier; price_per_sqm_huf_mil is the
    # numeric column used as the model outcome further down; district and
    # num_room are entered as plain numbers here.
    real_estate_data <- tibble::tribble(
        ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
        "30876343",      0.534722222222222,        1,         3,
        "31914489",      0.476119402985075,        1,         1,
        "30972289",      0.507352941176471,        1,         2,
        "31739730",      0.472972972972973,        1,         3,
        "31783137",                0.49875,        2,         3,
        "31809435",      0.439705882352941,        2,         2,
        "31943408",      0.469117647058824,        2,         3,
        "31944348",       0.56231884057971,        2,         1,
        "31961146",      0.472972972972973,        3,         3,
        "24314388",      0.649550561797753,        3,         2,
        "29840270",      0.719178082191781,        3,         3,
        "29840429",      0.719178082191781,        3,         3,
        "30873484",      0.822857142857143,        4,         3,
        "30969673",      0.533802816901408,        4,         3,
        "31333120",      0.741511627906977,        4,         3,
        "31788730",      0.527142857142857,        4,         2,
        "31948441",      0.734848484848485,        5,         2,
        "31962350",                    0.8,        5,         3,
        "31962779",      0.670454545454545,        5,         3,
        "31979128",      0.689054054054054,        5,         1
    )
    
    # Convert the tibble to a data.table, then recode district as a factor with
    # data.table's `:=` so the recipe sees it as a nominal column.
    real_estate_data <- as.data.table(real_estate_data) %>% .[, district := factor(district)]
    
    # train/test split --------------------------------------------------------
    # Seed the RNG before the random split so the reprex is repeatable.
    set.seed(123)
    # initial_split() is called with its default settings; training() and
    # testing() then extract the two partitions from the split object.
    re_split <- initial_split(real_estate_data)
    re_train <- training(re_split)
    re_test  <- testing(re_split)
    
    # workflow (w/ recipe) ----------------------------------------------------
    # Recipe: model price_per_sqm_huf_mil on every other column, with re_id
    # demoted from predictor to an "ID" role.
    # NOTE(review): all_numeric() is not limited to predictors, so the
    # step_center() call below also selects the numeric outcome
    # price_per_sqm_huf_mil. Answer 1 identifies this as the cause of the
    # predict() error at the end of this snippet, because new data passed to
    # predict() has no outcome column to transform.
    # NOTE(review): step_scale() lists all_predictors() and all_numeric() as
    # separate selectors; presumably these are combined as a union, which would
    # pull in the outcome as well -- confirm against the recipes selector docs.
    # NOTE(review): district is already nominal (see summary() output below), so
    # `- district` does not appear to remove anything from all_numeric().
    re_rec <- recipe(re_train,
                     formula = price_per_sqm_huf_mil ~ .) %>%
        update_role(re_id, new_role = "ID") %>%
        step_center(all_numeric(), - district) %>%
        step_scale(all_predictors(), all_numeric(), - district) %>%
        step_dummy(district) %>%
        step_zv(all_predictors())
    
    # Inspect the variable types and roles the recipe has recorded.
    summary(re_rec)
    #> # A tibble: 4 x 4
    #>   variable              type    role      source  
    #>   <chr>                 <chr>   <chr>     <chr>   
    #> 1 re_id                 nominal ID        original
    #> 2 district              nominal predictor original
    #> 3 num_room              numeric predictor original
    #> 4 price_per_sqm_huf_mil numeric outcome   original
    
    # Linear regression model specification using the "lm" engine.
    lr_model <-
        linear_reg() %>%
        set_engine("lm")
    
    # Bundle the model specification and the (unprepared) recipe in a workflow.
    re_wflow <-
        workflow() %>%
        add_model(lr_model) %>%
        add_recipe(re_rec)
    
    # model training and prediction -------------------------------------------
    # Fitting the workflow on the training data succeeds.
    re_fit <-
        re_wflow %>%
        fit(data = re_train)
    
    # Predicting on the test set fails with the error recorded below.
    re_pred <- predict(re_fit, re_test)
    #> Error: Can't subset columns that don't exist.
    #> x Column `price_per_sqm_huf_mil` doesn't exist.

^{Created on 2021-01-25 by the reprex package (v0.3.0)}

Many thanks!

Answer 1

The issue here is that you used step_center() to transform the outcome (price_per_sqm_huf_mil), and at prediction time there is no outcome available. You can instead specify that you want to center only the columns that are both predictors and numeric, using all_predictors() & all_numeric() (the code below applies the same selector to step_scale()), like this:

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8 
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.4      ✓ tune      0.1.2 
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(dplyr)

# Same example data as in the question, kept as a tibble. re_id is a character
# identifier, price_per_sqm_huf_mil is the outcome used by the recipe below,
# and district / num_room are entered as plain numbers.
real_estate_data <- tibble::tribble(
  ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343",      0.534722222222222,        1,         3,
  "31914489",      0.476119402985075,        1,         1,
  "30972289",      0.507352941176471,        1,         2,
  "31739730",      0.472972972972973,        1,         3,
  "31783137",                0.49875,        2,         3,
  "31809435",      0.439705882352941,        2,         2,
  "31943408",      0.469117647058824,        2,         3,
  "31944348",       0.56231884057971,        2,         1,
  "31961146",      0.472972972972973,        3,         3,
  "24314388",      0.649550561797753,        3,         2,
  "29840270",      0.719178082191781,        3,         3,
  "29840429",      0.719178082191781,        3,         3,
  "30873484",      0.822857142857143,        4,         3,
  "30969673",      0.533802816901408,        4,         3,
  "31333120",      0.741511627906977,        4,         3,
  "31788730",      0.527142857142857,        4,         2,
  "31948441",      0.734848484848485,        5,         2,
  "31962350",                    0.8,        5,         3,
  "31962779",      0.670454545454545,        5,         3,
  "31979128",      0.689054054054054,        5,         1
) %>%
  # Recode district as a factor so the recipe treats it as nominal.
  mutate(district = factor(district))


# Seed the RNG before the random split so the reprex is repeatable.
set.seed(123)
# initial_split() is called with its default settings; training() and
# testing() then extract the two partitions from the split object.
re_split <- initial_split(real_estate_data)
re_train <- training(re_split)
re_test  <- testing(re_split)

# Recipe: price_per_sqm_huf_mil is the outcome, re_id is moved to an "ID"
# role, and centering/scaling is restricted to columns that are BOTH
# predictors and numeric, so no step needs the outcome at prediction time.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())

# Inspect the variable types and roles the recipe has recorded.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source  
#>   <chr>                 <chr>   <chr>     <chr>   
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original

# Linear regression model specification using the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")

# Workflow that carries the model specification and the unprepared recipe.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# Estimate the recipe and fit the model on the training data.
re_fit <- fit(re_wflow, data = re_train)

# Prediction on the held-out rows now works: one .pred value per test row.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#>   .pred
#>   <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768

^{Created on 2021-01-25 by the reprex package (v0.3.0)}

This has tripped up more folks than just you, so we are working on adding a new set of selectors that will be merged in soon. The other option to think about, if you really do want to try transforming an outcome, is to look into using skip = TRUE.

Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

Question

1 answer

solution1
4 ACCEPTED 2021-01-25 20:47:32

Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

Question

1 answer

solution1 4 ACCEPTED 2021-01-25 20:47:32

solution1
4 ACCEPTED 2021-01-25 20:47:32