简体   繁体   中英

Use dplyr to summarize range from min and max across multiple columns

I am trying to automatically generate several new variables based on existing variables. I would like to get the range of values for each of "a", "b" and "c" using their respective min and max variables. The data I am simulating are from satellite sensors aggregated to zonal statistics which would mean that each row is a polygon feature.

Here is a toy data frame to use:

# Toy data: 100 rows with a min and a max column for each of a, b and c.
# Each *.min range lies at or below its *.max range, so max - min is never
# negative. No seed is set in this snippet, so the values differ on every run.
dat <- data.frame(a.min = runif(100, 0, 100), 
                  b.min = runif(100, 0, 10), 
                  c.min = runif(100, 0, 0.5), 
                  a.max = runif(100, 100, 200), 
                  b.max = runif(100, 10, 20), 
                  c.max = runif(100, 0.5, 1))

Here is the manual way of performing this action:

# Manual approach: one range column per variable, computed as max minus min.
dat[["a.range"]] <- dat[["a.max"]] - dat[["a.min"]]
dat[["b.range"]] <- dat[["b.max"]] - dat[["b.min"]]
dat[["c.range"]] <- dat[["c.max"]] - dat[["c.min"]]

head(dat)

How can I accomplish this in an automated fashion with dplyr? I know that I will have NA values in my data.

So far I have:

# Keep only the min/max columns. `dat` is already supplied by the pipe as the
# data argument, so it must not be passed to select() a second time.
dat %>% select(matches("min|max"))

I tried to define a range function:

# Range of a variable: its maximum `y` minus its minimum `x`.
# Works element-wise on vectors; an NA in either argument yields NA.
rng <- function(x, y) {
  y - x
}

I am not sure where to go after select. I think I need to use "mutate" or "across"?

Cheers and thanks!

We could pivot to long format with pivot_longer and get the range

library(dplyr)
library(tidyr)
# Reshape so each variable (a, b, c) becomes one column and `grp` records
# whether a value came from a "min" or a "max" column, subtract the two groups,
# then bind the new *.range columns back onto the original data.
dat1 <- dat %>%
   # ".value" turns the part of each name before the dot into the output
   # column name; the part after the dot ("min"/"max") goes into `grp`.
   pivot_longer(cols = everything(), names_to = c(".value", 'grp'), 
      names_sep = "\\.") %>%
  # Assumes the long data keeps the original row order, so the differences
  # line up with the rows of `dat`.
  # NOTE(review): this returns more than one row from summarise(); newer dplyr
  # releases (>= 1.1.0) appear to prefer reframe() for that — confirm.
  summarise(across(a:c,  ~.[grp == 'max']- .[grp == 'min'],
        .names = '{.col}.range')) %>% 
  bind_cols(dat, .)

-output

head(dat1)
#     a.min    b.min     c.min    a.max    b.max     c.max   a.range   b.range   c.range
#1 27.646339 4.055958 0.1095838 179.7785 14.82492 0.5455450 152.13219 10.768966 0.4359612
#2 77.459085 9.549793 0.4220214 187.4912 12.64510 0.5871106 110.03215  3.095303 0.1650892
#3 79.308797 8.449052 0.2786377 137.7695 15.64397 0.9327440  58.46075  7.194922 0.6541063
#4  8.430773 2.060054 0.3746367 125.9992 17.76314 0.9935886 117.56838 15.703083 0.6189519
#5 89.627414 5.498631 0.3217548 112.5346 17.39814 0.8001432  22.90720 11.899511 0.4783885
#6 74.553222 9.621933 0.4568924 156.3704 18.85852 0.7971354  81.81716  9.236589 0.3402430

A minor alternative to akrun's excellent answer:

# set.seed(42)
# dat <- ...
head(dat)
#      a.min    b.min      c.min    a.max    b.max     c.max
# 1 91.48060 6.262453 0.44255884 148.3768 10.22700 0.5682526
# 2 93.70754 2.171577 0.25855553 144.4570 15.13240 0.5885682
# 3 28.61395 2.165673 0.42596549 106.0386 16.30726 0.7597802
# 4 83.04476 3.889450 0.22139813 132.7506 14.18772 0.9055604
# 5 64.17455 9.424557 0.07894005 187.8429 18.79266 0.5576810
# 6 51.90959 9.626080 0.22116232 193.0605 11.07987 0.9467109

library(dplyr)
library(tidyr)  # pivot_longer() and pivot_wider() come from tidyr, not dplyr
head(dat) %>%
  # Row id so each original row can be reassembled after reshaping.
  mutate(rn = row_number()) %>%
  # Split "a.min" into ltr = "a" and a value column named "min" (or "max").
  pivot_longer(-rn, names_pattern = "(.)\\.(.*)", names_to=c("ltr", ".value")) %>%
  mutate(range = max - min) %>%
  # Spread back to one row per `rn`, naming columns "<ltr>.<min|max|range>".
  pivot_wider(names_glue="{ltr}.{.value}", names_from = "ltr", values_from = min:range)
# # A tibble: 6 x 10
#      rn a.min b.min  c.min a.max b.max c.max a.range b.range c.range
#   <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
# 1     1  91.5  6.26 0.443   148.  10.2 0.568    56.9    3.96   0.126
# 2     2  93.7  2.17 0.259   144.  15.1 0.589    50.7   13.0    0.330
# 3     3  28.6  2.17 0.426   106.  16.3 0.760    77.4   14.1    0.334
# 4     4  83.0  3.89 0.221   133.  14.2 0.906    49.7   10.3    0.684
# 5     5  64.2  9.42 0.0789  188.  18.8 0.558   124.     9.37   0.479
# 6     6  51.9  9.63 0.221   193.  11.1 0.947   141.     1.45   0.726

This might be slightly less efficient, since it performs one more reshape operation (a little more work), whereas akrun's answer binds the result back onto the original data. With this sample data, I believe akrun's runs faster.

This can be done in dplyr only

You may use mutate instead if you want to add the columns to the existing data

# Fixed seed so the random toy data is reproducible.
set.seed(1)
# 100 rows with a min and a max column for each of a, b and c.
dat <- data.frame(a.min = runif(100, 0, 100), 
                  b.min = runif(100, 0, 10), 
                  c.min = runif(100, 0, 0.5), 
                  a.max = runif(100, 100, 200), 
                  b.max = runif(100, 10, 20), 
                  c.max = runif(100, 0.5, 1))



library(dplyr)

# For every column ending in "max", subtract the matching "min" column and
# keep only the resulting columns.
dat %>%
  # cur_column() is the name of the column being processed; gsub() swaps "max"
  # for "min" and get() fetches that sibling column by name.
  # .names appends ".range" to the full column name, e.g. "a.max.range".
  transmute(across(ends_with("max"), ~ . - get(gsub( "max", "min", cur_column())), 
                   .names = '{.col}.range'))
#>     a.max.range b.max.range c.max.range
#> 1     140.82036  10.0415368   0.6433344
#> 2      72.27340   8.3187269   0.7348150
#> 3      91.97428  16.8411799   0.5706304
#> 4      55.33441   9.0516443   0.6971961
#> 5     117.35346  13.1020379   0.6455329
#> 6     109.27095  15.1048262   0.7254760
#> 7      23.16754  12.4098472   0.4197071
#> 8     115.26374  13.0289951   0.8601976
#> 9      43.93326   0.8707504   0.7501387
#> 10    133.86635  13.4154775   0.4073330
#> 11     93.51698  10.1757853   0.4563699
#> 12    101.67531   6.2561323   0.6834871
#> 13    115.43289  13.9090815   0.1224529
#> 14    133.58103  13.6143533   0.3899003
#> 15     49.73707  15.5764745   0.4489774
#> 16     99.73024  14.6274746   0.4395670
#> 17     36.54954   7.7908846   0.4982439
#> 18     36.19781  12.0486821   0.2943119
#> 19    158.91736  12.4872790   0.1907941
#> 20     84.72690  11.8269226   0.6489648
#> 21     72.99130   4.4287902   0.7833635
#> 22    110.03471  10.1913907   0.7717606
#> 23     75.40158  11.7866144   0.5131911
#> 24    187.05223   9.6972425   0.3091013
#> 25    158.78617   5.8966645   0.2320550
#> 26    156.74343   9.5186809   0.7086089
#> 27    179.89148   5.7414122   0.7733848
#> 28    139.97942  17.2502681   0.8561180
#> 29     39.81873  16.0972592   0.7696968
#> 30    142.18025  12.8372130   0.5699562
#> 31    150.42315  13.6084027   0.6296052
#> 32     69.40397   9.9539625   0.4489674
#> 33     90.58098  13.4321886   0.4809792
#> 34    162.59139   8.9506943   0.3231189
#> 35     24.97784   1.7643494   0.3635593
#> 36     69.52301  12.0359528   0.1767952
#> 37     64.83526  11.7874100   0.3404482
#> 38    104.87705   5.2612129   0.3909888
#> 39     85.84943   4.9707435   0.3720141
#> 40    155.88877  14.1287602   0.5845266
#> 41    116.85535   3.5874035   0.5457002
#> 42     52.93918   6.9245058   0.6886609
#> 43     75.91977  12.6198181   0.3714896
#> 44     83.12676  13.5158301   0.5930698
#> 45    114.64445   5.3493943   0.3910495
#> 46     47.99375   9.7052778   0.4522083
#> 47    144.59197  11.9143685   0.7836076
#> 48     69.45700  11.9580705   0.6232934
#> 49     63.68757  10.1873592   0.5509458
#> 50    103.26737   2.0548773   0.3948805
#> 51    100.85295  13.4967207   0.5544158
#> 52     20.25930   8.8535057   0.3196302
#> 53    134.64491  10.4149506   0.2701788
#> 54    117.35244   7.1711213   0.6513636
#> 55    191.03390   5.5374946   0.6428320
#> 56    118.34178  14.7696171   0.6368492
#> 57    153.16104  10.4853131   0.4231595
#> 58     56.36050  19.1182602   0.2755695
#> 59    122.44537  13.2603647   0.7241478
#> 60    106.51005   8.5225040   0.4473165
#> 61     19.62250   8.7731860   0.5653734
#> 62    103.96746   5.8030382   0.4078416
#> 63    137.83508   5.5569751   0.5319219
#> 64     94.44552   6.6147425   0.4719155
#> 65     93.61647   3.8031070   0.6024145
#> 66    157.87155  15.2528954   0.7655847
#> 67     59.26088  16.5202480   0.5885608
#> 68     93.64681   8.2759799   0.4447008
#> 69    161.45776   7.5246207   0.3301701
#> 70     58.86411  13.5264139   0.8940299
#> 71    109.78582   8.3048106   0.5480910
#> 72     72.27364  11.1453218   0.6546549
#> 73    158.17997   8.5640846   0.4351467
#> 74     89.66915  10.0578865   0.4625397
#> 75     74.54625   7.3722673   0.7645326
#> 76     52.80176   3.4467085   0.6665480
#> 77     46.91813  12.1931480   0.7052888
#> 78    147.48180   6.9489775   0.6135230
#> 79     39.98738   8.9256461   0.6826381
#> 80     53.27007   7.8884606   0.4832842
#> 81     99.50539  14.6658313   0.6602186
#> 82     85.17492   9.6414111   0.7108137
#> 83    125.61679   9.6300615   0.3831801
#> 84    165.32019  14.4347833   0.3311072
#> 85     47.50740   7.1136165   0.7742447
#> 86    103.81193  13.1305719   0.6383449
#> 87    108.57149   4.3167687   0.3777025
#> 88    170.99798   2.7733797   0.6866086
#> 89     86.80192   9.3385324   0.9150526
#> 90    182.00076   5.8646475   0.7437171
#> 91     90.76935   3.1099736   0.6635475
#> 92    108.46926  15.5023161   0.5557700
#> 93    128.29417  11.9207560   0.3982395
#> 94     63.07664   2.3702449   0.6624636
#> 95     37.59363  11.1587006   0.6616531
#> 96     55.09932   8.6900225   0.4835542
#> 97    120.45466  13.2367024   0.6903333
#> 98     90.16883   6.7650279   0.6880808
#> 99     54.07032  15.2828207   0.6647165
#> 100    54.29124   2.7231295   0.2606158

using mutate

# NOTE(review): the printed output matches the seed-42 data shown elsewhere on
# this page, so `dat` is presumably re-created after this call — confirm.
set.seed(42)

# Same computation as the transmute() version, but mutate() keeps the original
# columns and appends the new *.max.range columns.
dat %>%
  # get() fetches the sibling "min" column whose name is derived from the
  # current "max" column name.
  mutate(across(ends_with("max"), ~ . - get(gsub( "max", "min", cur_column())), 
                   .names = '{.col}.range')) %>% 
  head()
#>      a.min    b.min      c.min    a.max    b.max     c.max a.max.range
#> 1 91.48060 6.262453 0.44255884 148.3768 10.22700 0.5682526    56.89621
#> 2 93.70754 2.171577 0.25855553 144.4570 15.13240 0.5885682    50.74941
#> 3 28.61395 2.165673 0.42596549 106.0386 16.30726 0.7597802    77.42461
#> 4 83.04476 3.889450 0.22139813 132.7506 14.18772 0.9055604    49.70584
#> 5 64.17455 9.424557 0.07894005 187.8429 18.79266 0.5576810   123.66835
#> 6 51.90959 9.626080 0.22116232 193.0605 11.07987 0.9467109   141.15089
#>   b.max.range c.max.range
#> 1    3.964547   0.1256938
#> 2   12.960818   0.3300127
#> 3   14.141588   0.3338147
#> 4   10.298266   0.6841623
#> 5    9.368103   0.4787410
#> 6    1.453791   0.7255486

Created on 2021-04-29 by the reprex package (v2.0.0)

If the prefix part of the column names, such as "a", "b", "c", are known, this could be an alternative which may be easier to read, and slightly more efficient.

library(tidyverse)

# Known variable prefixes; each has a "<prefix>.min" and "<prefix>.max" column.
base_names <- c("a", "b", "c")

# Subtract the min columns from the max columns as whole data frames, label the
# differences "<prefix>.range", and append them to the original data.
result <- dat %>% {
  upper <- select(., str_c(base_names, ".max"))
  lower <- select(., str_c(base_names, ".min"))
  spread <- setNames(upper - lower, str_c(base_names, ".range"))
  bind_cols(., spread)
}

You can also use the following solution. The combination of the glue and get functions can be very useful in situations like this: you create the names of the variables with glue and then retrieve their respective values with get.

library(dplyr)
library(purrr)
library(glue)

# Build a.range, b.range and c.range by constructing the "<prefix>.max" and
# "<prefix>.min" column names with glue() and fetching their values with get().
dat %>%
  # NOTE(review): the subtraction is already vectorised, so rowwise() may be
  # unnecessary here; dropping it would change the class of the result — confirm.
  rowwise() %>%
  # The list names (a.range, ...) become the names of the new columns.
  mutate(map_dfc(list(a.range = "a", 
                  b.range = "b",
                  c.range = "c"), ~ get(glue("{.x}.max")) - get(glue("{.x}.min"))))


# A tibble: 100 x 9
# Rowwise: 
   a.min  b.min  c.min a.max b.max c.max a.range b.range c.range
   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
 1  91.5 6.26   0.443   148.  10.2 0.568    56.9    3.96   0.126
 2  93.7 2.17   0.259   144.  15.1 0.589    50.7   13.0    0.330
 3  28.6 2.17   0.426   106.  16.3 0.760    77.4   14.1    0.334
 4  83.0 3.89   0.221   133.  14.2 0.906    49.7   10.3    0.684
 5  64.2 9.42   0.0789  188.  18.8 0.558   124.     9.37   0.479
 6  51.9 9.63   0.221   193.  11.1 0.947   141.     1.45   0.726
 7  73.7 7.40   0.484   139.  19.8 0.788    65.6   12.4    0.304
 8  13.5 7.33   0.242   116.  12.6 0.573   102.     5.32   0.331
 9  65.7 5.36   0.126   132.  10.8 0.951    66.3    5.49   0.825
10  70.5 0.0227 0.130   131.  13.9 0.627    60.2   13.8    0.497
# ... with 90 more rows

Data

# Fixed seed so the random toy data is reproducible.
set.seed(42)
# 100 rows with a min and a max column for each of a, b and c.
dat <- data.frame(a.min = runif(100, 0, 100), 
                  b.min = runif(100, 0, 10), 
                  c.min = runif(100, 0, 0.5), 
                  a.max = runif(100, 100, 200), 
                  b.max = runif(100, 10, 20), 
                  c.max = runif(100, 0.5, 1))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM