简体   繁体   中英

R recode within group ID

I want to (1) create a unique group ID, and (2) recode one variable if it meets a condition within the group. I have the following data of ATM locations:

data <- tribble(
  ~address, ~date, ~terminal_id, ~location_type_description, 
  "1 GATEWAY DR OROMOCTO", "2017-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2018-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2019-11-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-01-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-12-01", "NC79", "Financial Institution",
  
) %>%
  dplyr::mutate(
    dplyr::across(date, as.Date)
  )

After 2018, the location_type_description variable was incorrectly coded as "Financial Institution".

Condition : if the location_type_description within an address and terminal_id is anything other than "Financial Institution" before the year 2019, then we recode the location_type_description to be whatever is was before 2019. But if the location_type_description is "Financial Institution" for all years (2017 onwards) then we know if was coded correctly. In our example, since it was "Gas Station" in 2017 and 2018, we know that anything after 2018 is actually a gas station. Here is what the output would look like in the toy data

data_clean <- tribble(
  ~address, ~date, ~terminal_id, ~location_type_description, ~group_identifier, ~location_corrected, ~location_changed,
  "1 GATEWAY DR OROMOCTO", "2017-01-01", "NC79", "Gas Station", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2018-01-01", "NC79", "Gas Station", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2019-11-01", "NC79", "Financial Institution", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2020-01-01", "NC79", "Financial Institution", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2020-02-01", "NC79", "Financial Institution", 1, "Gas Station", "yes"
  
) %>%
  dplyr::mutate(
    dplyr::across(date, as.Date)
  )

How about this:

  library(dplyr)
  data <- tibble::tribble(
  ~address, ~date, ~terminal_id, ~location_type_description, 
  "1 GATEWAY DR OROMOCTO", "2017-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2018-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2019-11-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-01-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-12-01", "NC79", "Financial Institution",
  
) %>%
  dplyr::mutate(
    dplyr::across(date, as.Date)
  )

data %>% 
  group_by(address) %>% 
  mutate(id = cur_group_id(), 
         location_type_description = location_type_description[1])
#> # A tibble: 5 × 5
#> # Groups:   address [1]
#>   address               date       terminal_id location_type_description    id
#>   <chr>                 <date>     <chr>       <chr>                     <int>
#> 1 1 GATEWAY DR OROMOCTO 2017-01-01 NC79        Gas Station                   1
#> 2 1 GATEWAY DR OROMOCTO 2018-01-01 NC79        Gas Station                   1
#> 3 1 GATEWAY DR OROMOCTO 2019-11-01 NC79        Gas Station                   1
#> 4 1 GATEWAY DR OROMOCTO 2020-01-01 NC79        Gas Station                   1
#> 5 1 GATEWAY DR OROMOCTO 2020-12-01 NC79        Gas Station                   1

Created on 2022-06-29 by the reprex package (v2.0.1)

I added a few extra ATM locations to make sure it would work for various conditions.

library(magrittr)
library(dplyr)

data <- tribble(
  ~address, ~date, ~terminal_id, ~location_type_description, 
  "1 GATEWAY DR OROMOCTO", "2017-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2018-01-01", "NC79", "Gas Station",
  "1 GATEWAY DR OROMOCTO", "2019-11-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-01-01", "NC79", "Financial Institution",
  "1 GATEWAY DR OROMOCTO", "2020-12-01", "NC79", "Financial Institution",
  "4 PRIVET DR LITTLE WHINGING", "2017-01-01", "AB123", "Gas Station",
  "4 PRIVET DR LITTLE WHINGING", "2018-01-01", "AB123", "Gas Station",
  "4 PRIVET DR LITTLE WHINGING", "2019-11-01", "AB123", "Gas Station",
  "4 PRIVET DR LITTLE WHINGING", "2020-01-01", "AB123", "Gas Station",
  "4 PRIVET DR LITTLE WHINGING", "2020-12-01", "AB123", "Gas Station",
  "42 WALLABY WAY SYDNEY AUSTRALIA", "2017-01-01", "XY10", "Other",
  "42 WALLABY WAY SYDNEY AUSTRALIA", "2018-01-01", "XY10", "Other",
  "42 WALLABY WAY SYDNEY AUSTRALIA", "2019-11-01", "XY10", "Financial Institution",
  "42 WALLABY WAY SYDNEY AUSTRALIA", "2020-01-01", "XY10", "Financial Institution",
  "42 WALLABY WAY SYDNEY AUSTRALIA", "2020-12-01", "XY10", "Financial Institution",
  "742 EVERGREEN TERRACE SPRINGFIELD", "2017-01-01", "4227", "Financial Institution",
  "742 EVERGREEN TERRACE SPRINGFIELD", "2018-01-01", "4227", "Financial Institution",
  "742 EVERGREEN TERRACE SPRINGFIELD", "2019-11-01", "4227", "Financial Institution",
  "742 EVERGREEN TERRACE SPRINGFIELD", "2020-01-01", "4227", "Financial Institution",
  "742 EVERGREEN TERRACE SPRINGFIELD", "2020-12-01", "4227", "Financial Institution",
) %>%
  dplyr::mutate(
    dplyr::across(date, as.Date)
  )

data_clean <- tribble(
  ~address, ~date, ~terminal_id, ~location_type_description, ~group_identifier, ~location_corrected, ~location_changed,
  "1 GATEWAY DR OROMOCTO", "2017-01-01", "NC79", "Gas Station", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2018-01-01", "NC79", "Gas Station", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2019-11-01", "NC79", "Financial Institution", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2020-01-01", "NC79", "Financial Institution", 1, "Gas Station", "yes",
  "1 GATEWAY DR OROMOCTO", "2020-02-01", "NC79", "Financial Institution", 1, "Gas Station", "yes"
  
) %>%
  dplyr::mutate(
    dplyr::across(date, as.Date)
  )

# dataframe of address and group identifiers
groupID <- data.frame(terminal_id = unique(data$terminal_id), group_identifier = 1:length(unique(data$terminal_id)))
# dataframe of original location_types
OGloctype <- data %>%
  filter(date < as.Date('2019-01-01')) %>%
  rename(location_corrected = location_type_description) %>%
  select(c(terminal_id, location_corrected)) %>%
  distinct()

data %>%
  full_join(groupID, by = 'terminal_id') %>%
  full_join(OGloctype, by = 'terminal_id') %>%
  group_by(terminal_id) %>%
  # any() looks for any matches within the group
  mutate(location_changed = ifelse(any(location_corrected != location_type_description),
                                   'yes', 'no')) %>%
  ungroup()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM