简体   繁体   中英

comparing groups and filter common observations using dplyr

This is a sample dataset -

data.frame(ISIN = c("US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US9898171015", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US5535301064", "US5535301064", "US9898171015", 
                               "US9898171015", "US9898171015"), year = c(2016, 2017, 2009, 2010, 
                                                                         2011, 2012, 2013, 2014, 2015, 2015, 2010, 2011, 2012, 2013, 2014, 
                                                                         2015, 2016, 2017, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2000, 
                                                                         2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 
                                                                         2012, 2013, 2014, 2015, 2016, 2017, 2000, 2001, 2002, 2003, 2004, 
                                                                         2005, 2006, 2007, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 
                                                                         2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2000, 2001, 2002, 
                                                                         2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 
                                                                         2014, 2015, 2016, 2017, 2017, 2009, 2010, 2011, 2006, 2007, 2008, 
                                                                         2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2006, 2007, 
                                                                         2008, 2009, 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2016, 2017, 
                                                                         2007, 2008, 2009), DirectorName = c("Steven Paladino", "Steven Paladino", 
                                                                                                             "Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser", 
                                                                                                             "Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser", 
                                                                                                             "Louise Koopman Goeser", "Kalen F Holmes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", 
                                                                                                             "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", 
                                                                                                             "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Scott Andrew Bailey", "William (Bill) Milroy Barnum Jr", "William (Bill) Milroy Barnum Jr", 
                                                                                                             "William (Bill) Milroy Barnum Jr", "Thomas (Tom) E Davin", "James (Jim) M Weber", 
                                                                                                             "James (Jim) M Weber", "James (Jim) M Weber", "James (Jim) M Weber", 
                                                                                                             "James (Jim) M Weber", "Ernest R Johnson", "Ernest R Johnson", 
                                                                                                             "Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson", 
                                                                                                             "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", 
                                                                                                             "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", 
                                                                                                             "Travis D Smith", "Travis D Smith", "Travis D Smith", "Travis D Smith", 
                                                                                                             "Travis D Smith", "Michael (Mike) C Kaufmann", "Michael (Mike) C Kaufmann", 
                                                                                                             "David (Dave) M DeMattei", "David (Dave) M DeMattei", "David (Dave) M DeMattei"
                                                                         ), DirectorID = c("1000169302", "1000169302", "1058973478", "1058973478", 
                                                                                           "1058973478", "1058973478", "1058973478", "1058973478", "1058973478", 
                                                                                           "11051172801", "11275933344", "11275933344", "11275933344", "11275933344", 
                                                                                           "11275933344", "11275933344", "11275933344", "11275933344", "11434863691", 
                                                                                           "11434863691", "11434863691", "11434863691", "11434863691", "11434863691", 
                                                                                           "11434863691", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223122984", 
                                                                                           "1223122984", "1223122984", "1223122984", "1223122984", "1223122984", 
                                                                                           "1223122984", "1223122984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "174488610522", "20462211719", "20462211719", "20462211719", 
                                                                                           "2247441792", "3581636766", "3581636766", "3581636766", "3581636766", 
                                                                                           "3581636766", "40425210975", "40425210975", "40425210975", "40425210975", 
                                                                                           "40425210975", "40425210975", "4842568996", "4842568996", "4842568996", 
                                                                                           "4842568996", "4842568996", "4842568996", "53006212569", "53006212569", 
                                                                                           "53006212569", "53006212569", "53006212569", "5532705122", "5532705122", 
                                                                                           "759047198", "759047198", "759047198"))

Actually I want to filter out those DIRECTOR_ID that are same from previous year t-1 to current year t

I run the following code to create group -

ceo1 %>%
  group_by(ISIN, YEAR) %>% 
  mutate(
    GROUP_ID = cur_group_id()
  )


# A tibble: 38 x 6
# Groups:   ISIN, YEAR [12]
     ROW ISIN         YEAR DIRECTOR_NAME             DIRECTOR_ID GROUP_ID
   <dbl> <chr>       <dbl> <chr>                           <dbl>    <int>
 1     1 US98981710~  2006 Thomas (Tom) E Davin       2247441792        1
 2     2 US98981710~  2006 Matthew (Matt) L Hyde      4842568996        1
 3     3 US98981710~  2007 James (Jim) M Weber        3581636766        2
 4     4 US98981710~  2007 Matthew (Matt) L Hyde      4842568996        2
 5     5 US98981710~  2007 David (Dave) M DeMattei     759047198        2
 6     6 US98981710~  2008 James (Jim) M Weber        3581636766        3
 7     7 US98981710~  2008 Matthew (Matt) L Hyde      4842568996        3
 8     8 US98981710~  2008 David (Dave) M DeMattei     759047198        3
 9     9 US98981710~  2009 William (Bill) Milroy Ba~ 20462211719        4
10    10 US98981710~  2009 James (Jim) M Weber        3581636766        4

Then I do not know how to filter out those DIRECTOR_ID that are same from year to year. For example, for above outupt - DIRECTOR_ID - 4842568996 is in 2006, 2007, 2008 and DIRECTOR_ID - 3581636766 is in 2007, 2008, 2009. I want to filter them out who are present previous year ( t-1 ) and current year ( t ). Note that I have to keep all variables in the new data set. Thanks

The expected output until year 2012 of sample data will look like this -

A tibble: 16 x 5
     ROW ISIN          YEAR DIRECTOR_NAME                   DIRECTOR_ID
   <dbl> <chr>        <dbl> <chr>                                 <dbl>
 1     3 US9898171015  2007 James (Jim) M Weber              3581636766
 2     4 US9898171015  2007 Matthew (Matt) L Hyde            4842568996
 3     5 US9898171015  2007 David (Dave) M DeMattei           759047198
 4     6 US9898171015  2008 James (Jim) M Weber              3581636766
 5     7 US9898171015  2008 Matthew (Matt) L Hyde            4842568996
 6     8 US9898171015  2008 David (Dave) M DeMattei           759047198
 7     9 US9898171015  2009 William (Bill) Milroy Barnum Jr 20462211719
 8    10 US9898171015  2009 James (Jim) M Weber              3581636766
 9    11 US9898171015  2009 Matthew (Matt) L Hyde            4842568996
10    13 US9898171015  2010 William (Bill) Milroy Barnum Jr 20462211719
11    14 US9898171015  2010 James (Jim) M Weber              3581636766
12    15 US9898171015  2010 Matthew (Matt) L Hyde            4842568996
13    16 US9898171015  2011 Sarah (Sally) Gaines McCoy      11434863691
14    17 US9898171015  2011 William (Bill) Milroy Barnum Jr 20462211719
15    19 US9898171015  2011 Matthew (Matt) L Hyde            4842568996
16    20 US9898171015  2012 Sarah (Sally) Gaines McCoy      11434863691

I think this is what you need:

library(dplyr)
library(tidyr)
dat %>% 
  complete(DIRECTOR_ID, nesting(YEAR)) %>% 
  arrange(DIRECTOR_ID, YEAR) %>% 
  mutate(DIRECTOR_ID = case_when(is.na(ROW) ~ NA_real_, 
                                 TRUE ~ DIRECTOR_ID), 
         inprev = DIRECTOR_ID == lag(DIRECTOR_ID)) %>% 
  filter(inprev) %>% 
  na.omit() %>% 
  select(-inprev) %>% 
  arrange(ROW)

# # A tibble: 28 x 5
#   DIRECTOR_ID  YEAR   ROW ISIN       DIRECTOR_NAME          
#         <dbl> <dbl> <dbl> <chr>      <chr>                  
# 1  4842568996  2007     4 US9898171… Matthew (Matt) L Hyde  
# 2  3581636766  2008     6 US9898171… James (Jim) M Weber    
# 3  4842568996  2008     7 US9898171… Matthew (Matt) L Hyde  
# 4   759047198  2008     8 US9898171… David (Dave) M DeMattei
# 5  3581636766  2009    10 US9898171… James (Jim) M Weber    
# 6  4842568996  2009    11 US9898171… Matthew (Matt) L Hyde  
# 7   759047198  2009    12 US9898171… David (Dave) M DeMattei
# 8 20462211719  2010    13 US9898171… William (Bill) Milroy …
# 9  3581636766  2010    14 US9898171… James (Jim) M Weber    
# 10  4842568996  2010    15 US9898171… Matthew (Matt) L Hyde  

However, I'm not sure because I think the expected output above is still wrong. Only Davin and Hyde are present in 2006, so the only people who should be able to be present in 2007 are either Davin or Hyde. Since Davin is not present in 2007, he does not show up in the output above. If I've misunderstood, happy to reconsider the answer.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM