繁体   English   中英

使用 dplyr 比较各组并筛选出共同的观测值

[英]comparing groups and filter common observations using dplyr

这是一个示例数据集 -

data.frame(ISIN = c("US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US9898171015", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US5535301064", "US5535301064", 
                               "US5535301064", "US5535301064", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US9898171015", "US9898171015", "US9898171015", 
                               "US9898171015", "US5535301064", "US5535301064", "US9898171015", 
                               "US9898171015", "US9898171015"), year = c(2016, 2017, 2009, 2010, 
                                                                         2011, 2012, 2013, 2014, 2015, 2015, 2010, 2011, 2012, 2013, 2014, 
                                                                         2015, 2016, 2017, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2000, 
                                                                         2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 
                                                                         2012, 2013, 2014, 2015, 2016, 2017, 2000, 2001, 2002, 2003, 2004, 
                                                                         2005, 2006, 2007, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 
                                                                         2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2000, 2001, 2002, 
                                                                         2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 
                                                                         2014, 2015, 2016, 2017, 2017, 2009, 2010, 2011, 2006, 2007, 2008, 
                                                                         2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2006, 2007, 
                                                                         2008, 2009, 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2016, 2017, 
                                                                         2007, 2008, 2009), DirectorName = c("Steven Paladino", "Steven Paladino", 
                                                                                                             "Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser", 
                                                                                                             "Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser", 
                                                                                                             "Louise Koopman Goeser", "Kalen F Holmes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", 
                                                                                                             "Doctor Jonathan L Byrnes", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", 
                                                                                                             "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", 
                                                                                                             "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly", 
                                                                                                             "Denis F Kelly", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller", 
                                                                                                             "Scott Andrew Bailey", "William (Bill) Milroy Barnum Jr", "William (Bill) Milroy Barnum Jr", 
                                                                                                             "William (Bill) Milroy Barnum Jr", "Thomas (Tom) E Davin", "James (Jim) M Weber", 
                                                                                                             "James (Jim) M Weber", "James (Jim) M Weber", "James (Jim) M Weber", 
                                                                                                             "James (Jim) M Weber", "Ernest R Johnson", "Ernest R Johnson", 
                                                                                                             "Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson", 
                                                                                                             "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", 
                                                                                                             "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", 
                                                                                                             "Travis D Smith", "Travis D Smith", "Travis D Smith", "Travis D Smith", 
                                                                                                             "Travis D Smith", "Michael (Mike) C Kaufmann", "Michael (Mike) C Kaufmann", 
                                                                                                             "David (Dave) M DeMattei", "David (Dave) M DeMattei", "David (Dave) M DeMattei"
                                                                         ), DirectorID = c("1000169302", "1000169302", "1058973478", "1058973478", 
                                                                                           "1058973478", "1058973478", "1058973478", "1058973478", "1058973478", 
                                                                                           "11051172801", "11275933344", "11275933344", "11275933344", "11275933344", 
                                                                                           "11275933344", "11275933344", "11275933344", "11275933344", "11434863691", 
                                                                                           "11434863691", "11434863691", "11434863691", "11434863691", "11434863691", 
                                                                                           "11434863691", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223062984", 
                                                                                           "1223062984", "1223062984", "1223062984", "1223062984", "1223122984", 
                                                                                           "1223122984", "1223122984", "1223122984", "1223122984", "1223122984", 
                                                                                           "1223122984", "1223122984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223392984", "1223392984", 
                                                                                           "1223392984", "1223392984", "1223392984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "1223552984", "1223552984", "1223552984", "1223552984", 
                                                                                           "1223552984", "174488610522", "20462211719", "20462211719", "20462211719", 
                                                                                           "2247441792", "3581636766", "3581636766", "3581636766", "3581636766", 
                                                                                           "3581636766", "40425210975", "40425210975", "40425210975", "40425210975", 
                                                                                           "40425210975", "40425210975", "4842568996", "4842568996", "4842568996", 
                                                                                           "4842568996", "4842568996", "4842568996", "53006212569", "53006212569", 
                                                                                           "53006212569", "53006212569", "53006212569", "5532705122", "5532705122", 
                                                                                           "759047198", "759047198", "759047198"))

实际上,我想筛选出(保留)那些在上一年(t-1)和当年(t)都出现的 DIRECTOR_ID。

我运行以下代码来创建组 -

# Number each (ISIN, YEAR) combination: rows that share both values receive
# the same integer GROUP_ID. The result is still grouped by ISIN and YEAR.
mutate(
  group_by(ceo1, ISIN, YEAR),
  GROUP_ID = cur_group_id()
)


# A tibble: 38 x 6
# Groups:   ISIN, YEAR [12]
     ROW ISIN         YEAR DIRECTOR_NAME             DIRECTOR_ID GROUP_ID
   <dbl> <chr>       <dbl> <chr>                           <dbl>    <int>
 1     1 US98981710~  2006 Thomas (Tom) E Davin       2247441792        1
 2     2 US98981710~  2006 Matthew (Matt) L Hyde      4842568996        1
 3     3 US98981710~  2007 James (Jim) M Weber        3581636766        2
 4     4 US98981710~  2007 Matthew (Matt) L Hyde      4842568996        2
 5     5 US98981710~  2007 David (Dave) M DeMattei     759047198        2
 6     6 US98981710~  2008 James (Jim) M Weber        3581636766        3
 7     7 US98981710~  2008 Matthew (Matt) L Hyde      4842568996        3
 8     8 US98981710~  2008 David (Dave) M DeMattei     759047198        3
 9     9 US98981710~  2009 William (Bill) Milroy Ba~ 20462211719        4
10    10 US98981710~  2009 James (Jim) M Weber        3581636766        4

然后我不知道如何筛选出那些在相邻年份中重复出现的 DIRECTOR_ID。例如,在上述输出中,DIRECTOR_ID 4842568996 出现在 2006、2007、2008 年,DIRECTOR_ID 3581636766 出现在 2007、2008、2009 年。我想保留这些 DIRECTOR_ID 在上一年(t-1)和当年(t)的记录。请注意,新数据集中必须保留所有变量。谢谢。

对于截至 2012 年的示例数据,预期输出如下所示:

A tibble: 16 x 5
     ROW ISIN          YEAR DIRECTOR_NAME                   DIRECTOR_ID
   <dbl> <chr>        <dbl> <chr>                                 <dbl>
 1     3 US9898171015  2007 James (Jim) M Weber              3581636766
 2     4 US9898171015  2007 Matthew (Matt) L Hyde            4842568996
 3     5 US9898171015  2007 David (Dave) M DeMattei           759047198
 4     6 US9898171015  2008 James (Jim) M Weber              3581636766
 5     7 US9898171015  2008 Matthew (Matt) L Hyde            4842568996
 6     8 US9898171015  2008 David (Dave) M DeMattei           759047198
 7     9 US9898171015  2009 William (Bill) Milroy Barnum Jr 20462211719
 8    10 US9898171015  2009 James (Jim) M Weber              3581636766
 9    11 US9898171015  2009 Matthew (Matt) L Hyde            4842568996
10    13 US9898171015  2010 William (Bill) Milroy Barnum Jr 20462211719
11    14 US9898171015  2010 James (Jim) M Weber              3581636766
12    15 US9898171015  2010 Matthew (Matt) L Hyde            4842568996
13    16 US9898171015  2011 Sarah (Sally) Gaines McCoy      11434863691
14    17 US9898171015  2011 William (Bill) Milroy Barnum Jr 20462211719
15    19 US9898171015  2011 Matthew (Matt) L Hyde            4842568996
16    20 US9898171015  2012 Sarah (Sally) Gaines McCoy      11434863691

我认为这是你需要的:

library(dplyr)
library(tidyr)
# Keep only the rows whose DIRECTOR_ID was also present in the preceding
# YEAR value of the data, then return them in their original ROW order.
# NOTE(review): "preceding" means the previous YEAR value that occurs in
# `dat`, not necessarily calendar year t-1 -- confirm the years are contiguous.
# NOTE(review): ISIN is not part of the comparison -- confirm a director
# cannot appear under several ISINs in the same data.
dat %>% 
  # Build the full DIRECTOR_ID x YEAR grid; combinations that were not in the
  # data become filler rows with NA in every other column (including ROW).
  complete(DIRECTOR_ID, nesting(YEAR)) %>% 
  # Put each director's years on consecutive rows so lag() sees the prior year.
  arrange(DIRECTOR_ID, YEAR) %>% 
  # Blank the id on filler rows (ROW is NA there -- assumes ROW is never NA in
  # the original data), so a filler year can never count as a match.
  mutate(DIRECTOR_ID = case_when(is.na(ROW) ~ NA_real_, 
                                 TRUE ~ DIRECTOR_ID), 
         # TRUE only when this row and the previous row hold the same
         # non-missing id; NA when either side is missing. No grouping is
         # needed: at a director boundary the two ids differ.
         inprev = DIRECTOR_ID == lag(DIRECTOR_ID)) %>% 
  # filter() keeps TRUE only, dropping both FALSE and NA.
  filter(inprev) %>% 
  # Drop any remaining row that still has an NA in some column.
  na.omit() %>% 
  # Remove the helper column and restore the original row order.
  select(-inprev) %>% 
  arrange(ROW)

# # A tibble: 28 x 5
#   DIRECTOR_ID  YEAR   ROW ISIN       DIRECTOR_NAME          
#         <dbl> <dbl> <dbl> <chr>      <chr>                  
# 1  4842568996  2007     4 US9898171… Matthew (Matt) L Hyde  
# 2  3581636766  2008     6 US9898171… James (Jim) M Weber    
# 3  4842568996  2008     7 US9898171… Matthew (Matt) L Hyde  
# 4   759047198  2008     8 US9898171… David (Dave) M DeMattei
# 5  3581636766  2009    10 US9898171… James (Jim) M Weber    
# 6  4842568996  2009    11 US9898171… Matthew (Matt) L Hyde  
# 7   759047198  2009    12 US9898171… David (Dave) M DeMattei
# 8 20462211719  2010    13 US9898171… William (Bill) Milroy …
# 9  3581636766  2010    14 US9898171… James (Jim) M Weber    
# 10  4842568996  2010    15 US9898171… Matthew (Matt) L Hyde  

不过我并不确定,因为我认为上面给出的预期输出仍然有误。2006 年的数据中只有 Davin 和 Hyde,所以 2007 年能够被保留的只可能是 Davin 或 Hyde。由于 Davin 在 2007 年没有出现,因此他不在上面的输出中。如果我理解有误,我很乐意重新考虑这个答案。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM