[英]Remove groups based on number of observations below a certain value using dplyr
[英]comparing groups and filter common observations using dplyr
这是一个示例数据集 -
data.frame(ISIN = c("US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US9898171015", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US5535301064", "US5535301064",
"US5535301064", "US5535301064", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US9898171015", "US9898171015", "US9898171015",
"US9898171015", "US5535301064", "US5535301064", "US9898171015",
"US9898171015", "US9898171015"), year = c(2016, 2017, 2009, 2010,
2011, 2012, 2013, 2014, 2015, 2015, 2010, 2011, 2012, 2013, 2014,
2015, 2016, 2017, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2000, 2001, 2002, 2003, 2004,
2005, 2006, 2007, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2000, 2001, 2002,
2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
2014, 2015, 2016, 2017, 2017, 2009, 2010, 2011, 2006, 2007, 2008,
2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2006, 2007,
2008, 2009, 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2016, 2017,
2007, 2008, 2009), DirectorName = c("Steven Paladino", "Steven Paladino",
"Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser",
"Louise Koopman Goeser", "Louise Koopman Goeser", "Louise Koopman Goeser",
"Louise Koopman Goeser", "Kalen F Holmes", "Doctor Jonathan L Byrnes",
"Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes",
"Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes", "Doctor Jonathan L Byrnes",
"Doctor Jonathan L Byrnes", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy",
"Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy",
"Sarah (Sally) Gaines McCoy", "Sarah (Sally) Gaines McCoy", "Denis F Kelly",
"Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly",
"Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly",
"Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly",
"Denis F Kelly", "Denis F Kelly", "Denis F Kelly", "Denis F Kelly",
"Denis F Kelly", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton",
"Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton",
"Raymond (Ray) B Langton", "Raymond (Ray) B Langton", "Raymond (Ray) B Langton",
"Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin",
"Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin",
"Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin",
"Roger B Fradin", "Roger B Fradin", "Roger B Fradin", "Roger B Fradin",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Philip (Phil) R Peller", "Philip (Phil) R Peller", "Philip (Phil) R Peller",
"Scott Andrew Bailey", "William (Bill) Milroy Barnum Jr", "William (Bill) Milroy Barnum Jr",
"William (Bill) Milroy Barnum Jr", "Thomas (Tom) E Davin", "James (Jim) M Weber",
"James (Jim) M Weber", "James (Jim) M Weber", "James (Jim) M Weber",
"James (Jim) M Weber", "Ernest R Johnson", "Ernest R Johnson",
"Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson", "Ernest R Johnson",
"Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde",
"Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde", "Matthew (Matt) L Hyde",
"Travis D Smith", "Travis D Smith", "Travis D Smith", "Travis D Smith",
"Travis D Smith", "Michael (Mike) C Kaufmann", "Michael (Mike) C Kaufmann",
"David (Dave) M DeMattei", "David (Dave) M DeMattei", "David (Dave) M DeMattei"
), DirectorID = c("1000169302", "1000169302", "1058973478", "1058973478",
"1058973478", "1058973478", "1058973478", "1058973478", "1058973478",
"11051172801", "11275933344", "11275933344", "11275933344", "11275933344",
"11275933344", "11275933344", "11275933344", "11275933344", "11434863691",
"11434863691", "11434863691", "11434863691", "11434863691", "11434863691",
"11434863691", "1223062984", "1223062984", "1223062984", "1223062984",
"1223062984", "1223062984", "1223062984", "1223062984", "1223062984",
"1223062984", "1223062984", "1223062984", "1223062984", "1223062984",
"1223062984", "1223062984", "1223062984", "1223062984", "1223122984",
"1223122984", "1223122984", "1223122984", "1223122984", "1223122984",
"1223122984", "1223122984", "1223392984", "1223392984", "1223392984",
"1223392984", "1223392984", "1223392984", "1223392984", "1223392984",
"1223392984", "1223392984", "1223392984", "1223392984", "1223392984",
"1223392984", "1223392984", "1223392984", "1223552984", "1223552984",
"1223552984", "1223552984", "1223552984", "1223552984", "1223552984",
"1223552984", "1223552984", "1223552984", "1223552984", "1223552984",
"1223552984", "1223552984", "1223552984", "1223552984", "1223552984",
"1223552984", "174488610522", "20462211719", "20462211719", "20462211719",
"2247441792", "3581636766", "3581636766", "3581636766", "3581636766",
"3581636766", "40425210975", "40425210975", "40425210975", "40425210975",
"40425210975", "40425210975", "4842568996", "4842568996", "4842568996",
"4842568996", "4842568996", "4842568996", "53006212569", "53006212569",
"53006212569", "53006212569", "53006212569", "5532705122", "5532705122",
"759047198", "759047198", "759047198"))
实际上,我想筛选出那些在上一年(t-1)和当年(t)都出现的 DIRECTOR_ID。
我运行以下代码来创建组 -
# Tag every row with the integer id of its (ISIN, YEAR) group. The result is
# left grouped by ISIN and YEAR.
mutate(
  group_by(ceo1, ISIN, YEAR),
  GROUP_ID = cur_group_id()
)
# A tibble: 38 x 6
# Groups: ISIN, YEAR [12]
ROW ISIN YEAR DIRECTOR_NAME DIRECTOR_ID GROUP_ID
<dbl> <chr> <dbl> <chr> <dbl> <int>
1 1 US98981710~ 2006 Thomas (Tom) E Davin 2247441792 1
2 2 US98981710~ 2006 Matthew (Matt) L Hyde 4842568996 1
3 3 US98981710~ 2007 James (Jim) M Weber 3581636766 2
4 4 US98981710~ 2007 Matthew (Matt) L Hyde 4842568996 2
5 5 US98981710~ 2007 David (Dave) M DeMattei 759047198 2
6 6 US98981710~ 2008 James (Jim) M Weber 3581636766 3
7 7 US98981710~ 2008 Matthew (Matt) L Hyde 4842568996 3
8 8 US98981710~ 2008 David (Dave) M DeMattei 759047198 3
9 9 US98981710~ 2009 William (Bill) Milroy Ba~ 20462211719 4
10 10 US98981710~ 2009 James (Jim) M Weber 3581636766 4
但我不知道如何筛选出那些在相邻年份中重复出现的 DIRECTOR_ID。例如,在上述输出中,DIRECTOR_ID 4842568996 出现在 2006、2007、2008 年,DIRECTOR_ID 3581636766 出现在 2007、2008、2009 年。我想根据上一年(t-1)和当年(t)是否都出现来筛选这些董事。请注意,我必须在新数据集中保留所有变量。谢谢。
样本数据(截至 2012 年)的预期输出如下所示:
A tibble: 16 x 5
ROW ISIN YEAR DIRECTOR_NAME DIRECTOR_ID
<dbl> <chr> <dbl> <chr> <dbl>
1 3 US9898171015 2007 James (Jim) M Weber 3581636766
2 4 US9898171015 2007 Matthew (Matt) L Hyde 4842568996
3 5 US9898171015 2007 David (Dave) M DeMattei 759047198
4 6 US9898171015 2008 James (Jim) M Weber 3581636766
5 7 US9898171015 2008 Matthew (Matt) L Hyde 4842568996
6 8 US9898171015 2008 David (Dave) M DeMattei 759047198
7 9 US9898171015 2009 William (Bill) Milroy Barnum Jr 20462211719
8 10 US9898171015 2009 James (Jim) M Weber 3581636766
9 11 US9898171015 2009 Matthew (Matt) L Hyde 4842568996
10 13 US9898171015 2010 William (Bill) Milroy Barnum Jr 20462211719
11 14 US9898171015 2010 James (Jim) M Weber 3581636766
12 15 US9898171015 2010 Matthew (Matt) L Hyde 4842568996
13 16 US9898171015 2011 Sarah (Sally) Gaines McCoy 11434863691
14 17 US9898171015 2011 William (Bill) Milroy Barnum Jr 20462211719
15 19 US9898171015 2011 Matthew (Matt) L Hyde 4842568996
16 20 US9898171015 2012 Sarah (Sally) Gaines McCoy 11434863691
我认为这是你需要的:
library(dplyr)
library(tidyr)
# Keep the rows whose director also appears on the preceding row once the
# data is padded and sorted by DIRECTOR_ID and YEAR, i.e. directors that were
# already present in the previous year found in `dat`.
dat %>%
  # Pad so each DIRECTOR_ID has a row for every YEAR seen in `dat`; padded
  # rows get NA in the remaining columns (ROW included).
  complete(DIRECTOR_ID, nesting(YEAR)) %>%
  arrange(DIRECTOR_ID, YEAR) %>%
  # Blank the id on padded rows so a missing year breaks the run.
  # NOTE(review): uses is.na(ROW) as the "padded row" marker, which assumes
  # ROW is never NA in the original data -- confirm.
  mutate(DIRECTOR_ID = if_else(is.na(ROW), NA_real_, DIRECTOR_ID)) %>%
  # TRUE only when the previous row carries the same (non-missing) director.
  mutate(inprev = DIRECTOR_ID == lag(DIRECTOR_ID)) %>%
  # filter() drops both FALSE and NA, so padded rows and each director's
  # first year are removed here.
  filter(inprev) %>%
  na.omit() %>%
  select(-inprev) %>%
  arrange(ROW)
# # A tibble: 28 x 5
# DIRECTOR_ID YEAR ROW ISIN DIRECTOR_NAME
# <dbl> <dbl> <dbl> <chr> <chr>
# 1 4842568996 2007 4 US9898171… Matthew (Matt) L Hyde
# 2 3581636766 2008 6 US9898171… James (Jim) M Weber
# 3 4842568996 2008 7 US9898171… Matthew (Matt) L Hyde
# 4 759047198 2008 8 US9898171… David (Dave) M DeMattei
# 5 3581636766 2009 10 US9898171… James (Jim) M Weber
# 6 4842568996 2009 11 US9898171… Matthew (Matt) L Hyde
# 7 759047198 2009 12 US9898171… David (Dave) M DeMattei
# 8 20462211719 2010 13 US9898171… William (Bill) Milroy …
# 9 3581636766 2010 14 US9898171… James (Jim) M Weber
# 10 4842568996 2010 15 US9898171… Matthew (Matt) L Hyde
不过我并不确定,因为我认为上面给出的预期输出本身仍有错误。2006 年只有 Davin 和 Hyde 出现,所以 2007 年能够保留下来的只可能是 Davin 或 Hyde。由于 Davin 没有出现在 2007 年,因此他不在上面的输出中。如果我理解有误,我很乐意重新考虑这个答案。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.