简体   繁体   中英

Exclude rows where value used in another row

Imagine you have the following data set:

# Example data: 20 rows with a running ID, a gender code (1 or 2) and a
# pair identifier PID that is shared by each two consecutive rows.
df <- data.frame(
  ID = as.numeric(1:20),
  gender = c(1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2),
  PID = as.numeric(rep(1:10, each = 2))
)

How can I write code that removes the rows of df whose gender and PID are both the same as in another row (see picture)? Please imagine that the data set has over 1000 rows, so the solution should find the rows to exclude automatically.


base R

# Keep only the rows whose (gender, PID) combination occurs exactly once.
# ave() evaluates FUN once per gender/PID group and recycles the single
# logical result onto every row of that group; a group with more than one
# row contains a duplicated TRUE, so all of its rows are flagged FALSE.
# The grouping columns are selected by the names used in df (gender, PID).
df[ave(rep(TRUE, nrow(df)), df[, c("gender", "PID")],
       FUN = function(z) !any(duplicated(z))), ]
#    ID gender PID
# 1   1      1   1
# 2   2      2   1
# 3   3      1   2
# 4   4      2   2
# 7   7      2   4
# 8   8      1   4
# 9   9      1   5
# 10 10      2   5
# 11 11      1   6
# 12 12      2   6
# 13 13      1   7
# 14 14      2   7
# 17 17      2   9
# 18 18      1   9
# 19 19      1  10
# 20 20      2  10


# Within each gender/PID group every row carries the same key, so any group
# with more than one row contains a duplicate and is dropped as a whole.
# The pipeline ends with ungroup() so the result is a plain (ungrouped) table.
df %>%
  group_by(gender, PID) %>%
  filter(!any(duplicated(cbind(gender, PID)))) %>%
  ungroup()

In base R, we may use subset to remove the observations where the group count for the combination of 'gender' and the pair ID column is not 1

# Count the rows in each gender/PID group (ave() with FUN = length) and keep
# only the rows that belong to a group of size one.
subset(df, ave(seq_along(gender), gender, PID, FUN = length) == 1)

Or with duplicated

# A row is kept only if its key columns (everything except the first column)
# are flagged as duplicated neither when scanning from the top nor when
# scanning from the bottom, i.e. the key occurs exactly once.
df[!duplicated(df[-1]) & !duplicated(df[-1], fromLast = TRUE), ]


   ID gender PID
1   1      1   1
2   2      2   1
3   3      1   2
4   4      2   2
7   7      2   4
8   8      1   4
9   9      1   5
10 10      2   5
11 11      1   6
12 12      2   6
13 13      1   7
14 14      2   7
17 17      2   9
18 18      1   9
19 19      1  10
20 20      2  10

Using aggregate

# For every gender/PID group keep the remaining column's value only when the
# group has a single row; larger groups yield NA and are then dropped by
# na.omit().
na.omit(
  aggregate(
    . ~ gender + PID,
    data = df,
    FUN = function(x) if (length(x) == 1) x else NA
  )
)
   gender PID ID
1       1   1  1
2       2   1  2
3       1   2  3
4       2   2  4
6       1   4  8
7       2   4  7
8       1   5  9
9       2   5 10
10      1   6 11
11      2   6 12
12      1   7 13
13      2   7 14
15      1   9 18
16      2   9 17
17      1  10 19
18      2  10 20

With dplyr


# Keep only the gender/PID groups that consist of exactly one row, then drop
# the grouping so the result prints as a plain tibble.
df %>%
  group_by(gender, PID) %>%
  filter(n() == 1) %>%
  ungroup()
# A tibble: 16 × 3
      ID gender   PID
   <dbl>  <dbl> <dbl>
 1     1      1     1
 2     2      2     1
 3     3      1     2
 4     4      2     2
 5     7      2     4
 6     8      1     4
 7     9      1     5
 8    10      2     5
 9    11      1     6
10    12      2     6
11    13      1     7
12    14      2     7
13    17      2     9
14    18      1     9
15    19      1    10
16    20      2    10

Another dplyr option could be:

# Collapse gender and PID into one key per row; the explicit separator keeps
# distinct pairs distinct (without it, 1/12 and 11/2 would both give "112").
# rle() measures runs of equal keys, so only duplicates on consecutive rows
# are removed here.
df %>%
  filter(with(rle(paste(gender, PID, sep = "_")), rep(lengths == 1, lengths)))

   ID gender PID
1   1      1   1
2   2      2   1
3   3      1   2
4   4      2   2
5   7      2   4
6   8      1   4
7   9      1   5
8  10      2   5
9  11      1   6
10 12      2   6
11 13      1   7
12 14      2   7
13 17      2   9
14 18      1   9
15 19      1  10
16 20      2  10

If the duplicated values can also occur in non-consecutive rows:

# Sort by the key columns first so that equal gender/PID pairs become
# adjacent, then keep only the runs of length one. The explicit separator
# keeps distinct pairs distinct (without it, 1/12 and 11/2 would both give
# "112"). Note that the result is returned in sorted order.
df %>%
  arrange(gender, PID) %>%
  filter(with(rle(paste(gender, PID, sep = "_")), rep(lengths == 1, lengths)))

Here is one more: :-)

# Drop every gender/PID group that has more than one row; the remaining
# rows are exactly those whose key combination is unique.
df %>%
  group_by(gender, PID) %>%
  filter(n() <= 1)
     ID gender   PID
   <dbl>  <dbl> <dbl>
 1     1      1     1
 2     2      2     1
 3     3      1     2
 4     4      2     2
 5     7      2     4
 6     8      1     4
 7     9      1     5
 8    10      2     5
 9    11      1     6
10    12      2     6
11    13      1     7
12    14      2     7
13    17      2     9
14    18      1     9
15    19      1    10
16    20      2    10

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

粤ICP备18138465号  © 2020-2024 STACKOOM.COM