Filtering conditional on lag values in R

Question

df is a dataframe where each row is a pair of items (from item1 & item2 ).

I want to keep the 1st row of the dataframe, and then repeatedly keep the first subsequent row whose item1 equals the item2 of the most recently kept row. So I expect my data to look like output .

I would prefer a tidy (or purrr ) way of doing so but open to any suggestions.

# Example data: 24 (item1, item2) pairs in a tibble-classed data frame.
df <- structure(
  list(
    item1 = rep(1:7, times = c(5L, 5L, 5L, 4L, 2L, 2L, 1L)),
    item2 = c(4:8, 4:8, 4:8, 5:8, 7:8, 7:8, 8L)
  ),
  row.names = c(NA, -24L),
  class = c("tbl_df", "tbl", "data.frame")
)
df
#>    item1 item2
#> 1      1     4
#> 2      1     5
#> 3      1     6
#> 4      1     7
#> 5      1     8
#> 6      2     4
#> 7      2     5
#> 8      2     6
#> 9      2     7
#> 10     2     8
#> 11     3     4
#> 12     3     5
#> 13     3     6
#> 14     3     7
#> 15     3     8
#> 16     4     5
#> 17     4     6
#> 18     4     7
#> 19     4     8
#> 20     5     7
#> 21     5     8
#> 22     6     7
#> 23     6     8
#> 24     7     8

# Expected result: each kept row's item1 equals the item2 of the row kept
# just before it (1 -> 4 -> 5 -> 7 -> 8).
output <- data.frame(
  item1 = c(1, 4, 5, 7),
  item2 = c(4, 5, 7, 8)
)
output
#>   item1 item2
#> 1     1     4
#> 2     4     5
#> 3     5     7
#> 4     7     8

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

Answer 1

Here's a solution using the tidyverse.

Using a lag(..., default = 1) ensures we also output the first row.

library(tidyverse)

df <- tibble(
  item1 = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 7L),
  item2 = c(4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 5L, 6L, 7L, 8L, 7L, 8L, 7L, 8L, 8L)
)

# Scan the rows top to bottom while carrying the item2 of the most recently
# kept row: chain_target[k] is the value the next kept row's item1 must equal
# once rows 1..k have been processed. Comparing against the previous *kept*
# row (rather than simply the previous row or group) is what makes the chain
# correct for arbitrary data.
# lag(..., default = first(item1)) makes the first row compare equal to
# itself, so it is always kept without hard-coding its value.
df %>%
  mutate(
    chain_target = Reduce(
      function(target, i) {
        # isTRUE() treats an NA comparison as "no match", like filter() does.
        if (isTRUE(item1[i] == target)) item2[i] else target
      },
      x = seq_along(item1)[-1],
      init = first(item2),
      accumulate = TRUE
    )
  ) %>%
  filter(item1 == lag(chain_target, default = first(item1))) %>%
  select(-chain_target)
#> # A tibble: 4 × 2
#>   item1 item2
#>   <int> <int>
#> 1     1     4
#> 2     4     5
#> 3     5     7
#> 4     7     8

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

Answer 2

This is probably not what you were looking for (not a very tidy solution), but it yields the desired output. Call it with the first row as the starting point, e.g. my_filter(head(df, 1), df) .

library(tidyverse)

# Example data as a plain data.frame: 24 (item1, item2) pairs.
df <- data.frame(
  item1 = rep(1:7, times = c(5L, 5L, 5L, 4L, 2L, 2L, 1L)),
  item2 = c(4:8, 4:8, 4:8, 5:8, 7:8, 7:8, 8L)
)

# Follow the chain of rows starting from the rows already in `df_to_find`.
#
# df_to_find: data frame of rows kept so far; its last row's item2 is the
#             value searched for next.
# df_orig:    the full data frame that is searched for matching rows.
#
# Returns the kept rows with every newly found row appended, in the order
# in which they were found.
my_filter <- function(df_to_find, df_orig) {
  # The next row to keep is the first one whose item1 equals the item2 of
  # the most recently kept row.
  value_to_find <- tail(df_to_find, 1)$item2
  candidates <- filter(df_orig, item1 == value_to_find)
  df_found <- head(candidates, 1)

  # Base case: nothing continues the chain, so return what was collected.
  if (nrow(df_found) == 0) {
    return(bind_rows(df_to_find))
  }

  # Recursive case: append the match and search again from it.
  Recall(bind_rows(df_to_find, df_found), df_orig)
}

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

Answer 3

This won't be directly vectorizable--I would do it with a simple for loop. This will almost certainly be faster than a recursive solution for any sizable data.

# Walk the rows once, keeping the first row and then every row whose item1
# equals the item2 of the most recently kept row.
n_rows <- nrow(df)
keep <- logical(n_rows)
if (n_rows > 0L) {
  keep[1] <- TRUE
  target <- df$item2[1]
  # seq_len(n)[-1] is empty for a one-row df, whereas 2:nrow(df) would count
  # down (2, 1) and index past the last row.
  for (i in seq_len(n_rows)[-1]) {
    if (df$item1[i] == target) {
      keep[i] <- TRUE
      # The next kept row must start where this one ends.
      target <- df$item2[i]
    }
  }
}
result <- df[keep, ]
result
# # A tibble: 4 × 2
#   item1 item2
#   <int> <int>
# 1     1     4
# 2     4     5
# 3     5     7
# 4     7     8

Answer 4

Here is another untidy and recursive solution:

# Recursively collect the chain of rows in which each row's item1 equals the
# item2 of the previously kept row, starting from the first row of `x`.
#
# x: data frame with columns item1 and item2.
#
# Returns a data frame holding the kept rows (original row names preserved
# for a plain data.frame); a zero-row input is returned unchanged.
last2current <- function(x) {
  if (nrow(x) == 0L) {
    return(x)
  }
  first <- x[1, ]
  # Search only the rows after the first one: if row 1 itself has
  # item1 == item2 it must not match itself, or the recursion never ends.
  first_match <- match(x$item2[1], x$item1[-1]) + 1L
  if (is.na(first_match)) {
    return(first)
  }
  other <- x[first_match:nrow(x), ]
  rbind(first, last2current(other))
}

last2current(df)

   item1 item2
1      1     4
16     4     5
20     5     7
24     7     8

Explanation:

This is a recursive function, meaning that it calls itself. It stores the first row, then looks for the first match of item2[1] in item1 and stores the row number in first_match . If there is no match ( first_match is NA ), we are done, so it returns the first row. If there is a match, it repeats the same procedure on the rows from the first match to the end of the data frame. Finally, it rbind s all the rows together.

Note that this will fail if there is a row where item1 == item2 since item1[1] is included in match .

Answer 5

A base R recursion:

# Recursively collect the row indices of the chain that starts at `row`:
# each next index is the first row whose first column equals the second
# column of the current row.
#
# df:      data frame (or tibble / list of two vectors); column 1 holds the
#          "from" values and column 2 the "to" values.
# row:     index of the row to start from; NA yields an empty result.
# visited: indices already on the chain. Used internally to stop when the
#          data loops back on itself; callers normally leave the default.
#
# Returns the vector of row indices on the chain, starting with `row`.
relation <- function(df, row, visited = row[0L]) {
  # Stop at the end of the chain, or when a row comes up a second time
  # (cyclic data would otherwise recurse forever).
  if (is.na(row) || row %in% visited) {
    return(row[0L])
  }
  # `[[` extracts plain vectors; `df[row, 2]` and `df[, 1]` stay tibbles for
  # tibble input, which makes match() return NA.
  next_row <- match(df[[2]][row], df[[1]])
  c(row, relation(df, next_row, c(visited, row)))
}

# Starting at row 1
df[relation(df, 1), ]

  item1 item2
1      1     4
16     4     5
20     5     7
24     7     8

# Starting at row 2
df[relation(df, 2), ]
   item1 item2
2      1     5
20     5     7
24     7     8

# Starting at row 4
df[relation(df, 4), ]
   item1 item2
4      1     7
24     7     8

Filtering conditional on lag values in R

Question

5 answers

solution1
1 2022-09-22 15:43:24

solution2
0 2022-09-22 15:01:19

solution3
0 2022-09-22 15:40:32

solution4
0 2022-09-22 15:46:16

Explanation:

solution5
0 2022-09-22 15:59:51

Filtering conditional on lag values in R

Question

5 answers

solution1 1 2022-09-22 15:43:24

solution2 0 2022-09-22 15:01:19

solution3 0 2022-09-22 15:40:32

solution4 0 2022-09-22 15:46:16

Explanation:

solution5 0 2022-09-22 15:59:51

solution1
1 2022-09-22 15:43:24

solution2
0 2022-09-22 15:01:19

solution3
0 2022-09-22 15:40:32

solution4
0 2022-09-22 15:46:16

solution5
0 2022-09-22 15:59:51