简体   繁体   中英

Fill in values between start and end value in R

W (blue line below) in my data.frame represents where the water level in the river intersects the elevation profile.

[image: enter image description here]

In my data.frame, for each group in ID, I need to fill in values between the start and end value (W)

My data

> head(df, 23)
   ID elevation code
1   1       150 <NA>
2   1       140 <NA>
3   1       130    W
4   1       120 <NA>
5   1       110 <NA>
6   1       120 <NA>
7   1       130    W
8   1       140 <NA>
9   1       150 <NA>
10  2        90 <NA>
11  2        80 <NA>
12  2        70 <NA>
13  2        66    W
14  2        60 <NA>
15  2        50 <NA>
16  2        66    W
17  2        70 <NA>
18  2        72 <NA>
19  2        68    W
20  2        65 <NA>
21  2        60 <NA>
22  2        68    W
23  2        70 <NA>

I want the final result to look like below

   ID elevation code
1   1       150 <NA>
2   1       140 <NA>
3   1       130    W
4   1       120    W
5   1       110    W
6   1       120    W
7   1       130    W
8   1       140 <NA>
9   1       150 <NA>
10  2        90 <NA>
11  2        80 <NA>
12  2        70 <NA>
13  2        66    W
14  2        60    W
15  2        50    W
16  2        66    W
17  2        70 <NA>
18  2        72 <NA>
19  2        68    W
20  2        65    W
21  2        60    W
22  2        68    W
23  2        70 <NA>

I tried many things, but my attempts were not successful. Your help will be appreciated.

DATA

> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), elevation = c(150L, 
140L, 130L, 120L, 110L, 120L, 130L, 140L, 150L, 90L, 80L, 70L, 
66L, 60L, 50L, 66L, 70L, 72L, 68L, 65L, 60L, 68L, 70L), code = c(NA, 
NA, "W", NA, NA, NA, "W", NA, NA, NA, NA, NA, "W", NA, NA, "W", 
NA, NA, "W", NA, NA, "W", NA)), class = "data.frame", row.names = c(NA, 
-23L))

You could do:

# Work within each river profile (ID). The running count of non-NA markers is
# odd from a start "W" up to (but not including) the matching end "W", so
# indexing c(NA, "W") by its parity yields "W" inside a span and NA outside;
# coalesce() keeps every marker that is already present.
df %>%
  group_by(ID) %>%
  mutate(code = coalesce(code, c(NA, "W")[cumsum(!is.na(code)) %% 2 + 1])) %>%
  ungroup()  # drop the grouping so later verbs do not silently run per ID

   ID elevation code
1   1       150 <NA>
2   1       140 <NA>
3   1       130    W
4   1       120    W
5   1       110    W
6   1       120    W
7   1       130    W
8   1       140 <NA>
9   1       150 <NA>
10  2        90 <NA>
11  2        80 <NA>
12  2        70 <NA>
13  2        66    W
14  2        60    W
15  2        50    W
16  2        66    W
17  2        70 <NA>
18  2        72 <NA>
19  2        68    W
20  2        65    W
21  2        60    W
22  2        68    W
23  2        70 <NA>

We can try replace + cumsum

# Per ID, count the markers seen so far; while that count is odd the row lies
# between a start marker and its matching end marker, so it is set to "W".
df %>%
  group_by(ID) %>%
  mutate(code = {
    marks_seen <- cumsum(!is.na(code))
    replace(code, marks_seen %% 2 == 1, "W")
  }) %>%
  ungroup()

which gives

# A tibble: 23 x 3
      ID elevation code
   <int>     <int> <chr>
 1     1       150 NA
 2     1       140 NA
 3     1       130 W
 4     1       120 W    
 5     1       110 W
 6     1       120 W
 7     1       130 W
 8     1       140 NA
 9     1       150 NA
10     2        90 NA
# ... with 13 more rows

You can create a helper function that creates a sequence between each start and end and assigns 'W' to it.

#' Fill the span between paired "W" markers
#'
#' The positions where `code` equals "W" are paired in order of appearance
#' (1st with 2nd, 3rd with 4th, ...) and every element from a pair's start to
#' its end is set to "W".
#'
#' @param code A vector of markers in which "W" flags where a span starts or
#'   ends; every other element is left untouched.
#' @return `code` with each start/end span filled with "W". A trailing "W"
#'   that has no partner is kept as is and opens no span.
assign_w <- function(code) {
  inds <- which(code == "W")
  # Only complete pairs define a span. Without this guard an odd number of
  # markers makes Map() recycle the shorter end vector and pair the last
  # start with an earlier end, filling rows outside any real span.
  n_pairs <- length(inds) %/% 2L
  starts <- inds[2L * seq_len(n_pairs) - 1L]
  ends <- inds[2L * seq_len(n_pairs)]
  code[unlist(Map(seq, starts, ends))] <- "W"
  code
}

and apply it to each ID:

library(dplyr)

# Run assign_w() separately within each ID and store the filled markers in a
# new `result` column next to the original `code`.
df %>%
  group_by(ID) %>%
  mutate(result = assign_w(code)) %>%
  ungroup()

#   ID elevation code result
#1   1       150 <NA>   <NA>
#2   1       140 <NA>   <NA>
#3   1       130    W      W
#4   1       120 <NA>      W
#5   1       110 <NA>      W
#6   1       120 <NA>      W
#7   1       130    W      W
#8   1       140 <NA>   <NA>
#9   1       150 <NA>   <NA>
#10  2        90 <NA>   <NA>
#11  2        80 <NA>   <NA>
#12  2        70 <NA>   <NA>
#13  2        66    W      W
#14  2        60 <NA>      W
#15  2        50 <NA>      W
#16  2        66    W      W
#17  2        70 <NA>   <NA>
#18  2        72 <NA>   <NA>
#19  2        68    W      W
#20  2        65 <NA>      W
#21  2        60 <NA>      W
#22  2        68    W      W
#23  2        70 <NA>   <NA>
library(dplyr)
# Flag the rows between each pair of "W" markers using the elevation trend:
# a marker reached while descending opens a span (+1) and a marker reached
# while climbing closes it (-1), so the running sum is 1 inside a span.
df %>%
  group_by(ID) %>%
  mutate(
    water_flag = (1 * !is.na(code)) *
      if_else(elevation < lag(elevation, default = 0), 1, -1),
    # The running sum is already back to 0 on the closing marker, so the
    # marker rows are OR-ed in; otherwise every closing "W" would become NA.
    water = if_else(cumsum(water_flag) == 1 | !is.na(code), "W", NA_character_)
  ) %>%
  ungroup()

This answer is similar to @Onyambu's: create an index column (`ind`) from a counter that increases by one each time a non-NA value is encountered in the `code` column. If the counter is divisible by 2 (i.e. it is an even number), `ind` is set to NA; otherwise `ind` is set to "W". Then, if there is a "W" in either the `code` or the `ind` column, `code` is set to "W", and finally the `ind` helper column is dropped from the data frame.

df %>%
  # Count markers separately for each ID so that an unmatched "W" in one
  # profile cannot shift the odd/even pattern of the profiles after it.
  group_by(ID) %>%
  # `ind` is "W" while the running marker count is odd, NA while it is even.
  mutate(ind = ifelse(cumsum(!is.na(code)) %% 2 == 0, NA, "W")) %>%
  # A row becomes "W" when `ind` says so or when it is itself a marker (the
  # closing marker has an even count, so only `code` catches it). When both
  # are NA the condition is NA and ifelse() returns NA for that row.
  mutate(code = ifelse(ind == "W" | code == "W", "W", NA)) %>%
  ungroup() %>%
  select(-ind)

#>   ID elevation code
#>1   1       150 <NA>
#>2   1       140 <NA>
#>3   1       130    W
#>4   1       120    W
#>5   1       110    W
#>6   1       120    W
#>7   1       130    W
#>8   1       140 <NA>
#>9   1       150 <NA>
#>10  2        90 <NA>
#>11  2        80 <NA>
#>12  2        70 <NA>
#>13  2        66    W
#>14  2        60    W
#>15  2        50    W
#>16  2        66    W
#>17  2        70 <NA>
#>18  2        72 <NA>
#>19  2        68    W
#>20  2        65    W
#>21  2        60    W
#>22  2        68    W
#>23  2        70 <NA>

First I tried to use `fill` but had no success. Then I learned about the benefit of R's recycling property from the question "Rename first and second occurrence of the same specific value in a column iteratively" (thanks to Ronak!).

# Prepare the data: rename the "W" markers alternately to "start" and "end".
# which() drops the NA comparisons, so no temporary "NA" placeholder string is
# needed (a placeholder would also wipe out any genuine "NA" text in `code`);
# the two-element replacement is recycled over all marker rows.
df$code[which(df$code == "W")] <- c("start", "end")

# Now that the start and end markers have different names, `cumsum` can be used to build the groups
library(tidyverse)

# Every non-NA marker opens a new run (`grp`). A run that opens with "start"
# lies inside a span and becomes "W" throughout; in any other run only the
# "end" marker itself turns into "W" and the remaining rows stay as they are.
df <- df %>%
  group_by(grp = cumsum(!is.na(code))) %>%
  dplyr::mutate(
    code = case_when(
      first(code) == "start" ~ "W",
      code == "end" ~ "W",
      TRUE ~ code
    )
  ) %>%
  ungroup() %>%
  select(-grp)

Output:

# A tibble: 23 x 3
      ID elevation code 
   <int>     <int> <chr>
 1     1       150 NA   
 2     1       140 NA   
 3     1       130 W    
 4     1       120 W    
 5     1       110 W    
 6     1       120 W    
 7     1       130 W    
 8     1       140 NA   
 9     1       150 NA   
10     2        90 NA   
11     2        80 NA   
12     2        70 NA   
13     2        66 W    
14     2        60 W    
15     2        50 W    
16     2        66 W    
17     2        70 NA   
18     2        72 NA   
19     2        68 W    
20     2        65 W    
21     2        60 W    
22     2        68 W    
23     2        70 NA  

Although the question has been marked as solved (answer accepted), for future reference: the `fill_run` function in the `runner` package does exactly this.

`fill_run` replaces NA values if they are surrounded by a pair of identical values. Since our additional requirement is to look at the elevation too, we can do something like this:

# Tag each marker with its elevation (e.g. "130 W") before calling
# fill_run(), so that the two markers bounding a gap only count as identical
# when they sit at the same elevation.
df %>%
  group_by(ID) %>%
  mutate(
    code = runner::fill_run(
      ifelse(!is.na(code), paste(elevation, code), code),
      # TRUE is spelled out because `T` is an ordinary, reassignable variable.
      only_within = TRUE
    )
  )

# A tibble: 23 x 3
# Groups:   ID [2]
      ID elevation code 
   <int>     <int> <chr>
 1     1       150 NA   
 2     1       140 NA   
 3     1       130 130 W
 4     1       120 130 W
 5     1       110 130 W
 6     1       120 130 W
 7     1       130 130 W
 8     1       140 NA   
 9     1       150 NA   
10     2        90 NA   
# ... with 13 more rows

Needless to say, you can again mutate non-NA values from code to W very easily, if required.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM