[英]uniting multiple columns that contain empty NA values
我有一个数据集,它生成了多个列,但这些列存放的其实是同一种数据(boxID)。 我想把这些列合并成一列,使结果只保留boxID(字母数字代码:两个字母的州名缩写加2位数字),而不带NA值;而带上NA值正是我现在使用dplyr的unite()函数时得到的结果(注:unite()实际上属于tidyr包)。 有没有类似的函数可以做到这一点,还是我需要借助stringr的模式匹配来提取boxID?
# Reproducible example data, in dput() form: a tibble with 48 rows and 13
# columns (boxId, boxId__1, boxId__2, boxID, boxID__1 ... boxID__9).
# Every row carries exactly one non-NA box ID; which column holds it varies
# by row. Columns that contain only NA are stored as logical vectors.
dat <- structure(list(boxId = c("CA04", "CA04", "CA01", "CA02", "CA04",
"CA02", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxId__1 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NM01", "NM14", "NM15",
"NM16", "NM17", "NM18", "NM19", "NM20", "NM02", "NM03", "NM04",
"NM05", "NM06", "NM07", "NM08", "NM09", "NM10", "NM11", "NM12",
"NM13"), boxId__2 = c(NA, NA, NA, NA, NA, NA, "FL01", "FL02",
"FL03", "FL09", "FL08", "FL07", "FL04", "FL05", "FL06", "FL10",
"FL11", "FL13", "FL12", "FL20", "FL19", "FL18", "FL17", "FL16",
"FL14", "FL15", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxID = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxID__1 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
boxID__2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), boxID__3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "IN05", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), boxID__4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), boxID__5 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
boxID__6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), boxID__7 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
boxID__8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), boxID__9 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "WA11", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), row.names = c(NA, -48L), class = c("tbl_df", "tbl", "data.frame"
))
数据如下所示:
# A tibble: 48 x 13
boxId boxId__1 boxId__2 boxID boxID__1 boxID__2 boxID__3 boxID__4 boxID__5 boxID__6
<chr> <chr> <chr> <lgl> <lgl> <lgl> <chr> <lgl> <lgl> <lgl>
1 CA04 NA NA NA NA NA NA NA NA NA
2 CA04 NA NA NA NA NA NA NA NA NA
3 CA01 NA NA NA NA NA NA NA NA NA
4 CA02 NA NA NA NA NA NA NA NA NA
5 CA04 NA NA NA NA NA NA NA NA NA
6 CA02 NA NA NA NA NA NA NA NA NA
7 NA NA FL01 NA NA NA NA NA NA NA
8 NA NA FL02 NA NA NA NA NA NA NA
9 NA NA FL03 NA NA NA NA NA NA NA
10 NA NA FL09 NA NA NA NA NA NA NA
# … with 38 more rows, and 3 more variables: boxID__7 <lgl>, boxID__8 <lgl>, boxID__9 <chr>
当我使用unite()时,它看起来像这样:
# Paste every column of `dat` into a single column named newID.
# NA cells are written out as the literal text "NA", which is the
# unwanted result shown in the output that follows.
unite(dat, col = "newID")
结果里仍然带着这些NA值,我不知道该如何去掉:
# A tibble: 48 x 1
newID
<chr>
1 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
2 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
3 CA01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
4 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
5 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
6 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
7 NA_NA_FL01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
8 NA_NA_FL02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
您能确定每行只有一个非NA值吗? 如果是,那么可以把表转换成长格式,再删除所有NA。 这样得到的表与原始表的行数相同。
library("tidyverse")
# Long-format approach, read from the inside out:
#   1. mutate(): tag each record with its original row index.
#   2. gather(): stack all box columns into (box, name) pairs, keeping `row`.
#   3. drop_na(): discard the pairs whose value is missing.
# Note: gather() is superseded in current tidyr; pivot_longer() is its
# successor, but it returns the rows in a different order.
drop_na(
  gather(
    mutate(dat, row = row_number()),
    key = "box", value = "name", -row
  )
)
#> # A tibble: 48 x 3
#> row box name
#> <int> <chr> <chr>
#> 1 1 boxId CA04
#> 2 2 boxId CA04
#> 3 3 boxId CA01
#> 4 4 boxId CA02
#> 5 5 boxId CA04
#> 6 6 boxId CA02
#> 7 29 boxId__1 NM01
#> 8 30 boxId__1 NM14
#> 9 31 boxId__1 NM15
#> 10 32 boxId__1 NM16
#> # ... with 38 more rows
由reprex软件包 (v0.2.1)创建于2019-03-11
一种基础R(base R)的做法是:先用unlist把数据中的所有值展开成一个向量,
然后只保留其中的非NA值,用它们创建一个只有一列的新数据框。
# Flatten all columns into one named vector (column by column), then keep
# only the entries that are present. The element names produced by unlist()
# become the row names of the resulting one-column data frame.
x <- unlist(dat)
data.frame(new_id = x[complete.cases(x)])
# new_id
#boxId1 CA04
#boxId2 CA04
#boxId3 CA01
#boxId4 CA02
#boxId5 CA04
#boxId6 CA02
#boxId__129 NM01
#boxId__130 NM14
#boxId__131 NM15
#......
使用coalesce
:
# Row-wise "first non-missing value" across all columns.
# The columns are converted to character first so that the all-NA logical
# columns share a type with the character ones; each column is then passed
# to coalesce() as a separate argument by splicing its name as a symbol.
transmute(
  mutate_all(dat, as.character),
  newID = coalesce(!!!syms(names(dat)))
)
# # A tibble: 48 x 1
# newID
# <chr>
# 1 CA04
# 2 CA04
# 3 CA01
# 4 CA02
# 5 CA04
# 6 CA02
# 7 FL01
# 8 FL02
# 9 FL03
# 10 FL09
# # … with 38 more rows
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.