[英]Fill blank values with data from following columns
有这样的数据框:
data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"))
如何检测 text1 列中某一行是否为空白,并用同一行后续列(text2、text3 或 text4 等)中已有的值来填充这个空白,同时在填充之后把被取走值的那个位置留空(NA)?
预期输出示例
data.frame(id = c(1,2,3,4), text1 = c("sth","another","another","all"), text2 = c("more","","add",""), text3 = c("final","and","where",""))
向量化的基本 R 方法:
# Move the first non-blank value of the later text columns into text1 for
# every row whose text1 is blank, then blank the cell the value came from.

# Row indices where text1 is blank.
inds <- which(df$text1 == "")
# Build a (row, column) index matrix pointing at the first non-blank cell
# among columns 3..ncol(df) of each of those rows.
# max.col() breaks ties at random by default, so ties.method = "first" is
# needed to always pick the leftmost non-blank column; "+ 2" converts the
# position within columns 3..ncol(df) back into a column index of df.
# drop = FALSE keeps a data frame even when only one column is selected.
vals <- cbind(
  inds,
  max.col(df[inds, 3:ncol(df), drop = FALSE] != "", ties.method = "first") + 2
)
# Copy the located values into text1 ...
df$text1[inds] <- df[vals]
# ... and blank out the cells they were taken from.
df[vals] <- ""
df
# id text1 text2 text3
#1 1 sth more final
#2 2 another and
#3 3 another add where
#4 4 all
数据
# Sample data: an id column followed by three text columns. Blank cells are
# empty strings, and the text columns are kept as character (not factor).
df <- data.frame(
  id = c(1, 2, 3, 4),
  text1 = c("sth", "", "another", ""),
  text2 = c("more", "another", "add", ""),
  text3 = c("final", "and", "where", "all"),
  stringsAsFactors = FALSE
)
在基础 R 中,您可以执行以下操作:
# Turn blank cells into NA and collapse each row into one comma-separated
# string; paste() writes NA out as the literal text "NA".
txt <- do.call(
  paste,
  c(sep = ",", `is.na<-`(df, df == ""))
)
# In each row string, move the first run of ",NA" fields behind the value that
# follows it, then parse the strings back into a data frame that reuses the
# column names of df.
df1 <- read.csv(
  text = sub("((?:,NA)+)(,\\w+)", "\\2\\1", txt),
  header = FALSE,
  col.names = names(df),
  stringsAsFactors = FALSE
)
# The "NA" fields were read back as missing values; show them as blanks again.
df1[is.na(df1)] <- ""
df1
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
下面是一种 data.table 方法,具体步骤的解释见代码中的注释。
# sample data
df <- data.frame(
  id = c(1, 2, 3, 4),
  text1 = c("sth", "", "another", ""),
  text2 = c("more", "another", "add", ""),
  text3 = c("final", "and", "where", "all"),
  stringsAsFactors = FALSE
)
library(data.table)
# convert df to a data.table (by reference)
setDT(df)
# per id, paste all remaining columns together into one ";"-separated string
ans <- df[, .(string = paste0(.SD, collapse = ";")), by = .(id)][]
# id string
# 1: 1 sth;more;final
# 2: 2 ;another;and
# 3: 3 another;add;where
# 4: 4 ;;all
# remove leading ";"s, i.e. drop the blank fields at the start of each string
ans[, string := gsub("^;+", "", string)]
# id string
# 1: 1 sth;more;final
# 2: 2 another;and
# 3: 3 another;add;where
# 4: 4 all
# split the string back into columns text1, text2, ... and remove the
# temporary string column. seq_along() replaces 1:length(), which would yield
# c(1, 0) if the split produced no columns. The column names are built from
# ans$string, exactly as in the original call.
ans[, paste0("text", seq_along(tstrsplit(ans$string, ";"))) :=
  tstrsplit(string, ";")][, string := NULL]
# id text1 text2 text3
# 1: 1 sth more final
# 2: 2 another and <NA>
# 3: 3 another add where
# 4: 4 all <NA> <NA>
您可以使用 dplyr + purrr:
# For every id, pack the text columns into a list-column named "data", take
# the first non-blank value of its first row as new_text, then unpack the
# text columns again.
# nest() and unnest() are called with explicit column specifications: the
# bare nest(-id) and the argument-less unnest() are deprecated calling
# conventions that emit warnings on tidyr >= 1.0.
df %>%
  tidyr::nest(data = -id) %>%
  dplyr::mutate(
    new_text = purrr::map_chr(
      data,
      # Flatten the first row to a plain vector, drop the blank entries and
      # keep the first remaining value.
      ~ as.vector(t(.x[1, ])) %>%
        .[. != ""] %>%
        dplyr::first()
    )
  ) %>%
  tidyr::unnest(cols = data)
A tibble: 4 x 5
id text1 text2 text3 new_text
<dbl> <fct> <fct> <fct> <chr>
1 1 sth more final sth
2 2 "" another and another
3 3 another add where another
4 4 "" "" all all
在这个阶段,为什么不用 dplyr 方法呢? 诚然,中间有一点基础 R
# Fill blank text cells from the neighbouring text columns of the same id by
# reshaping to long format, filling, and reshaping back to wide format.
# Sample data: blank cells are stored as empty strings.
df <- data.frame(id = c(1,2,3,4),
text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"))
library("dplyr")
library("tidyr")
df_filled <- df %>%
# Long format: one row per (id, text column), with the column name stored in
# text_id and the cell content in value.
pivot_longer(cols = starts_with("text"),
names_to = "text_id",
values_to = "value") %>%
# Coerce value to character (a no-op if it already is character).
mutate(value = as.character(value)) %>%
group_by(id) %>%
# Turn blanks into NA so that fill() treats them as missing.
mutate(value = if_else(value=="", as.character(NA), value)) %>%
# Keep a copy of value from before filling; its NAs mark the cells that were
# originally blank.
mutate(previously_missing = value) %>%
# Within each id, fill missing values downwards first and then upwards.
tidyr::fill(value, .direction = "downup")
# For every originally blank cell that is not in text3, set the value of the
# next row of the long table to NA.
# NOTE(review): this relies on the long rows being ordered text1, text2, text3
# within each id and hard-codes "text3" as the last text column -- confirm
# and adjust if the set of text columns changes.
df_filled$value[which(is.na(df_filled$previously_missing)&df_filled$text_id!="text3")+1] <- NA
# Drop the grouping and reshape back to one row per id.
df_filled %>%
ungroup() %>%
pivot_wider(id_cols = id,
names_from = "text_id",
values_from = "value")
#> # A tibble: 4 x 4
#> id text1 text2 text3
#> <dbl> <chr> <chr> <chr>
#> 1 1 sth more final
#> 2 2 another <NA> and
#> 3 3 another add where
#> 4 4 all <NA> <NA>
由reprex 包(v0.3.0) 于 2020 年 2 月 19 日创建
另一种 base R 解决方案是自定义一个函数 swap,并按行应用它,即:
# Exchange the first element of v with the first element of v that has a
# positive character count; returns the modified vector.
# When the first element itself is the first such element, or when no element
# qualifies, v is returned with its values unchanged.
swap <- function(v) {
  first_filled <- head(which(nchar(v) > 0), 1)
  pos <- c(1, first_filled)
  v[pos] <- v[rev(pos)]
  v
}
# Run swap() on every row of all columns except the first. apply() returns one
# column per input row, so t() turns the result back into row orientation
# before it is written into those columns.
df[-1] <- t(apply(X = df[-1], MARGIN = 1, FUN = swap))
这样就得到:
> df
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.