繁体   English   中英

如何按r中的特定行拆分数据帧

[英]how to split a dataframe by specific rows in r

我有一个data看起来像这样:

data <- structure(list(A = c("1", "1", "1", "A", "10", "10", "B", "200"), B = c("2", "2", "2", "B", "20", "20", "C", "300"), C = c("3","3", "3", "C", "30", "30", "D", "400"), D = c("4", "4", "4", "D", "40", "40", NA, NA)), row.names = c(NA, -8L), class = c("tbl_df","tbl", "data.frame"))
data

> data
# A tibble: 8 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 1     2     3     4    
2 1     2     3     4    
3 1     2     3     4    
4 A     B     C     D    
5 10    20    30    40   
6 10    20    30    40   
7 B     C     D     NA   
8 200   300   400   NA    

按行绑定是错误的,我想将data拆分为 3 个子数据( d1d2d3 ),如下所示:

注意:在我的真实情况下, d1d2d3有不同的nrow() 我设置nrow(d1) = 3 , nrow(d2) = 2nrow(d3) = 1只是为了简化这个例子中的问题。

d1 <- data.frame(A = rep(1,3),  B = rep(2,3),   C = rep(3,3),   D = rep(4,3))
d2 <- data.frame(A = rep(10,2), B = rep(20,2),  C = rep(30,2),  D = rep(40,2))
d3 <- data.frame(        B = 200, C = 300, D = 400)

> d1
  A B C D
1 1 2 3 4
2 1 2 3 4
3 1 2 3 4
> d2
   A  B  C  D
1 10 20 30 40
2 10 20 30 40
> d3
    B   C   D
1 200 300 400

然后我可以使用bind_rowsdplyr正确绑定它们

bind_rows(d1, d2, d3) %>% as_tibble()

# A tibble: 6 x 4
      A     B     C     D
  <dbl> <dbl> <dbl> <dbl>
1     1     2     3     4
2     1     2     3     4
3     1     2     3     4
4    10    20    30    40
5    10    20    30    40
6    NA   200   300   400

问题是我对如何从data获取d1d2d3感到困扰。

任何帮助将不胜感激!

这是一个tidyverse解决方案。

process_df获取一个数据框并设置列名并删除第一行。

process_df <- function(df, ...) {
  df %>%
    set_names(slice(., 1)) %>%
    select(which(!is.na(names(.)))) %>%
    slice(-1)
}

添加仅包含列名称的标题行。

使用rowwise()c_across()获取所有列的值。 使用它来确定哪些行是标题行。

group_map将在每个组上应用一个函数, bind_rows将组合结果。

data %>%
  add_row(!!!set_names(names(.)), .before = 1) %>%
  rowwise() %>%
  mutate(
    group = all(is.na(c_across()) | c_across() %in% names(.))
  ) %>%
  ungroup() %>%
  mutate(group = cumsum(group)) %>%
  group_by(group) %>%
  group_map(process_df) %>%
  bind_rows()
#> # A tibble: 6 x 4
#>   A     B     C     D    
#>   <chr> <chr> <chr> <chr>
#> 1 1     2     3     4    
#> 2 1     2     3     4    
#> 3 1     2     3     4    
#> 4 10    20    30    40   
#> 5 10    20    30    40   
#> 6 NA    200   300   400 

的用法说明!!! 在新new_row

set_names(names(.))创建一个命名向量,表示我们要添加的行。 但是, add_row不接受命名向量 - 它希望将值指定为参数。

这是一个简化的示例。

new_row <- c(speed = 1, dist = 2)

add_row不接受命名向量,因此这不起作用。

cars %>% add_row(new_row, .before = TRUE)
# (Error)

!!! 将向量解包作为函数的参数。

cars %>% add_row(!!!new_row, .before = TRUE)
# (Works)

!!! 以上基本上导致:

cars %>% add_row(speed = 1, dist = 2, .before = TRUE)

这是否有效:

data
# A tibble: 5 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 1     2     3     4    
2 A     B     C     D    
3 10    20    30    40   
4 B     C     D     NA   
5 200   300   400   NA   
data <- rbind(LETTERS[1:4],data)
data
# A tibble: 6 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 A     B     C     D    
2 1     2     3     4    
3 A     B     C     D    
4 10    20    30    40   
5 B     C     D     NA   
6 200   300   400   NA   
split(data, rep(1:ceiling(nrow(data)/2), each = 2))
$`1`
# A tibble: 2 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 A     B     C     D    
2 1     2     3     4    

$`2`
# A tibble: 2 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 A     B     C     D    
2 10    20    30    40   

$`3`
# A tibble: 2 x 4
  A     B     C     D    
  <chr> <chr> <chr> <chr>
1 B     C     D     NA   
2 200   300   400   NA   

基础 R 解决方案:

Map(function(x){setNames(data.frame(t(x[,2, drop = FALSE])), x[,1])[,!is.na(x[,1])]},
  split.default(cbind(X0 = names(df), data.frame(t(df))), c(0, seq_len(nrow(df)) %/% 2)))

包括将单独的 data.frames 推送到 Global Environment:

list2env(setNames(Map(function(x){setNames(data.frame(t(x[,2, drop = FALSE])), x[,1])[,!is.na(x[,1])]},
  split.default(cbind(X0 = names(df), data.frame(t(df))), c(0, seq_len(nrow(df)) %/% 2))),
    paste0('d', seq_len(ceiling(nrow(df) / 2)))), .GlobalEnv)

Tidyverse 解决方案:

library(tidyverse)
df %>%
  rbind(names(df), .) %>%
  split(cumsum(seq_len(nrow(.)) %% 2)) %>%
  Map(function(x){setNames(x[2,], x[1,])[,complete.cases(t(x))]}, .) %>%
  set_names(str_c('d', names(.))) %>%
  list2env(., .GlobalEnv)

注意解决方案已调整以反映对问题的编辑:

rdf <- type.convert(data.frame(t(rbind(names(df), df))))

Map(function(x){
  y <- setNames(t(x[,-1, drop = FALSE]), x[,1]); y[,!is.na(colSums(y))]
}, split.default(rdf, cumsum(!sapply(rdf, is.integer))))

新的解决方案,包括推送到 Global Env:

rdf <- type.convert(data.frame(t(rbind(names(df), df))))

dflist <- Map(function(x) {
  y <-
    setNames(t(x[, -1, drop = FALSE]), x[, 1])
  y[, !is.na(colSums(y))]
}, split.default(rdf, cumsum(!sapply(rdf, is.integer))))

list2env(setNames(dflist, paste0('d', names(dflist))), .GlobalEnv)

调整后的 Tidyverse 解决方案:

df %>%
  rbind(names(.), .) %>%
  t() %>%
  data.frame() %>% 
  type.convert() %>%
  split.default(cumsum(!sapply(., is.integer))) %>%
  Map(function(x){
    y <- setNames(t(x[,-1, drop = FALSE]), x[,1])
    data.frame(y[,!is.na(colSums(y)), drop = FALSE])}, .) %>%
  set_names(str_c('d', names(.))) %>%
  list2env(., .GlobalEnv)

数据:

df <- structure(list(A = c("1", "A", "10", "B", "200"), B = c("2", "B", "20", "C", "300"), C = c("3", "C", "30", "D", "400"), D = c("4","D", "40", NA, NA)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))

更新数据:

df <- structure(list(A = c("1", "1", "1", "A", "10", "10", "B", "200"), B = c("2", "2", "2", "B", "20", "20", "C", "300"), C = c("3","3", "3", "C", "30", "30", "D", "400"), D = c("4", "4", "4", "D", "40", "40", NA, NA)), row.names = c(NA, -8L), class = c("tbl_df","tbl", "data.frame"))

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM