繁体   English   中英

R dataframe 中每一列的前 n 行

[英]Top n rows for every column in R dataframe

我有以下 CSV 文件。

,cid1,cid2,cid3
rid1,0.1,0.4,0.3
rid2,1.0,0.1,0.5
rid3,0.2,0.5,0.1
rid4,0.3,0.4,0.8
rid5,0.2,0.3,0.7
rid6,0.9,0.2,0.1
rid7,0.4,0.8,0.9
rid8,0.6,0.5,0.7
rid9,0.3,0.9,0.4

我想显示文件中每一列数值最高的 n 行。例如,对于 n = 3,我想得到下面的输出。

cid1  rid2  1.0
cid1  rid6  0.9
cid1  rid8  0.6
                    # Blank lines are only for visibility.
cid2  rid9  0.9
cid2  rid7  0.8
cid2  rid8  0.5

cid3  rid7  0.9
cid3  rid4  0.8
cid3  rid8  0.7

这是我到目前为止所拥有的:

# Read the table; the first CSV column (rid1, rid2, ...) becomes the row names.
csv <- read.csv("input.csv", row.names = 1)
n <- 3

# For every column, print the n rows with the largest values as a one-column
# data frame, so each value stays labelled with its row name.
for (col in colnames(csv)) {
  # Order on the plain column vector (csv[[col]]) rather than on a one-column
  # data frame. head() caps the selection at nrow(csv) instead of producing
  # NA indices when n exceeds the number of rows.
  top <- head(order(csv[[col]], decreasing = TRUE), n)
  print(csv[top, col, drop = FALSE])
}

这几乎给了我想要的东西。

     cid1
rid2  1.0
rid6  0.9
rid8  0.6
     cid2
rid9  0.9
rid7  0.8
rid3  0.5
     cid3
rid7  0.9
rid4  0.8
rid5  0.7

这是一种tidyverse方式:

我们可以用行名创建一个新列,把数据转换为长格式,然后为每个列名选取最大的 3 个值。

library(tidyverse)

# Move the row names into a `rowname` column, reshape to long form (one row
# per row-id / column-id pair), then keep the 3 largest values within each
# original column. with_ties = FALSE returns exactly 3 rows per group.
csv %>%
  rownames_to_column(var = "rowname") %>%
  pivot_longer(cols = !rowname, names_to = "name", values_to = "value") %>%
  group_by(name) %>%
  slice_max(order_by = value, n = 3, with_ties = FALSE)

#  rowname name  value
#  <chr>   <chr> <dbl>
#1 rid2    cid1    1  
#2 rid6    cid1    0.9
#3 rid8    cid1    0.6
#4 rid9    cid2    0.9
#5 rid7    cid2    0.8
#6 rid3    cid2    0.5
#7 rid7    cid3    0.9
#8 rid4    cid3    0.8
#9 rid5    cid3    0.7

在您的 for 循环中,您可能希望把前 n 行与对应的 rid 和列名组合到一个 data.frame 中。

# Read the table; the first CSV column holds the row ids (rid1, rid2, ...).
csv <- read.csv("input.csv", row.names = 1)

n <- 3
# seq_len() makes the loop a no-op for a table without columns, whereas
# 1:ncol(csv) would iterate over c(1, 0).
for (k in seq_len(ncol(csv))) {
  # Row positions of the n largest values in column k. head() caps the
  # selection at the number of rows instead of yielding NA indices.
  o <- head(order(-csv[, k]), n)
  # rep() keeps all three columns the same length, even for an empty selection.
  print(data.frame(
    cid = rep(names(csv)[k], length(o)),
    rid = rownames(csv)[o],
    v = csv[o, k]
  ))
}
#    cid  rid   v
# 1 cid1 rid2 1.0
# 2 cid1 rid6 0.9
# 3 cid1 rid8 0.6
#    cid  rid   v
# 1 cid2 rid9 0.9
# 2 cid2 rid7 0.8
# 3 cid2 rid3 0.5
#    cid  rid   v
# 1 cid3 rid7 0.9
# 2 cid3 rid4 0.8
# 3 cid3 rid5 0.7

或者,您可以使用lapply ,它会生成一个列表。

n <- 3
# Build one data frame per column holding its n largest values; the overall
# result is a list with one element per column of `csv`.
lapply(seq_along(csv), function(x) {
  # Label every value with its column id and row id.
  labelled <- data.frame(
    cid = names(csv)[x],
    rid = rownames(csv),
    v = csv[, x]
  )
  # Sort by value (descending) and keep the first n rows. head() returns
  # fewer rows when n exceeds nrow(csv) instead of padding with NA rows.
  head(labelled[order(-csv[, x]), ], n)
})
# [[1]]
#    cid  rid   v
# 2 cid1 rid2 1.0
# 6 cid1 rid6 0.9
# 8 cid1 rid8 0.6
# 
# [[2]]
#    cid  rid   v
# 9 cid2 rid9 0.9
# 7 cid2 rid7 0.8
# 3 cid2 rid3 0.5
# 
# [[3]]
#    cid  rid   v
# 7 cid3 rid7 0.9
# 4 cid3 rid4 0.8
# 5 cid3 rid5 0.7

编辑

如果要按阈值(而不是按排序后的前 n 个)取子集,请执行:

th <- .5
# For every column, print the rows whose value is at least `th`.
for (k in seq_len(ncol(csv))) {
  # which() converts the comparison into row positions and drops NA results,
  # so missing values do not show up as NA rows.
  rows <- which(csv[, k] >= th)
  # rep() matches the length of `cid` to the selection; a length-1 `cid`
  # makes data.frame() fail when no value reaches the threshold.
  print(data.frame(
    cid = rep(names(csv)[k], length(rows)),
    rid = rownames(csv)[rows],
    v = csv[rows, k]
  ))
}
#    cid  rid   v
# 1 cid1 rid2 1.0
# 2 cid1 rid6 0.9
# 3 cid1 rid8 0.6
#    cid  rid   v
# 1 cid2 rid3 0.5
# 2 cid2 rid7 0.8
# 3 cid2 rid8 0.5
# 4 cid2 rid9 0.9
#    cid  rid   v
# 1 cid3 rid2 0.5
# 2 cid3 rid4 0.8
# 3 cid3 rid5 0.7
# 4 cid3 rid7 0.9
# 5 cid3 rid8 0.7

或者,使用lapply

th <- .5
# One data frame per column with the rows whose value is at least `th`.
lapply(seq_along(csv), function(x) {
  # which() yields row positions and skips NA comparisons, which would
  # otherwise be returned as all-NA rows by logical indexing.
  ss <- which(csv[[x]] >= th)
  data.frame(cid = names(csv)[x], rid = rownames(csv), v = csv[, x])[ss, ]
})
# [[1]]
#    cid  rid   v
# 2 cid1 rid2 1.0
# 6 cid1 rid6 0.9
# 8 cid1 rid8 0.6
# 
# [[2]]
#    cid  rid   v
# 3 cid2 rid3 0.5
# 7 cid2 rid7 0.8
# 8 cid2 rid8 0.5
# 9 cid2 rid9 0.9
# 
# [[3]]
#    cid  rid   v
# 2 cid3 rid2 0.5
# 4 cid3 rid4 0.8
# 5 cid3 rid5 0.7
# 7 cid3 rid7 0.9
# 8 cid3 rid8 0.7

编辑 2

这是按值排序后的版本。

th <- .5
# One data frame per column with the rows whose value is at least `th`,
# sorted by value in descending order and labelled with the row ids.
lapply(seq_along(csv), function(x) {
  # Row positions of column x from largest to smallest value.
  ord <- order(-csv[, x])
  # Keep the positions that reach the threshold; which() drops NA comparisons.
  keep <- ord[which(csv[ord, x] >= th)]
  # rep() keeps `cid` the same length as the selection, so an empty
  # selection gives a zero-row data frame instead of an error.
  data.frame(
    cid = rep(names(csv)[x], length(keep)),
    rid = rownames(csv)[keep],
    v = csv[keep, x],
    row.names = rownames(csv)[keep]
  )
})
# [[1]]
#       cid  rid   v
# rid2 cid1 rid2 1.0
# rid6 cid1 rid6 0.9
# rid8 cid1 rid8 0.6
# 
# [[2]]
#       cid  rid   v
# rid9 cid2 rid9 0.9
# rid7 cid2 rid7 0.8
# rid3 cid2 rid3 0.5
# rid8 cid2 rid8 0.5
# 
# [[3]]
#       cid  rid   v
# rid7 cid3 rid7 0.9
# rid4 cid3 rid4 0.8
# rid5 cid3 rid5 0.7
# rid8 cid3 rid8 0.7
# rid2 cid3 rid2 0.5

或者,使用 for 循环:

# For every column, print the rows whose value is at least `th` (set earlier),
# sorted by value in descending order and labelled with the row ids.
for (x in seq_len(ncol(csv))) {
  # Row positions of column x from largest to smallest value.
  ord <- order(-csv[, x])
  # Keep the positions that reach the threshold; which() drops NA comparisons.
  keep <- ord[which(csv[ord, x] >= th)]
  # rep() keeps `cid` the same length as the selection, so an empty
  # selection prints a zero-row data frame instead of raising an error.
  print(data.frame(
    cid = rep(names(csv)[x], length(keep)),
    rid = rownames(csv)[keep],
    v = csv[keep, x],
    row.names = rownames(csv)[keep]
  ))
}
#       cid  rid   v
# rid2 cid1 rid2 1.0
# rid6 cid1 rid6 0.9
# rid8 cid1 rid8 0.6
#       cid  rid   v
# rid9 cid2 rid9 0.9
# rid7 cid2 rid7 0.8
# rid3 cid2 rid3 0.5
# rid8 cid2 rid8 0.5
#       cid  rid   v
# rid7 cid3 rid7 0.9
# rid4 cid3 rid4 0.8
# rid5 cid3 rid5 0.7
# rid8 cid3 rid8 0.7
# rid2 cid3 rid2 0.5

数据:

# Sample data from the question: nine rows (rid1..rid9) by three numeric
# columns (cid1..cid3), with the row ids stored as row names.
csv <- data.frame(
  cid1 = c(0.1, 1, 0.2, 0.3, 0.2, 0.9, 0.4, 0.6, 0.3),
  cid2 = c(0.4, 0.1, 0.5, 0.4, 0.3, 0.2, 0.8, 0.5, 0.9),
  cid3 = c(0.3, 0.5, 0.1, 0.8, 0.7, 0.1, 0.9, 0.7, 0.4),
  row.names = paste0("rid", 1:9)
)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM