[英]Extract sorted rows from matrix
給定矩陣m
:
# [,1] [,2] [,3] [,4]
# [1,] 2 1 3 4
# [2,] 4 3 2 1
# [3,] 2 3 1 4
# [4,] 1 2 3 4
# [5,] 4 2 3 1
# [6,] 4 3 1 2
# [7,] 2 4 3 1
# [8,] 4 3 2 1
# [9,] 3 2 1 4
# [10,] 1 2 3 4
# [11,] 3 2 4 1
# [12,] 4 3 2 1
# [13,] 2 1 3 4
# [14,] 2 1 3 4
# [15,] 1 2 3 4
# [16,] 4 3 2 1
# [17,] 2 1 3 4
# [18,] 1 4 3 2
# [19,] 3 2 1 4
# [20,] 1 2 3 4
m <- structure(c(2L, 4L, 2L, 1L, 4L, 4L, 2L, 4L, 3L, 1L, 3L, 4L, 2L,
2L, 1L, 4L, 2L, 1L, 3L, 1L, 1L, 3L, 3L, 2L, 2L, 3L, 4L, 3L, 2L,
2L, 2L, 3L, 1L, 1L, 2L, 3L, 1L, 4L, 2L, 2L, 3L, 2L, 1L, 3L, 3L,
1L, 3L, 2L, 1L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 4L,
1L, 4L, 4L, 1L, 2L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 4L, 4L, 1L, 4L,
2L, 4L, 4L), .Dim = c(20L, 4L))
我們可以用這種方式提取排序的行:
apply(m, 1, function(x) !is.unsorted(x) | !is.unsorted(rev(x)))
#[1] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
#FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
矩陣不大也沒關系。 但我在談論數百萬行的矩陣。 我們可以做得更好嗎? 我們可以用矢量化的方式來做嗎? 矩陣m
僅作為玩具數據給出。 我正在尋找一個通用的解決方案 。
這很難看,但你可以通過檢查每列中的所有差異是否為正或負來達到目的。
colSums(sign(diff(t(m)))) %in% c(-3,3)
# [1] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
#[13] FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
我的快速測試表明它的執行速度要快得多。
您可以通過檢查矩陣m
的大小來概括它:
colSums(sign(diff(t(m)))) %in% c(-(ncol(m)-1), ncol(m)-1)
如果您已經對具有重復值的c(1,1,2,3)
等行進行了排序,則可以使用稍微冗長的方法:
sdm <- diff(t(m))
nc <- ncol(m) - 1
colSums(sdm <= 0)==nc | colSums(sdm >= 0)==nc
# [1] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
#[13] FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
一些快速基准測試(記住這些在處理重復值方面並不完全相同):
set.seed(1)
m2 <- m[sample(1:nrow(m),1e6,replace=T),]
## original apply code
system.time({
apply(m2, 1, function(x) !is.unsorted(x) | !is.unsorted(rev(x)))
})
# user system elapsed
# 14.888 0.272 15.153
比較運行:
system.time({
n <- t(m2)
forwards <- colSums(n == sort(m2[1,])) == ncol(m2)
backwards <- colSums(n == rev(sort(m2[1,]))) == ncol(m2)
vec <- forwards | backwards
})
# user system elapsed
# 0.104 0.020 0.123
system.time({
sdm <- diff(t(m2))
nc <- ncol(m) - 1
colSums(sdm <= 0)==nc | colSums(sdm >= 0)==nc
})
# user system elapsed
# 0.248 0.032 0.279
system.time({
apply(m2[,-1] - m2[,-ncol(m2)], 1, function(x) all(x>=0) || all(x <= 0))
})
# user system elapsed
# 3.724 0.004 3.731
library(matrixStats)
system.time(rowVarDiffs(m2) == 0)
# user system elapsed
# 40.176 1.156 42.071
我采取了回收方式:
n <- t(m)
forwards <- colSums(n == sort(m[1,])) == ncol(m)
backwards <- colSums(n == rev(sort(m[1,]))) == ncol(m)
vec <- forwards | backwards
unvec <- apply(m, 1, function(x) !is.unsorted(x) | !is.unsorted(rev(x)))
identical(vec, unvec)
[1] TRUE
一個想法是,如果各行進行排序,那么他們的差異將始終為1,因此,方差為0。使用rowVarDiffs
從matrixStats
包,那么,
library(matrixStats)
rowVarDiffs(m) == 0
#or
rowVarDiffs(rowRanks(m)) == 0
#[1] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
我得到的最好的答案是檢查元素之間的所有差異(連續)是非負的還是非正面的(從上面的colSums答案中借用,當我被毆打它時,我只是測試相同的方法!)
system.time({
dm2 <- m2[,-1] - m2[,-ncol(m2)]
vec <- rowSums(dm2>=0) == (ncol(m2)-1) |
rowSums(dm2<=0) == (ncol(m2)-1)
})
這適用於任何間距的數值(整數或非整數)。
在我有一百萬行的矩陣上:
user system elapsed
0.11 0.00 0.11
與OP相比:
user system elapsed
8.98 0.00 8.98
這是對原始問題中由矩陣m
構造的dim 1e+5 x 4
矩陣的擬議解決方案的基准。 請注意,矩陣m
每行具有相同的數字,並且每行沒有任何重復的數字。
重要的是要注意,只有以下解決方案是通用解決方案,這意味着它們適用於任何整數矩陣,即使每行重復數字:
也就是說,它們適用於以下矩陣,而其他解決方案則失敗!
m <- structure(c(18, 1, 7, 1, 2, 12, 9, 6, 18, 20, 7, 2, 12, 13, 19,
7, 20, 6, 5, 19, 17, 2, 2, 4, 5, 9, 18, 13, 9, 18, 1, 11, 13,
7, 18, 10, 20, 2, 3, 3, 14, 8, 19, 8, 12, 7, 19, 16, 12, 16,
17, 19, 7, 13, 15, 6, 18, 15, 2, 18, 9, 14, 8, 14, 15, 6, 13,
18, 3, 10, 9, 5, 5, 9, 10, 6, 11, 17, 12, 15, 7, 15, 17, 15,
16, 19, 3, 14, 2, 9, 4, 19, 14, 14, 7, 3, 10, 11, 18, 12, 3,
18, 9, 18, 20, 12, 18, 10, 4, 7, 5, 2, 12, 11, 3, 4, 3, 7, 18,
10), .Dim = c(20L, 6L))
set.seed(1)
library(matrixStats)
library(microbenchmark)
m1 <- structure(c(3, 1, 3, 3, 1, 5, 1, 5, 3, 5, 1, 3, 5, 3, 1, 3, 4,
2, 5, 5, 5, 2, 2, 5, 5, 1, 2, 4, 2, 2, 2, 1, 4, 5, 2, 4, 1, 4,
4, 3, 4, 3, 5, 2, 4, 2, 4, 3, 4, 4, 3, 5, 1, 1, 3, 5, 5, 1, 3,
2, 2, 4, 1, 1, 2, 3, 3, 2, 1, 1, 4, 4, 3, 2, 4, 2, 3, 5, 2, 1,
1, 5, 4, 4, 3, 4, 5, 1, 5, 3, 5, 2, 2, 4, 5, 1, 2, 3, 1, 4), .Dim = c(20L,
5L))
m <- m1[sample(1:nrow(m1),1e5,replace=T),]
dim(m)
#[1] 100000 5
f_m0h3n <- function(m) apply(m, 1, function(x) !is.unsorted(x) || !is.unsorted(rev(x)))
f_thelatemail1 <- function(m) colSums(sign(diff(t(m)))) %in% c(-(ncol(m)-1), ncol(m)-1)
f_thelatemail2 <- function(m) {sdm <- diff(t(m));nc <- ncol(m) - 1;colSums(sdm <= 0)==nc | colSums(sdm >= 0)==nc}
f_sebastian_c <- function(m){n <- t(m);forwards <- colSums(n == sort(m[1,])) == ncol(m);
backwards <- colSums(n == rev(sort(m[1,]))) == ncol(m);forwards | backwards}
f_Sotos1 <- function(m) rowVarDiffs(m) == 0
f_Sotos2 <- function(m) apply(m, 1, function(i) var(diff(i)) == 0)
f_Sotos3 <- function(m) rowVarDiffs(rowRanks(m)) == 0
f_stephematician <- function(m2) {dm2 <- m2[,-1] - m2[,-ncol(m2)];
vec <- rowSums(dm2>=0) == (ncol(m2)-1) | rowSums(dm2<=0) == (ncol(m2)-1);vec}
f_Chirayu_Chamoli <- function(m) {i=apply(m, 1, is.unsorted);j=apply(m[,c(ncol(m):1),drop = FALSE], 1, is.unsorted);k=xor(i,j);k}
res <- f_m0h3n(m)
all(res==f_thelatemail1(m))
# [1] TRUE
all(res==f_thelatemail2(m))
# [1] TRUE
all(res==f_sebastian_c(m))
# [1] TRUE
all(res==f_Sotos1(m))
# [1] TRUE
all(res==f_Sotos2(m))
# [1] TRUE
all(res==f_Sotos3(m))
# [1] TRUE
all(res==f_stephematician(m))
# [1] TRUE
all(res==f_Chirayu_Chamoli(m))
# [1] TRUE
microbenchmark(f_m0h3n(m), f_thelatemail1(m), f_thelatemail2(m), f_sebastian_c(m), f_Sotos1(m), f_Sotos2(m), f_Sotos3(m), f_stephematician(m), f_Chirayu_Chamoli(m))
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_m0h3n(m) 504.901409 522.640977 542.398387 535.72417 561.723344 634.99808 100
# f_thelatemail1(m) 9.426029 11.479137 23.454441 13.20548 17.308545 91.18738 100
# f_thelatemail2(m) 8.841014 10.607174 25.820464 12.09675 17.740771 103.00244 100
# f_sebastian_c(m) 5.358874 5.975436 9.709314 6.66186 8.725784 77.40695 100
# f_Sotos1(m) 1526.461296 1604.177128 1639.571861 1644.11763 1669.721992 1752.77551 100
# f_Sotos2(m) 1772.076169 1850.762817 1889.386328 1891.78832 1917.528489 2047.85548 100
# f_Sotos3(m) 1538.428094 1600.285447 1637.314434 1644.03891 1671.703437 1738.84665 100
# f_stephematician(m) 8.994555 9.986554 15.098616 10.97570 12.217240 83.86915 100
# f_Chirayu_Chamoli(m) 273.571757 289.372545 321.199457 330.37146 346.979005 384.64962 100
這是你可以做的另一件簡單的事情。 我認為這已經足夠概括但速度明智,它與Latemail的矢量化解決方案並不接近。
i=apply(m, 1, is.unsorted)
j=apply(m[,c(ncol(m):1),drop = FALSE], 1, is.unsorted)
k=xor(i,j)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.