如何在R中生成N个最不相似的组合

Question

我有一组6种颜色（x）和N个个体，每个个体都需要用唯一的颜色代码标记；每只动物身上有四个标记位置，每个位置可以涂上不同的颜色。 我一共有6种不同的颜色。

例如，两个个体的代码可能是：
1.红色，蓝色，蓝色，白色
2.白色，黄色，粉红色，黄色

但是，由于任何位置的颜色都有可能脱落，因此我想生成一个带冗余的标记方案，即使在一个（甚至两个？）位置的颜色丢失之后，仍然可以把每个个体与其他个体区分开。

尽管6种颜色和4个位置可以给出1296种组合，我仍然发现很难从中选出N个最不相似的组合：

可重现的示例：

library(gtools)

# The six available colours
x <- c("white", "red", "green", "blue", "pink", "yellow")

# Every ordered 4-position code, colours may repeat (6^4 = 1296 rows)
Perms <- permutations(n = length(x), r = 4, v = x, repeats.allowed = TRUE)

# Show how many codes exist and what the first few look like
print(nrow(Perms))
head(Perms)

请注意，前6种组合彼此之间仅在1个位置上颜色不同——如果这些代码丢失了超过1个标记，它们就无法再相互区分了！

那么，对于介于50-150之间的N值，如何选择N个最不相似的组合 ？

谢谢！

Answer 1

我无法最终回答您的问题，但是我有一个想法可能会对您有所帮助。

用每种颜色的第一个字母构建字符串代码：

library(gtools)

# One-letter abbreviation for each colour
x <- c("w", "r", "g", "b", "p", "y")

# All 4-position codes with repetition allowed, one code per row
Perms <- permutations(n = length(x), r = 4, v = x, repeats.allowed = TRUE)

# Glue the four letters of every row into a single string such as "bbbg"
m <- paste0(Perms[, 1], Perms[, 2], Perms[, 3], Perms[, 4])

> head(m)
[1] "bbbb" "bbbg" "bbbp" "bbbr" "bbbw" "bbby"

随机抽取n个代码：

# Fix the RNG seed so the same codes are drawn on every run
set.seed(1)

# Number of codes to draw
n <- 50

# Draw n distinct codes (sampling without replacement) from all codes in m
y <- sample(m, size = n, replace = FALSE)

创建一个 n × n 的Levenshtein距离矩阵：

library(vwr)

# Pairwise distance matrix: for each sampled code, compute its Levenshtein
# distance to every sampled code (itself included); one column per code.
lvmat <- sapply(
  y,
  function(code) levenshtein.distance(code, y)
)

> lvmat[1:5, 1:5]
     grrp pgpg rprr yprw gggp
grrp    0    4    3    3    2
pgpg    4    0    4    4    3
rprr    3    4    0    2    4
yprw    3    4    2    0    4
gggp    2    3    4    4    0

现在，您可以通过自助法（bootstrap）或任何您喜欢的方法来最大化 sum(lvmat)，从而得到差异最大的一组组合。

Answer 2

下面是根据上面LAP的建议写出的可重现示例。 注意，由于依赖于随机采样，因此仍不能保证不会出现仅在一个位置上不同的代码对。 不过，这还是一个不错的开始——感谢LAP！

# install.packages("gtools")
library(gtools)
library(vwr)

## Random search for a set of COLONY_SIZE colour codes that are as dissimilar
## as possible: draw many random candidate sets, score each one with three
## distance-based metrics, and keep the best-ranked set.

## Available colours (one letter per colour)
x <- c("W", "R", "G", "B", "P", "Y")

## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(
  permutations(n = length(x), r = 4, v = x, repeats.allowed = TRUE),
  stringsAsFactors = FALSE
)
colnames(body) <- c("Head", "Thorax", "L_gaster", "R_gaster")

## Concatenate each colour code into a single 4-letter string, e.g. "WRGB"
m <- paste0(body$Head, body$Thorax, body$L_gaster, body$R_gaster)


## Search settings
set.seed(1)
COLONY_SIZE <- 50    ## How many adult workers in the colony excluding the queen
N_Attempts  <- 1000  ## How many alternative solutions to generate - the more the better, but it takes longer

## Prepare data containers. Both are preallocated so that the loop does not
## grow a data frame row by row (which copies the whole object every time).
Summary <- data.frame(
  Mean_Distance         = numeric(N_Attempts),
  N_close_pairs         = numeric(N_Attempts),
  N_close_pairs_per_ant = numeric(N_Attempts)
)
LvList <- vector("list", N_Attempts)

for (TRY in seq_len(N_Attempts)) {
  print(paste(TRY, "of", N_Attempts))

  ## Randomly sample COLONY_SIZE codes
  y <- sample(m, COLONY_SIZE)

  ## Pairwise Levenshtein distances for all pairs of sampled codes
  Matrix <- sapply(y, function(code) levenshtein.distance(code, y))
  diag(Matrix) <- NA              ## eliminate self-self measure (distance = 0)

  ## Per-code number of 'close' partners (distance 1). This MUST be computed
  ## on the full symmetric matrix: once the lower triangle is blanked, row i
  ## only contains partners j > i and the count would be too low.
  close_per_ant <- rowSums(Matrix == 1, na.rm = TRUE)

  ## Keep each pair only once (dist i-j = dist j-i)
  Matrix[lower.tri(Matrix)] <- NA

  ## Store solution
  LvList[[TRY]] <- Matrix

  ## Summarize each solution using three metrics:
  ## (i)   the average pair distance (higher is better)
  ## (ii)  the number of 'close' code pairs (distance of 1 - lower is better)
  ## (iii) the maximum number of 'close' partners of any single code
  ##       (lower is better)
  Summary$Mean_Distance[TRY]         <- mean(Matrix, na.rm = TRUE)
  Summary$N_close_pairs[TRY]         <- sum(Matrix == 1, na.rm = TRUE)
  Summary$N_close_pairs_per_ant[TRY] <- max(close_per_ant)
}


## ***Find the solution with the fewest pairs with the lowest distance***
## Each metric is turned into a rank in which a larger rank is better
## (the two 'close pair' counts are negated because lower counts are better),
## and the attempt with the largest rank total is selected.

Summary$Mean_Distance_Rank         <- rank(Summary$Mean_Distance)
Summary$N_close_pairs_Rank         <- rank(-Summary$N_close_pairs)
Summary$N_close_pairs_per_ant_Rank <- rank(-Summary$N_close_pairs_per_ant)
Summary$Rank_Total <- Summary$Mean_Distance_Rank +
  Summary$N_close_pairs_Rank +
  Summary$N_close_pairs_per_ant_Rank

best_attempt <- which.max(Summary$Rank_Total)
solution <- rownames(LvList[[best_attempt]])

## Highlight the selected solution (red) among all attempts (transparent black)
Colour <- rep(rgb(0, 0, 0, alpha = 0.1), nrow(Summary))
Colour[best_attempt] <- "red"
pairs(Summary[, c("Mean_Distance", "N_close_pairs", "N_close_pairs_per_ant")],
      col = Colour, bg = Colour, pch = 21, cex = 1.4)


## Format into a table: one row per code, one column per body position
code_letters <- do.call(rbind, strsplit(solution, ""))
SOLUTION <- data.frame(Code = seq_len(COLONY_SIZE), code_letters,
                       row.names = solution, stringsAsFactors = FALSE)
colnames(SOLUTION)[2:5] <- c("Head", "Thorax", "L_gaster", "R_gaster")

Answer 3

这是一个更好的方法，它不依赖盲目的随机采样，而是将每对代码之间的相似性表示为网络中的一条边，然后使用igraph函数largest_ivs搜索彼此最不相似的一组代码：

## Build a similarity network between all colour codes and extract the largest
## independent vertex sets, i.e. the largest sets of codes in which no two
## codes share more than a chosen number of positions.

## NOTE(review): this clears the whole workspace and changes the working
## directory below; both are kept from the original script.
rm(list=ls())

library(gtools)
library(igraph)

## Output folder (placeholder - replace with a real path before running)
outputfolder <- "XXXXXXXXXX"
dir.create(outputfolder, showWarnings = FALSE)
setwd(outputfolder)

## Available colours
x <- c("W", "R", "G", "B", "P", "Y")

## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(
  permutations(n = length(x), r = 4, v = x, repeats.allowed = TRUE),
  stringsAsFactors = FALSE
)
colnames(body) <- c("Head", "Thorax", "L_gaster", "R_gaster")
write.table(body, file = "Paint_marks_full_list.txt",
            col.names = TRUE, row.names = FALSE, quote = FALSE, append = FALSE)

n_codes <- nrow(body)
codes   <- as.matrix(body)   ## character matrix, one row per code

## Generate edge list. Edges are appended to Edge_list.txt as they are found,
## so an interrupted run can be resumed from the last edge written.
edge_list <- data.frame(comb_1 = character(), comb_2 = character(),
                        similarity = character())
if (!file.exists("Edge_list.txt")) {
  ## First run: create the file with just the header line
  write.table(edge_list, file = "Edge_list.txt",
              col.names = TRUE, row.names = FALSE, quote = FALSE, append = FALSE)
} else {
  edge_list <- read.table("Edge_list.txt", header = TRUE,
                          stringsAsFactors = FALSE)
}

## Resume point. The defaults cover the case where no edge has been written
## yet (without them the comparisons below fail on a first run); with
## last_i = last_j = 1 every pair (i, j) with j > i is processed.
last_i <- 1
last_j <- 1
if (nrow(edge_list) > 0) {
  last_i <- edge_list[nrow(edge_list), "comb_1"]
  last_j <- edge_list[nrow(edge_list), "comb_2"]
}

## The edge list is complete once the last edge written is (n - 1, n)
if (!(last_i == (n_codes - 1) && last_j == n_codes)) {
  for (i in last_i:(n_codes - 1)) {
    print(paste("Combination", i))

    ## Partners still to compare with code i; on the resumed row, skip the
    ## partners that were already handled in a previous run
    js <- (i + 1):n_codes
    if (i == last_i) {
      js <- js[js > last_j]
    }
    if (length(js) == 0) {
      next
    }

    ## Number of positions at which code i and each partner carry the same
    ## colour. rep(..., each = ) lines code i up with the column-major layout
    ## of the partner sub-matrix.
    simil <- rowSums(codes[js, , drop = FALSE] == rep(codes[i, ], each = length(js)))
    hit   <- simil > 0

    ## Only pairs sharing at least one position become edges; all edges of
    ## code i are appended in one write, in increasing order of partner index
    if (any(hit)) {
      write.table(data.frame(comb_1 = i, comb_2 = js[hit], similarity = simil[hit]),
                  file = "Edge_list.txt",
                  col.names = FALSE, row.names = FALSE, quote = FALSE, append = TRUE)
    }
  }

  ## Reload so the in-memory edge list contains the edges just written;
  ## otherwise the graphs below would be built from stale (or empty) data
  edge_list <- read.table("Edge_list.txt", header = TRUE,
                          stringsAsFactors = FALSE)
}

######let's make 3 graphs with edges representing overlap between combinations ###
##First graph, in which ANY overlap between two combinations is seen as an edge. Will be used to produce list of paint combination with no overlap
net1 <- graph_from_data_frame(edge_list[c("comb_1", "comb_2")], directed = FALSE)

##Second graph, in which only overlaps of 2 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 1 spot in common
net2 <- graph_from_data_frame(edge_list[which(edge_list$similarity >= 2), c("comb_1", "comb_2")], directed = FALSE)

##Third graph, in which only overlaps of 3 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 2 spots in common
net3 <- graph_from_data_frame(edge_list[which(edge_list$similarity >= 3), c("comb_1", "comb_2")], directed = FALSE)


#######Now let's use the ivs function to get independent vertex sets, i.e., set of vertices with no connections between any of them
## NOTE(review): this is an exact search and may take a long time on graphs
## of this size - confirm the runtime is acceptable before relying on it.
no_overlap_list <- largest_ivs(net1)
max_one_spot_overlap_list <- largest_ivs(net2)
max_two_spots_overlap_list <- largest_ivs(net3)

如何在R中生成N个最不相似的组合

问题描述

可重现的示例：

3 个解决方案

解决方案1
1 2018-05-17 11:20:22

解决方案2
1 2018-05-17 16:46:33

解决方案3
0 已采纳 2018-05-20 19:53:04

如何在R中生成N个最不相似的组合

问题描述

可重现的示例：

3 个解决方案

解决方案1 1 2018-05-17 11:20:22

解决方案2 1 2018-05-17 16:46:33

解决方案3 0 已采纳 2018-05-20 19:53:04

解决方案1
1 2018-05-17 11:20:22

解决方案2
1 2018-05-17 16:46:33

解决方案3
0 已采纳 2018-05-20 19:53:04