简体   繁体   中英

How to generate the N most dissimilar combinations in R

I have a set of 6 colour codes (x), a set of N individuals that each need to be labelled with a unique colour code, and four locations on each animal, each of which can carry a different colour. I have 6 different colors.

So, the codes for two individual might be;
1. red, blue, blue,white
2. white,yellow,pink,yellow

However, as the colour at each position can fall off, I would like to generate a redundant labelling scheme, that would allow still allow an individual to be distinguished from others, even after it loses the colour at one (or even two?) locations.

Even though 6 colours and 4 positions gives 1296 combinations, I am finding it difficult to select the N most dissimilar combinations:

Reproducible example:

library(gtools)
x     <- c("white", "red", "green", "blue", "pink", "yellow")
Perms <- permutations(n=6,r=4,v=x,repeats.allowed=T)
print(nrow(Perms))
head(Perms)

Note that the first 6 combinations differ in the colour at only 1 position - loss of this code by >1 individual would mean that they can no longer be distinguished!

So, for values of N between 50-150, how to select the N most dissimilar combinations ?

Thanks !

I cannot conclusively answer your question, but I have an idea that might help you.

Build string codes with the first letter of each color:

library(gtools)
x     <- c("w", "r", "g", "b", "p", "y")
Perms <- permutations(n=6,r=4,v=x,repeats.allowed=T)
m <- apply(Perms, 1, paste, collapse = "")

> head(m)
[1] "bbbb" "bbbg" "bbbp" "bbbr" "bbbw" "bbby"

Sample n codes:

set.seed(1)
n <- 50
y <- sample(m, n)

Create an*n matrix of Levenshtein distances :

library(vwr)
lvmat <- sapply(y, function(x) levenshtein.distance(x, y))

> lvmat[1:5, 1:5]
     grrp pgpg rprr yprw gggp
grrp    0    4    3    3    2
pgpg    4    0    4    4    3
rprr    3    4    0    2    4
yprw    3    4    2    0    4
gggp    2    3    4    4    0

Now you could maximize sum(lvmat) , maybe via bootstrapping or whatever floats your boat, to get the sample of most dissimilar combinations.

Reproducible example of LAPs suggestion above. Note, due to the reliance upon random sampling, this still does not guarantee that there will be no code pairs that differ at only one position. Still, it's a good start -thanks LAP!

# install.packages("gtools")
library(gtools)
library(vwr)

## Available colours
x <- c("W", "R", "G", "B", "P", "Y")

## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(permutations(n=6,r=4,v=x,repeats.allowed=T), stringsAsFactors = F) ; colnames(body) <- c("Head","Thorax","L_gaster","R_gaster")

## concatenate each colour-code to a sequence without spaces, etc
m    <- paste( body$Head, body$Thorax, body$L_gaster, body$R_gaster, sep="")


## 
set.seed(1)
COLONY_SIZE <- 50    ## How many adult workers in the colony excluding the queen
N_Attempts  <- 1000  ## How many alternative solutions to generate - the more the better, but it takes longer

## prepare data-containers
Summary <- NULL
LvList <- list()

for (TRY in 1:N_Attempts)
{print(paste(TRY,"of",N_Attempts))
  y <- sample(m, COLONY_SIZE)     ## randomly sample COLONY_SIZE codes
  ## measure pairwise Levenshtein distances for all pair combinations
  Matrix <- sapply(y, function(x) levenshtein.distance(x, y))
  diag(Matrix) <- NA              ## eliminate self-self measure (distance = 0)
  Matrix[lower.tri(Matrix)] <- NA ## dist i-j = dist j-i
  ## store solution
  LvList[[TRY]] <- Matrix         
  ## summarize each solution using three metrics:
  ## (i) the average pair distance (higher is better)
  ## (ii) the number of 'close' code pairs (those with the minimum distance of 1 - lower is better)
  ## (iii) the maximum number of 'close' code *pairs across all codes (lower is better)
  Summary <- rbind(Summary, data.frame(Mean_Distance          = mean(Matrix, na.rm=T),
                                       N_close_pairs         = sum(Matrix[!is.na(Matrix)]==1),
                                       N_close_pairs_per_ant = max(rowSums( Matrix==1, na.rm=T)) ))
}


## ***Find the solution with the fewest pairs wiRth the lowest distance***

Summary$Mean_Distance_Rank          <- rank(Summary$Mean_Distance)
Summary$N_close_pairs_Rank         <- rank(-Summary$N_close_pairs)
Summary$N_close_pairs_per_ant_Rank <- rank(-Summary$N_close_pairs_per_ant)
Summary$Rank_Total <- Summary$Mean_Distance_Rank + Summary$N_close_pairs_Rank + Summary$N_close_pairs_per_ant_Rank

solution <- rownames( LvList[[which.max(Summary$Rank_Total)]] )

## Highlight candidate solutions
Colour <- rep(rgb(0,0,0,0.1,1),nrow(Summary) )
Colour [which.max(Summary$Rank_Total) ] <- "red"
pairs(Summary[,c("Mean_Distance","N_close_pairs","N_close_pairs_per_ant")], col=Colour, bg=Colour, pch=21, cex=1.4) 


## format into a table
SOLUTION <- data.frame(Code=1:COLONY_SIZE, t(as.data.frame(sapply(solution, strsplit, "")))) 
colnames(SOLUTION)[2:5] <-  c("Head","Thorax","L_gaster","R_gaster")

Here's a better approach that does not rely upon blind sampling, but instead represents the similarity between each code pair as an edge in a network, and then uses the igraph function largest_ivs to searche for the most dissimilar code pairs:

rm(list=ls())

library(gtools)
library(igraph)

##
outputfolder <- "XXXXXXXXXX"
dir.create(outputfolder,showWarnings = F)
setwd(outputfolder)

## Available colours
x <- c("W", "R", "G", "B", "P", "Y")

## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(permutations(n=6,r=4,v=x,repeats.allowed=T), stringsAsFactors = F) ; colnames(body) <- c("Head","Thorax","L_gaster","R_gaster")
write.table(body,file="Paint_marks_full_list.txt",col.names=T,row.names=F,quote=F,append=F)

## Generate edge list
edge_list <- data.frame(comb_1=character(),comb_2=character(),similarity=character())
if (!file.exists("Edge_list.txt")){
  write.table(edge_list,file="Edge_list.txt",col.names=T,row.names=F,quote=F,append=F)
}else{
  edge_list <- read.table("Edge_list.txt",header=T,stringsAsFactors = F)
}
if (nrow(edge_list)>0){
  last_i <- edge_list[nrow(edge_list),"comb_1"]
  last_j <- edge_list[nrow(edge_list),"comb_2"]
}

if (!(last_i==(nrow(body)-1)&last_j==nrow(body))){
  for (i in last_i:(nrow(body)-1)){
    print(paste("Combination",i))
    for (j in (i+1):nrow(body)){
      if (i>last_i|j>last_j){
        simil <- length(which(body[i,]==body[j,]))
        if (simil>0){
          write.table(data.frame(comb_1=i,comb_2=j,similarity=simil),file="Edge_list.txt",col.names=F,row.names=F,quote=F,append=T)
        }

      }
    }
  }

}

######let's make 3 graphs with edges representing overlap between combinations ###
##First graph, in which ANY overlap between two combinations is seen as an edge. Will be used to produce list of paint combination with no overlap
net1 <- graph.data.frame(edge_list[c("comb_1","comb_2")],directed=F)

##Second graph, in which only overlaps of 2 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 1 spot in common
net2 <- graph.data.frame(edge_list[which(edge_list$similarity>=2),c("comb_1","comb_2")],directed=F)

##Third graph, in which only overlaps of 3 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 2 spots in common
net3 <- graph.data.frame(edge_list[which(edge_list$similarity>=3),c("comb_1","comb_2")],directed=F)


#######Now let's use the ivs function to get independent vertex sets, i.e., set of vertices with no connections between any of them
no_overlap_list <- largest_ivs(net1)
max_one_spot_overlap_list <- largest_ivs(net2)
max_two_spots_overlap_list <- largest_ivs(net3)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM