I have a dataset with a number of yes/no columns indicating whether a particular record is associated with a given store number in three different regions (i.e., a record will have a single yes value for one of the two stores in each of the three regions, where the variable names are formatted as 'region'_'storenumber'):
var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
1 Yes No No Yes Yes No
2 No Yes Yes No No Yes
3 No Yes Yes No No Yes
4 No Yes No Yes Yes No
5 No Yes No Yes No Yes
6 Yes No No Yes No Yes
7 Yes No Yes No Yes No
8 No Yes Yes No No Yes
I'd like to create a variable for each region called 'region_1', 'region_2', and 'region_3' that are equal to the "Yes" store number in that region for that record (1 or 2):
region_1 region_2 region_3
1 1 2 1
2 2 1 2
3 2 1 2
4 2 2 1
5 2 2 2
6 1 2 2
7 1 1 1
8 2 1 2
I am able to create each region variable separately using for loops as follows:
# Start every region column as an empty string, meaning "store not found yet".
for (i in 1:3) {
  df[[paste0("region_", i)]] <- ""
}
# Region 1: record the first store number whose column says "Yes".
# `[[` extracts a column as a plain vector, so ifelse() returns a vector;
# single-bracket `df[...]` would hand it a one-column data frame instead.
for (i in 1:2) {
  df$region_1 <- ifelse(
    df[[paste0("var1_", i)]] == "Yes" & df$region_1 == "",
    toString(i),
    df$region_1
  )
}
# Region 2: same rule, applied to the var2_* columns of the same data frame.
for (i in 1:2) {
  df$region_2 <- ifelse(
    df[[paste0("var2_", i)]] == "Yes" & df$region_2 == "",
    toString(i),
    df$region_2
  )
}
# Region 3: same rule, applied to the var3_* columns of the same data frame.
for (i in 1:2) {
  df$region_3 <- ifelse(
    df[[paste0("var3_", i)]] == "Yes" & df$region_3 == "",
    toString(i),
    df$region_3
  )
}
My actual data has many more than 3 regions (and more than 2 stores per region), so rather than writing a separate loop for each region, I'd like to nest this loop to loop over all regions. I've attempted the following:
# Attempted generalisation: outer loop over regions (j), inner loop over
# store numbers (i), filling the column "region_<j>".
for(j in 1:3) {
for(i in 1:2) {
# NOTE(review): single-bracket indexing, df[name], returns a one-column data
# frame rather than a vector. The `no` argument of ifelse() is therefore a
# list, which is the likely source of the "provided ... variables to replace
# 1 variables" warning. Extracting with df[[name]] yields a plain vector.
df[paste("region_", toString(j), sep = "")] <- ifelse(df[paste("var", toString(j), "_", toString(i), sep = "")] == "Yes" & df[paste("region_", toString(j), sep = "")] == "", toString(i), df[paste("region_", toString(j), sep = "")])
}
}
but receive the warning "provided #### variables to replace 1 variables" and end up with each region variable populated with the same single value for every record.
Any thoughts on where I'm going wrong with my nested loop?
If we convert Yes and No to logical values, we can simply do
# For each region, find the store number whose logical column is TRUE.
# Columns are grouped by the region part of "var<region>_<store>", so this
# works for any number of regions and any number of stores per region.
region_ids <- sub("^var([^_]*)_.*$", "\\1", names(stores))
store_ids <- as.integer(sub("^[^_]*_", "", names(stores)))
region_levels <- unique(region_ids)  # keep regions in column order
regions <- as.data.frame(lapply(
  split(seq_along(stores), factor(region_ids, levels = region_levels)),
  function(cols) {
    flags <- data.matrix(stores[cols])  # TRUE/FALSE -> 1/0
    # Position of the first TRUE column in each row.
    hit <- max.col(flags, ties.method = "first")
    # A record with no TRUE store in this region gets NA, not store 1.
    hit[rowSums(flags) == 0] <- NA
    store_ids[cols][hit]
  }
))
names(regions) <- paste0("region_", region_levels)
regions
# region_1 region_2 region_3
#1 1 2 1
#2 2 1 2
#3 2 1 2
#4 2 2 1
#5 2 2 2
#6 1 2 2
#7 1 1 1
#8 2 1 2
The data:
library(data.table)
# Build the example data as a plain data.frame of logical columns.
# The Yes/No text is rewritten to TRUE/FALSE before parsing so that fread()
# reads every column as logical.
stores <- local({
  raw_text <- "var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
Yes No No Yes Yes No
No Yes Yes No No Yes
No Yes Yes No No Yes
No Yes No Yes Yes No
No Yes No Yes No Yes
Yes No No Yes No Yes
Yes No Yes No Yes No
No Yes Yes No No Yes"
  logical_text <- gsub("Yes", "TRUE", raw_text)
  logical_text <- gsub("No", "FALSE", logical_text)
  setDF(fread(logical_text))
})
For this, you may be better served converting your data into a "long" format, instead of the current "wide" format. Here are examples using dplyr and tidyr. I have tried to comment each line, but the basic idea is to generate one row per store-variable measure, and just have the presence/absence show. Then, you can group the rows by region, and count up the number of "Yes" entries.
# Data entry from @dww, without conversion to logical (though that would make it easier)
library(data.table)
# Read the example table, keeping the Yes/No values as text, and ask fread()
# for a plain data.frame directly instead of converting afterwards.
stores <- fread(
  "var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
Yes No No Yes Yes No
No Yes Yes No No Yes
No Yes Yes No No Yes
No Yes No Yes Yes No
No Yes No Yes No Yes
Yes No No Yes No Yes
Yes No Yes No Yes No
No Yes Yes No No Yes",
  data.table = FALSE
)
Change to long format, store as new variable
# Reshape the wide Yes/No table to one row per record and original column.
longStores <-
  stores %>%
  # Convert to a tibble for compact printing (replaces deprecated tbl_df())
  as_tibble() %>%
  # Store the record number as a column instead of relying on row names
  mutate(variableInterest = rownames(.)) %>%
  # Go to long format and split each column name at the underscore in one step
  # NOTE(review): the question describes names as 'region'_'storenumber', so
  # the first piece looks like the region and the second the store; the
  # Store/Region labels are kept as written here so downstream code still runs.
  pivot_longer(
    -variableInterest,
    names_to = c("Store", "Region"),
    names_sep = "_",
    values_to = "present"
  ) %>%
  # Eliminate the leading "var" from the first piece, just for display
  mutate(Store = gsub("var", "", Store))
Summarise by region, still in long format
# Count the "Yes" entries per record and Region value, still in long format.
longRegional <-
  longStores %>%
  # Set grouping
  group_by(variableInterest, Region) %>%
  # Count the number of "Yes" values within each group
  summarise(nStoresWithVariable = sum(present == "Yes")) %>%
  # Drop the leftover grouping so later steps see an ordinary tibble
  ungroup()
Finally, reformat to your original request
# Back to wide format: one row per record, one count column per Region value.
# pivot_wider() is the current replacement for the superseded spread().
longRegional %>%
  pivot_wider(names_from = Region, values_from = nStoresWithVariable)
# variableInterest `1` `2`
# * <chr> <int> <int>
# 1 1 2 1
# 2 2 1 2
# 3 3 1 2
# 4 4 1 2
# 5 5 0 3
# 6 6 1 2
# 7 7 3 0
# 8 8 1 2
Here is my messy attempt with a simulation of your data as True and False:
Simulated Data - Generate data table using random Booleans
# Draw `size` random logical values, TRUE and FALSE being equally likely.
# `size` defaults to 10, so a bare rb() call behaves as before.
rb <- function(size = 10) {
  sample(c(TRUE, FALSE), size = size, replace = TRUE, prob = c(0.5, 0.5))
}

# One random indicator per region for store 1; store 2 is its complement, so
# every record has exactly one TRUE store in each region.
var1_1 <- rb()
var2_1 <- rb()
var3_1 <- rb()
df <- data.frame(
  var1_1 = var1_1, var1_2 = !var1_1,
  var2_1 = var2_1, var2_2 = !var2_1,
  var3_1 = var3_1, var3_2 = !var3_1
)
df
var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
1 FALSE TRUE TRUE FALSE FALSE TRUE
2 FALSE TRUE FALSE TRUE FALSE TRUE
3 FALSE TRUE TRUE FALSE TRUE FALSE
4 FALSE TRUE TRUE FALSE FALSE TRUE
5 FALSE TRUE FALSE TRUE TRUE FALSE
6 FALSE TRUE FALSE TRUE TRUE FALSE
7 TRUE FALSE TRUE FALSE TRUE FALSE
8 TRUE FALSE FALSE TRUE TRUE FALSE
9 TRUE FALSE FALSE TRUE TRUE FALSE
10 FALSE TRUE FALSE TRUE TRUE FALSE
Solution
# Collapse each region's TRUE/FALSE store columns into a single column holding
# the store number of the TRUE entry. Assumes exactly one TRUE per region and
# record, as in the simulated data.
cn <- names(df)
cnprefixes <- sub("_.*$", "", cn)     # text before the first underscore
cnsuffixes <- sub("^[^_]*_", "", cn)  # text after the first underscore
region_levels <- unique(cnprefixes)   # keep regions in column order
newblock <- data.frame(
  lapply(
    # Group columns by exact prefix so "var1" never picks up "var10_*".
    split(seq_along(cn), factor(cnprefixes, levels = region_levels)),
    function(cols) {
      # Weight each flag by its store number and sum across the row; with one
      # TRUE per row the sum is that store's number.
      as.vector(data.matrix(df[cols]) %*% as.numeric(cnsuffixes[cols]))
    }
  ),
  row.names = row.names(df),
  check.names = FALSE
)
newblock
var1 var2 var3
1 2 1 2
2 2 2 2
3 2 1 1
4 2 1 2
5 2 2 1
6 2 2 1
7 1 1 1
8 1 2 1
9 1 2 1
10 2 2 1
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.