简体   繁体   中英

How to reshape data based on contents of dataframe R

I have an R data frame consisting of a single column, and lots of rows. Within this column are a number of individuals and their responses. I would like to reshape this data, with one row for each individual. However there is no ID variable, and the only pattern is that the last score for each individual is numeric. Hence you can deduce that what follows a number should be a new row.

Existing data format:

alpha
bravo
charlie
5
alpha
charlie
2
delta
1

# Example input: a single column in which responses and scores are all text.
dd <- data.frame(
  xx = c(
    "alpha", "bravo", "charlie", "5",
    "alpha", "charlie", "2",
    "delta", "1"
  )
)

I would like this data to be rearranged into one of the following forms, in order of most desirable to least desirable:

alpha   bravo   charlie          5    # Best
alpha           charlie          2
                          delta  1

or

alpha   bravo   charlie  5
alpha   charlie          2
delta                    1

or

alpha   bravo   charlie 5    # Worst but acceptable if above is not possible.
alpha   charlie 2
delta   1 

Here are some options and formats:

# Raw input: one entry per element. An entry made up solely of digits is an
# individual's score and closes that individual's record.
# Written as a literal vector (rather than readLines() on stdin followed by
# raw data lines) so the example also runs under source(), not only when it
# is pasted into an interactive console.
txt <- c(
  "alpha", "bravo", "charlie", "5",
  "alpha", "charlie", "2",
  "delta", "1"
)
# TRUE where the entry is a score (digits only).
idx <- grepl("^\\d+$", txt)
# Shift the score flags down by one position so the running sum increases on
# the entry *after* each score; this yields a 0-based id per individual.
group <- cumsum(head(c(FALSE, idx), -1))
# One character vector per individual, score last.
unname(split(txt, group))
# [[1]]
# [1] "alpha"   "bravo"   "charlie" "5"
#
# [[2]]
# [1] "alpha"   "charlie" "2"
#
# [[3]]
# [1] "delta" "1"

# Responses only (scores dropped), grouped per individual.
lst <- split(txt[!idx], group[!idx])
# Distinct responses in order of first appearance; these become the columns.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, and the arguments are named so their meaning is explicit.
cols <- unique(unlist(lst, recursive = FALSE, use.names = FALSE))
# One logical row per individual: TRUE where that individual gave the
# response named by the column. The scores are appended as integer `val`.
df <- cbind(
  setNames(
    do.call(
      rbind.data.frame,
      lapply(lst, function(x) cols %in% x)
    ),
    cols
  ),
  val = as.integer(txt[idx])
)
#   alpha bravo charlie delta val
# 0  TRUE  TRUE    TRUE FALSE   5
# 1  TRUE FALSE    TRUE FALSE   2
# 2 FALSE FALSE   FALSE  TRUE   1


# Character layout: every distinct response gets its own column, left blank
# ("") for individuals who did not give it; the integer score comes last.
# Column names are dropped so only the values are printed.
unname(cbind.data.frame(
  do.call(
    rbind,
    lapply(lst, function(x) ifelse(cols %in% x, cols, ""))
  ),
  as.integer(txt[idx])
))
# 0 alpha bravo charlie       5
# 1 alpha       charlie       2
# 2                     delta 1

This gives you the second best option after creating an id and counter variable

library(reshape2)

row <- c(
  "alpha", "bravo", "charlie", "5",
  "alpha", "charlie", "2",
  "delta", "1"
)

# A score is an entry consisting solely of digits. The pattern is anchored so
# that a response which merely contains a digit is not mistaken for a score.
is_score <- grepl("^[0-9]+$", row)

# id: 1-based number of the individual; it advances on the entry that
# follows each score.
id <- cumsum(c(FALSE, head(is_score, -1))) + 1

# counter: position of the entry within its individual, restarting at 1.
counter <- ave(seq_along(row), id, FUN = seq_along)

df <- data.frame(row, id, counter)

# Spread to one row per id; the entries land in columns row.1, row.2, ...
# and shorter records are padded with NA on the right.
reshape(df, idvar = "id", timevar = "counter", direction = "wide")

The ideal one requires more information so that people know how the cells should be allocated. The second best can be achieved by the following.

x <- c(
  "alpha",
  "bravo",
  "charlie",
  "5",
  "alpha",
  "charlie",
  "2",
  "delta",
  "1"
)

# Lay out a flat response vector as one row per individual.
#
# x: character vector in which an entry made up solely of digits is a score
#    that ends the current individual's record.
# Returns a data frame with one row per individual: the responses are
# left-aligned, followed by "" padding, with the score in the last column.
# Entries after the final score are ignored. Returns an empty data frame
# when `x` contains no score.
pad_rows <- function(x) {
  rowend <- grep("^[0-9]+$", x)
  n <- length(rowend) # number of individuals
  if (n == 0L) {
    return(data.frame())
  }
  rowbegin <- c(1L, head(rowend, -1L) + 1L)
  m <- max(rowend - rowbegin) + 1L # number of columns

  y <- Map(
    function(i, j) {
      # seq_len() yields an empty index when a record holds only a score
      # (i == j); `i:(j - 1)` would count backwards and pick wrong entries.
      responses <- x[seq_len(j - i) + i - 1L]
      c(responses, rep("", m - (j - i + 1L)), x[j])
    },
    rowbegin, rowend
  )
  as.data.frame(matrix(unlist(y), nrow = n, ncol = m, byrow = TRUE))
}

pad_rows(x)

A crack at the ideal layout. As intimated in the comments, R is probably not the best tool for the job here, but ...

x <- c(
  "alpha", "bravo", "charlie", "5",
  "alpha", "charlie", "2",
  "delta", "1"
)

# TRUE where the entry is a score. The pattern is anchored so that a response
# which merely contains a digit is not treated as a score.
num <- grepl("^[0-9]+$", x)

# Group id increases on the entry after each score (0-based).
l <- split(x, c(0, cumsum(num)[-length(num)]))
# last element always numeric
vals <- vapply(l, function(g) as.numeric(g[length(g)]), numeric(1))
nams <- lapply(l, function(g) g[-length(g)])
# unique names for structure
unique_nams <- unique(unlist(nams))

# Start with every response present in every row ...
full <- matrix(
  unique_nams,
  nrow = length(nams),
  ncol = length(unique_nams),
  byrow = TRUE
)

# ... then blank out, row by row, the responses that individual did not give.
# A plain loop replaces sapply() with `<<-`, which was used only for its side
# effect and printed a meaningless return value.
for (i in seq_along(nams)) {
  full[i, !unique_nams %in% nams[[i]]] <- NA
}

answer <- cbind(as.data.frame(full), vals)

##    V1    V2      V3    V4 vals
## 0 alpha bravo charlie  <NA>    5
## 1 alpha  <NA> charlie  <NA>    2
## 2  <NA>  <NA>    <NA> delta    1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM