简体   繁体   English

将每个数据帧排列在数据帧列表中

[英]Arrange each data frame in a list of data frames

I have a list from of data frames ( dfA and dfB ) with different number of rows: 我有一个列表from数据帧(的dfAdfB具有不同的行数):

# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")

# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")

# Create data frames and check their structures    
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)

# Create a list of data frames
from <- list(dfA, dfB)
from

# Check its type
is.list(from)

# Read each elements of the list one by one
from[[1]] 
from[[2]]

# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB

But how to do this to each data frames in the list all at a time? 但是如何一次对列表中的每个数据帧执行此操作? If I understand correctly your code would return a list of rearranged data frames (in a "wide" format). 如果我理解正确,您的代码将返回重新排列的数据帧列表(以“宽”格式)。 Then I need to convert the list to a new data frame. 然后我需要将列表转换为新的数据框。

(Another issue is that all data frames in the list have similarly named variables (id ID, Var1, Var2, Var3... for each df in the list). Here I can't reproduce this issue.) (另一个问题是列表中的所有数据帧都有类似命名的变量(列表中每个df的id ID,Var1,Var2,Var3 ...)。这里我不能重现这个问题。)

Thank you. 谢谢。

My code is: 我的代码是:

genSeq <-  c('https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt')

# Read raw data as character vector
a <- readLines(genSeq)

# Some diagnostics
# is.vector(a)
# typeof(a)
# length(a)

# Convert vector a to data frame b
b <- as.data.frame(a, stringsAsFactors = FALSE)
# is.data.frame(b)
# typeof(a)
# length(a)

# Install some packages
  install.packages("stringr")
  install.packages("stringi")
  install.packages("xlsx")

# Load the packages
library(stringr)
library(stringi)
library(xlsx)

# Read the lines with nucleotide sequences
bb <- b[c(9:19762),]

# Some diagnostics
# head(bb)
# tail(bb)
# length(bb)
# typeof(bb)
# is.vector(bb)

# Split lines
d <-  strsplit(bb, split = "")

# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)

# Count number of variables ( http://stackoverflow.com/a/15201478/1009306 )
max.length <- max(sapply(d, length))

# Add NA values to list elements when the lists are shorter than others
d <- lapply(d, function(x) {c(x, rep(NA, max.length-length(x)))})

# Combine all elements
do.call(rbind, d)

# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)


# Transform matrix
dd <- t(matrix(unlist(d),ncol=length(d)))

# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)

# Transform existing dd matrix into ddd data frame
ddd <- as.data.frame(dd)

# Some diagnostics
# head(ddd)
# tail(ddd)
# is.data.frame(ddd)
# typeof(ddd)
# length(ddd)
# class(ddd)
# str(ddd)
# names(ddd)
# nrow(ddd)
# ncol(ddd)
# summary(ddd)

# Add new variable allel by concatenating values in existing variables V1...v19
ddd <- transform(ddd, allel = paste0(ddd$V1, ddd$V2, ddd$V3, ddd$V4, ddd$V5, ddd$V6, ddd$V7, ddd$V8, ddd$V9, ddd$V10, ddd$V11, ddd$V12, ddd$V13, ddd$V14, ddd$V15, ddd$V16, ddd$V17, ddd$V18, ddd$V19, sep = " "))

# Some diagnostics
# names(ddd)

# Reorder variable allel to be the first
new_ordered <- ddd[c(length(ddd),c(1:(length(ddd)-1)))]

# Some diagnostics
# names(new_ordered)
# ncol(new_ordered)

# Remove unnecessary variables V1...V19
new_ordered$V1 <- NULL
new_ordered$V2 <- NULL
new_ordered$V3 <- NULL
new_ordered$V4 <- NULL
new_ordered$V5 <- NULL
new_ordered$V6 <- NULL
new_ordered$V7 <- NULL
new_ordered$V8 <- NULL
new_ordered$V9 <- NULL
new_ordered$V10 <- NULL
new_ordered$V11 <- NULL
new_ordered$V12 <- NULL
new_ordered$V13 <- NULL
new_ordered$V14 <- NULL
new_ordered$V15 <- NULL
new_ordered$V16 <- NULL
new_ordered$V17 <- NULL
new_ordered$V18 <- NULL
new_ordered$V19 <- NULL

# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)

# Remove rows containing NA ( http://stackoverflow.com/q/8005154/1009306 )
new_ordered <- subset(new_ordered, !(V50 == "NA" & V100 == "NA"))

# Some diagnostics
# head(new_ordered)
# ncol(new_ordered)
# nrow(new_ordered)


# Shrink whitespaces in allel names with the help of library(stringr)'s function:
new_ordered$allel <- gsub(" ", "", new_ordered$allel)




# The list of unique allels accordingly to LL*NN:NN(NL) template
#####

# Sort new_ordered data frame in an ascending order by allel variable
new_odrd_srtd <- new_ordered[order(new_ordered$allel),]

# Some diagnostics
# head(new_odrd_srtd)
# typeof(new_odrd_srtd)
# is.data.frame(new_odrd_srtd)

# The list of unique allel names
unique.allels <- unique(new_odrd_srtd$allel)

# Let the list to be a character vector
unique.allels <- as.character(unique.allels)

# Show them:
# unique.allels

# Their number is:
# length(unique.allels)

# Export them into MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)

# Extract the part of an allel name considering specific HLA protein only: LL*NN:NN(NL).
# The final point for the pattern of interest is cleared at http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
specific.HLA.protein <- unique(gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", unique.allels))

# Show them:
# specific.HLA.protein

# Their number is:
# length(specific.HLA.protein)

# Export  them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)













##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for http://stackoverflow.com/q/42711357
#####

# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")

# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")

# Create data frames and check their structures    
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)

# Create a list of data frames
from <- list(dfA, dfB)
from

# Check its type
is.list(from)

# Read each elements of the list one by one
from[[1]] 
from[[2]]

# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB


l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))

new_df <- setNames(data.frame(do.call(rbind, l2)), c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df


# Some diagnostics
diagnostic <- new_df
head(diagnostic)
tail(diagnostic)
is.data.frame(diagnostic)
typeof(diagnostic)
length(diagnostic)
class(diagnostic)
str(diagnostic)
names(diagnostic)
nrow(diagnostic)
ncol(diagnostic)
summary(diagnostic)


##################################################################################
# End of Example

# Select strings only for A*01:01:01:01 allel
new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", allel) )
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)

unique(new_odrd_srtd_sbst$allel)




# Add new vaiable allelGroup_specific.HLA.protein by copying values in existing variable allel
new_odrd_srtd_sbst <- transform(new_odrd_srtd_sbst, allelGroup_specific.HLA.protein = paste0(new_odrd_srtd_sbst$allel))

# Reorder variables
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[c(length(new_odrd_srtd_sbst), c(1:(length(new_odrd_srtd_sbst)-1)))]

# Extract the part of an allel name considering specific HLA protein only: A*NN:NN(NL).
# The final point for the pattern of interest is cleared here: http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein <- gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein)

# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)


# Split dataframe into a list of data frames based on a value in allel variable
# http://stackoverflow.com/q/18527051
ndf <- split(new_odrd_srtd_sbst_added_ordrd, new_odrd_srtd_sbst_added_ordrd$allel)
ndf[[1]][1:36,1:25]

# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)

# From this step I fail to step further...

Here is one possibility, 这是一种可能性,

l2 <- lapply(from, function(i) as.vector(c(as.character(i[1,1]), t(c(t(i[-1]))))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))

new_df <- setNames(data.frame(do.call(rbind, l2)), 
                     c('ID', paste0('Var', seq(max(lengths(l2))-1))))

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  a    1    2    3    4         6    .    8    9
#2  b   11   22   33   44        66 <NA> <NA> <NA>

You could of course avoid the concatenation with i[1,1] which is not in your requirements but rather an addition of mine I thought It could apply here. 你当然可以避免与i[1,1]的连接,这不是你的要求,而是我的想法它可以在这里应用。 So, by avoiding this and keeping your original transpose function, you get 所以,通过避免这种情况并保持原有的转置功能,你得到了

l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))

new_df <- setNames(data.frame(do.call(rbind, l2)), 
                    c('ID', paste0('Var', seq(max(lengths(l2))-1))))

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
#1  a    1    2    3    a    4         6    a    .     8     9
#2  b   11   22   33    b   44        66 <NA> <NA>  <NA>  <NA>

Try this in three steps. 试试这三个步骤。

First create your data frame without the IDs, 首先创建没有ID的数据框,

l3 <- lapply(from, function(i) t(c(t(i[-1]))))
l3 <- lapply(l3, `length<-`, max(lengths(l3)))

 new_df1 <- setNames(data.frame(do.call(rbind, l3)), 
                     paste0('Var', seq(max(lengths(l3)))))

new_df1
#  Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  1    2    3    4         6    .    8    9
#2 11   22   33   44        66 <NA> <NA> <NA>

Extract all the unique IDs, 提取所有唯一ID,

i1 <- sapply(from, function(i) unique(as.character(i[[1]])))
i1
#[1] "a" "b"

Bind them together, 将它们捆绑在一起,

final_df1 <- cbind(IDs = i1, new_df1)

final_df1
#  IDs Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1   a  1    2    3    4         6    .    8    9
#2   b 11   22   33   44        66 <NA> <NA> <NA>

Following your example: 按照你的例子:

library(data.table)
# Create a list of data frames
from <- list(dfA, dfB)
from
[[1]]
  IDA Var1 Var2 Var3
1   a    1    2    3
2   a    4         6
3   a    .    8    9

[[2]]
  IDB Var4 Var5 Var6
1   b   11   22   33
2   b   44        

# rbind all the elements in the list of data.tables
    out <- lapply(from, function(x){as.data.table(t(c(t(x))))} )
    out <- rbindlist(out, fill =  TRUE)
    out
       V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
    1:  a  1  2  3  a  4     6  a   .   8   9
    2:  b 11 22 33  b 44    66 NA  NA  NA  NA

# If the files are stored on your drive, you can call them by bulk, and then `rbindlist`:

    files <- list.files(pattern = ".csv")
    files <- lapply(files, fread)

I feel like you can just use lapply to iterate over all the data.frame in the list to do what you are already doing on each individual data.frame . 我觉得你可以使用lapply迭代list中的所有data.frame来完成你在每个data.frame上已经做过的事情。 Just make sure you subset each vector in such a way that the number of columns in the output is equal to the number of elements in the data.frame with the maximum number of elements. 只需确保以这样的方式对每个向量进行子集化,即输出中的列数等于data.frame具有最大元素数的元素数。 This maximum number ( max_length in this example) can be obtained by unlisting each data.frame , obtaining the number of elements using lengths , and then using max to get the number of maximum elements. 通过取消列出每个data.frame ,获取使用lengths的元素数量,然后使用max获取最大元素数,可以获得此最大数量(在此示例中为max_length )。

max_length = max(lengths(lapply(from, unlist)))
do.call(rbind, lapply(from, function(df)
    t(c(t(df)))[1:max_length]))
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "a"  "4"  " "  "6"  "a"  "."   "8"   "9"  
#[2,] "b"  "11" "22" "33" "b"  "44" " "  "66" NA   NA    NA    NA

UPDATE UPDATE

do.call(rbind, lapply(from, function(df)
     c(as.character(df[1,1]), t(c(t(df[,-1]))))[1:max_length]))
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "4"  " "  "6"  "."  "8"  "9"   NA    NA   
#[2,] "b"  "11" "22" "33" "44" " "  "66" NA   NA   NA    NA    NA   

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM