[英]Arrange each data frame in a list of data frames
I have a list from
of data frames ( dfA
and dfB
) with different number of rows: 我有一个列表
from
数据帧(的dfA
和dfB
具有不同的行数):
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
But how to do this to each data frames in the list all at a time? 但是如何一次对列表中的每个数据帧执行此操作? If I understand correctly your code would return a list of rearranged data frames (in a "wide" format).
如果我理解正确,您的代码将返回重新排列的数据帧列表(以“宽”格式)。 Then I need to convert the list to a new data frame.
然后我需要将列表转换为新的数据框。
(Another issue is that all data frames in the list have similarly named variables (id ID, Var1, Var2, Var3... for each df in the list). Here I can't reproduce this issue.) (另一个问题是列表中的所有数据帧都有类似命名的变量(列表中每个df的id ID,Var1,Var2,Var3 ...)。这里我不能重现这个问题。)
Thank you. 谢谢。
My code is: 我的代码是:
genSeq <- c('https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt')
# Read raw data as character vector
a <- readLines(genSeq)
# Some diagnostics
# is.vector(a)
# typeof(a)
# length(a)
# Convert vector a to data frame b
b <- as.data.frame(a, stringsAsFactors = FALSE)
# is.data.frame(b)
# typeof(a)
# length(a)
# Install some packages
install.packages("stringr")
install.packages("stringi")
install.packages("xlsx")
# Load the packages
library(stringr)
library(stringi)
library(xlsx)
# Read the lines with nucleotide sequences
bb <- b[c(9:19762),]
# Some diagnostics
# head(bb)
# tail(bb)
# length(bb)
# typeof(bb)
# is.vector(bb)
# Split lines
d <- strsplit(bb, split = "")
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Count number of variables ( http://stackoverflow.com/a/15201478/1009306 )
max.length <- max(sapply(d, length))
# Add NA values to list elements when the lists are shorter than others
d <- lapply(d, function(x) {c(x, rep(NA, max.length-length(x)))})
# Combine all elements
do.call(rbind, d)
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Transform matrix
dd <- t(matrix(unlist(d),ncol=length(d)))
# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)
# Transform existing dd matrix into ddd data frame
ddd <- as.data.frame(dd)
# Some diagnostics
# head(ddd)
# tail(ddd)
# is.data.frame(ddd)
# typeof(ddd)
# length(ddd)
# class(ddd)
# str(ddd)
# names(ddd)
# nrow(ddd)
# ncol(ddd)
# summary(ddd)
# Add new variable allel by concatenating values in existing variables V1...v19
ddd <- transform(ddd, allel = paste0(ddd$V1, ddd$V2, ddd$V3, ddd$V4, ddd$V5, ddd$V6, ddd$V7, ddd$V8, ddd$V9, ddd$V10, ddd$V11, ddd$V12, ddd$V13, ddd$V14, ddd$V15, ddd$V16, ddd$V17, ddd$V18, ddd$V19, sep = " "))
# Some diagnostics
# names(ddd)
# Reorder variable allel to be the first
new_ordered <- ddd[c(length(ddd),c(1:(length(ddd)-1)))]
# Some diagnostics
# names(new_ordered)
# ncol(new_ordered)
# Remove unnecessary variables V1...V19
new_ordered$V1 <- NULL
new_ordered$V2 <- NULL
new_ordered$V3 <- NULL
new_ordered$V4 <- NULL
new_ordered$V5 <- NULL
new_ordered$V6 <- NULL
new_ordered$V7 <- NULL
new_ordered$V8 <- NULL
new_ordered$V9 <- NULL
new_ordered$V10 <- NULL
new_ordered$V11 <- NULL
new_ordered$V12 <- NULL
new_ordered$V13 <- NULL
new_ordered$V14 <- NULL
new_ordered$V15 <- NULL
new_ordered$V16 <- NULL
new_ordered$V17 <- NULL
new_ordered$V18 <- NULL
new_ordered$V19 <- NULL
# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)
# Remove rows containing NA ( http://stackoverflow.com/q/8005154/1009306 )
new_ordered <- subset(new_ordered, !(V50 == "NA" & V100 == "NA"))
# Some diagnostics
# head(new_ordered)
# ncol(new_ordered)
# nrow(new_ordered)
# Shrink whitespaces in allel names with the help of library(stringr)'s function:
new_ordered$allel <- gsub(" ", "", new_ordered$allel)
# The list of unique allels accordingly to LL*NN:NN(NL) template
#####
# Sort new_ordered data frame in an ascending order by allel variable
new_odrd_srtd <- new_ordered[order(new_ordered$allel),]
# Some diagnostics
# head(new_odrd_srtd)
# typeof(new_odrd_srtd)
# is.data.frame(new_odrd_srtd)
# The list of unique allel names
unique.allels <- unique(new_odrd_srtd$allel)
# Let the list to be a character vector
unique.allels <- as.character(unique.allels)
# Show them:
# unique.allels
# Their number is:
# length(unique.allels)
# Export them into MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)
# Extract the part of an allel name considering specific HLA protein only: LL*NN:NN(NL).
# The final point for the pattern of interest is cleared at http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
specific.HLA.protein <- unique(gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", unique.allels))
# Show them:
# specific.HLA.protein
# Their number is:
# length(specific.HLA.protein)
# Export them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)
##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for http://stackoverflow.com/q/42711357
#####
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)), c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# Some diagnostics
diagnostic <- new_df
head(diagnostic)
tail(diagnostic)
is.data.frame(diagnostic)
typeof(diagnostic)
length(diagnostic)
class(diagnostic)
str(diagnostic)
names(diagnostic)
nrow(diagnostic)
ncol(diagnostic)
summary(diagnostic)
##################################################################################
# End of Example
# Select strings only for A*01:01:01:01 allel
new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", allel) )
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)
unique(new_odrd_srtd_sbst$allel)
# Add new vaiable allelGroup_specific.HLA.protein by copying values in existing variable allel
new_odrd_srtd_sbst <- transform(new_odrd_srtd_sbst, allelGroup_specific.HLA.protein = paste0(new_odrd_srtd_sbst$allel))
# Reorder variables
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[c(length(new_odrd_srtd_sbst), c(1:(length(new_odrd_srtd_sbst)-1)))]
# Extract the part of an allel name considering specific HLA protein only: A*NN:NN(NL).
# The final point for the pattern of interest is cleared here: http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein <- gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein)
# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)
# Split dataframe into a list of data frames based on a value in allel variable
# http://stackoverflow.com/q/18527051
ndf <- split(new_odrd_srtd_sbst_added_ordrd, new_odrd_srtd_sbst_added_ordrd$allel)
ndf[[1]][1:36,1:25]
# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)
# From this step I fail to step further...
Here is one possibility, 这是一种可能性,
l2 <- lapply(from, function(i) as.vector(c(as.character(i[1,1]), t(c(t(i[-1]))))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)),
c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 a 1 2 3 4 6 . 8 9
#2 b 11 22 33 44 66 <NA> <NA> <NA>
You could of course avoid the concatenation with i[1,1]
which is not in your requirements but rather an addition of mine I thought It could apply here. 你当然可以避免与
i[1,1]
的连接,这不是你的要求,而是我的想法它可以在这里应用。 So, by avoiding this and keeping your original transpose function, you get 所以,通过避免这种情况并保持原有的转置功能,你得到了
l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)),
c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
#1 a 1 2 3 a 4 6 a . 8 9
#2 b 11 22 33 b 44 66 <NA> <NA> <NA> <NA>
Try this in three steps. 试试这三个步骤。
First create your data frame without the IDs, 首先创建没有ID的数据框,
l3 <- lapply(from, function(i) t(c(t(i[-1]))))
l3 <- lapply(l3, `length<-`, max(lengths(l3)))
new_df1 <- setNames(data.frame(do.call(rbind, l3)),
paste0('Var', seq(max(lengths(l3)))))
new_df1
# Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 1 2 3 4 6 . 8 9
#2 11 22 33 44 66 <NA> <NA> <NA>
Extract all the unique IDs, 提取所有唯一ID,
i1 <- sapply(from, function(i) unique(as.character(i[[1]])))
i1
#[1] "a" "b"
Bind them together, 将它们捆绑在一起,
final_df1 <- cbind(IDs = i1, new_df1)
final_df1
# IDs Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 a 1 2 3 4 6 . 8 9
#2 b 11 22 33 44 66 <NA> <NA> <NA>
Following your example: 按照你的例子:
library(data.table)
# Create a list of data frames
from <- list(dfA, dfB)
from
[[1]]
IDA Var1 Var2 Var3
1 a 1 2 3
2 a 4 6
3 a . 8 9
[[2]]
IDB Var4 Var5 Var6
1 b 11 22 33
2 b 44
# rbind all the elements in the list of data.tables
out <- lapply(from, function(x){as.data.table(t(c(t(x))))} )
out <- rbindlist(out, fill = TRUE)
out
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
1: a 1 2 3 a 4 6 a . 8 9
2: b 11 22 33 b 44 66 NA NA NA NA
# If the files are stored on your drive, you can call them by bulk, and then `rbindlist`:
files <- list.files(pattern = ".csv")
files <- lapply(files, fread)
I feel like you can just use lapply
to iterate over all the data.frame
in the list
to do what you are already doing on each individual data.frame
. 我觉得你可以使用
lapply
迭代list
中的所有data.frame
来完成你在每个data.frame
上已经做过的事情。 Just make sure you subset each vector in such a way that the number of columns in the output is equal to the number of elements in the data.frame
with the maximum number of elements. 只需确保以这样的方式对每个向量进行子集化,即输出中的列数等于
data.frame
具有最大元素数的元素数。 This maximum number ( max_length
in this example) can be obtained by unlisting each data.frame
, obtaining the number of elements using lengths
, and then using max
to get the number of maximum elements. 通过取消列出每个
data.frame
,获取使用lengths
的元素数量,然后使用max
获取最大元素数,可以获得此最大数量(在此示例中为max_length
)。
max_length = max(lengths(lapply(from, unlist)))
do.call(rbind, lapply(from, function(df)
t(c(t(df)))[1:max_length]))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a" "1" "2" "3" "a" "4" " " "6" "a" "." "8" "9"
#[2,] "b" "11" "22" "33" "b" "44" " " "66" NA NA NA NA
UPDATE UPDATE
do.call(rbind, lapply(from, function(df)
c(as.character(df[1,1]), t(c(t(df[,-1]))))[1:max_length]))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a" "1" "2" "3" "4" " " "6" "." "8" "9" NA NA
#[2,] "b" "11" "22" "33" "44" " " "66" NA NA NA NA NA
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.