[英]data.table - split multiple columns
I have a data.table G
as follows.我有一个 data.table G
如下。
library(data.table)

# Raw input: one character vector of "SDxx:WORD" entries per group.
d <- list(
  c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"),
  c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
  c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
  c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
  c("SD455:TOMATO", "SD86:RAY", "SD39:MATURE", "SD86:COSMIC"),
  c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
  c("SD32:TRUMPET", "SD1:FIELD", "SD40:GREEN", "SD40:PARK")
)

# Prefix before ":" for every entry; vapply() pins the result type to
# character instead of relying on sapply()'s input-dependent simplification.
d2 <- lapply(d, function(x) {
  vapply(strsplit(x, ":", fixed = TRUE), `[`, character(1), 1L)
})

# Collapse each group's unique entries (sdw) and unique prefixes (sd) into a
# single ", "-separated string, giving one row per group with plain character
# columns (no list columns that would need as.character() afterwards).
d <- data.frame(
  sdw = vapply(d, function(x) paste0(unique(x), collapse = ", "), character(1)),
  sd = vapply(d2, function(x) paste0(unique(x), collapse = ", "), character(1)),
  stringsAsFactors = FALSE
)

# Keying on "sd" sorts the rows by sd, so GNO is numbered after that sort.
G <- data.table(d, key = "sd")
G[, GNO := seq_len(.N)]
setcolorder(G, neworder = c("GNO", "sd", "sdw"))
G
GNO sd sdw
1: 1 SD1, SD44, SD384, SD32 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET
2: 2 SD12, SD315, SD387 SD12:PAINTING, SD315:MONEY31, SD387:SPRING
3: 3 SD23, SD1, SD567 SD23:SWITCH, SD1:LUSH, SD567:TREK
4: 4 SD32, SD1, SD40 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK
5: 5 SD345, SD340, SD387 SD345:HOLE, SD340:DUST, SD387:ROLL
6: 6 SD42, SD345, SD183 SD42:CRAYON, SD345:FOX, SD183:WIRE
7: 7 SD455, SD86, SD39 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC
I am trying to split each set of elements in column sd
and get a modified data.table according to column GNO
as follows.我正在尝试拆分列sd
每组元素,并根据列GNO
获取修改后的 data.table
,如下所示。
# One row per individual sd element, keeping GNO and sdw alongside it.
# The sd strings are joined with ", ", so split on that exact separator;
# splitting on "," alone leaves a leading space on every element after the
# first (e.g. " SD44"), which silently breaks later matching on sd.
G2 <- G[
  , list(sd = unlist(strsplit(sd, ", ", fixed = TRUE))),
  by = list(GNO, sdw)
]
G2
GNO sdw sd
1: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD1
2: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD44
3: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD384
4: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD32
5: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD12
6: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD315
7: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD387
8: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD23
9: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD1
10: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD567
11: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD32
12: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD1
13: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD40
14: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD345
15: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD340
16: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD387
17: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD42
18: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD345
19: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD183
20: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD455
21: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD86
22: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD39
I would also like to split the elements in sdw.我还想拆分 sdw 中的元素。 But when I try that, it all gets messed up because the number of elements in sd
and sdw
are not necessarily the same.但是当我尝试这样做时,由于sd
和sdw
中的元素数量不一定相同,这一切都变得一团糟。
# Attempt to split both columns in a single grouped call. Within one GNO group
# the two strsplit() results can have different lengths (the warning quoted
# below reports 3 vs 4 for group 4), so data.table recycles the shorter column
# and the sd/sdw pairs no longer line up.
# NOTE(review): the name `t` masks base::t() for the rest of the session.
t <- G[ , list(sd = unlist(strsplit(sd, "," )),
sdw = unlist(strsplit(sdw, "," ))) , by = list(GNO) ]
It gives the following warning.它发出以下警告
Warning message:
In `[.data.table`(G, , list(sd = unlist(strsplit(sd, ",")), sdw = unlist(strsplit(sdw, :
Column 1 of result for group 4 is length 3 but the longest column in this result is 4. Recycled leaving remainder of 1 items. This warning is once only for the first group with this issue.
The output I am looking for is as follows.我正在寻找的输出如下。
# Expected result, given as a dput()-style literal: a 22-row data.frame with
# numeric GNO and factor columns sd and sdw. Each row pairs one sd prefix with
# the sdw entries of that group that share the prefix (several entries stay
# joined in a single ", "-separated string).
out <- structure(list(GNO = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
5, 5, 5, 6, 6, 6, 7, 7, 7), sd = structure(c(1L, 14L, 9L, 6L,
2L, 5L, 10L, 4L, 1L, 16L, 6L, 1L, 12L, 8L, 7L, 10L, 13L, 8L,
3L, 15L, 17L, 11L), .Label = c("SD1", "SD12", "SD183", "SD23",
"SD315", "SD32", "SD340", "SD345", "SD384", "SD387", "SD39",
"SD40", "SD42", "SD44", "SD455", "SD567", "SD86"), class = "factor"),
sdw = structure(c(2L, 17L, 11L, 7L, 3L, 6L, 13L, 5L, 2L,
19L, 7L, 1L, 15L, 10L, 8L, 12L, 16L, 9L, 4L, 18L, 20L, 14L
), .Label = c("SD1:FIELD", "SD1:LUSH", "SD12:PAINTING", "SD183:WIRE",
"SD23:SWITCH", "SD315:MONEY31", "SD32:TRUMPET", "SD340:DUST",
"SD345:FOX", "SD345:HOLE", "SD384:FR563", "SD387:ROLL", "SD387:SPRING",
"SD39:MATURE", "SD40:GREEN, SD40:PARK", "SD42:CRAYON", "SD44:CANCEL",
"SD455:TOMATO", "SD567:TREK", "SD86:COSMIC, SD86:RAY"), class = "factor")), .Names = c("GNO",
"sd", "sdw"), row.names = c(NA, -22L), class = "data.frame")
# Print the expected table.
out
GNO sd sdw
1 1 SD1 SD1:LUSH
2 1 SD44 SD44:CANCEL
3 1 SD384 SD384:FR563
4 1 SD32 SD32:TRUMPET
5 2 SD12 SD12:PAINTING
6 2 SD315 SD315:MONEY31
7 2 SD387 SD387:SPRING
8 3 SD23 SD23:SWITCH
9 3 SD1 SD1:LUSH
10 3 SD567 SD567:TREK
11 4 SD32 SD32:TRUMPET
12 4 SD1 SD1:FIELD
13 4 SD40 SD40:GREEN, SD40:PARK
14 5 SD345 SD345:HOLE
15 5 SD340 SD340:DUST
16 5 SD387 SD387:ROLL
17 6 SD42 SD42:CRAYON
18 6 SD345 SD345:FOX
19 6 SD183 SD183:WIRE
20 7 SD455 SD455:TOMATO
21 7 SD86 SD86:COSMIC, SD86:RAY
22 7 SD39 SD39:MATURE
How to get this output?如何获得这个输出?
Here's how I would approach this using only the data.table
package (there is no need to create G2
either).这是我将如何仅使用data.table
包来解决这个问题(也不需要创建G2
)
Here we first split the sdw
column into its individual entries and then aggregate by both GNO
and the part before :
in the sdw
column.在这里,我们首先按sdw
拆分,然后按GNO
和sdw
列中 : 之前的部分进行聚合。
# Step 1: split every sdw string on ", " so each "SDxx:WORD" entry gets its
#         own row (column `entry`), grouped by GNO.
# Step 2: re-aggregate by GNO and the prefix before ":" (returned as `sd`), so
#         entries sharing a prefix within a group are joined back into one
#         ", "-separated string. The result column is named `sdw` to match the
#         requested output.
G[, .(entry = unlist(strsplit(sdw, ", ", fixed = TRUE))), by = GNO][
  , .(sdw = toString(entry)), by = .(GNO, sd = sub(":.*", "", entry))]
# GNO sd sdw
# 1: 1 SD1 SD1:LUSH
# 2: 1 SD44 SD44:CANCEL
# 3: 1 SD384 SD384:FR563
# 4: 1 SD32 SD32:TRUMPET
# 5: 2 SD12 SD12:PAINTING
# 6: 2 SD315 SD315:MONEY31
# 7: 2 SD387 SD387:SPRING
# 8: 3 SD23 SD23:SWITCH
# 9: 3 SD1 SD1:LUSH
# 10: 3 SD567 SD567:TREK
# 11: 4 SD32 SD32:TRUMPET
# 12: 4 SD1 SD1:FIELD
# 13: 4 SD40 SD40:GREEN, SD40:PARK
# 14: 5 SD345 SD345:HOLE
# 15: 5 SD340 SD340:DUST
# 16: 5 SD387 SD387:ROLL
# 17: 6 SD42 SD42:CRAYON
# 18: 6 SD345 SD345:FOX
# 19: 6 SD183 SD183:WIRE
# 20: 7 SD455 SD455:TOMATO
# 21: 7 SD86 SD86:RAY, SD86:COSMIC
# 22: 7 SD39 SD39:MATURE
This may be making too big of an assumption about the nature of your data, but it appears that the "sd" column is actually somewhat irrelevant since that information is already embedded in the "sdw" column.这可能对您的数据性质做出了过大的假设,但似乎“sd”列实际上有些无关紧要,因为该信息已嵌入“sdw”列中。
As such, this is what I came up with, using two cSplit
s from my "splitstackshape" package:因此,这就是我想出的,使用我的“splitstackshape”包中的两个cSplit
:
library(splitstackshape)

# Step 1: stack the comma-separated sdw entries into long form.
temp <- cSplit(G, "sdw", sep = ",", direction = "long")
# Step 2: split every entry at ":" into the columns sdw_1 and sdw_2.
temp <- cSplit(temp, "sdw", sep = ":")
# Step 3: keep columns 1, 3 and 4 only.
temp <- temp[, c(1, 3, 4)]
# Step 4: for each (GNO, sdw_1) pair, paste the unique prefix back onto its
# unique suffixes and collapse the pieces into one ", "-separated string.
temp <- temp[
  , list(paste(unique(sdw_1), unique(sdw_2), sep = ":", collapse = ", ")),
  by = list(GNO, sdw_1)
]
setnames(temp, c("GNO", "sd", "sdw"))
temp
# GNO sd sdw
# 1: 1 SD1 SD1:LUSH
# 2: 1 SD44 SD44:CANCEL
# 3: 1 SD384 SD384:FR563
# 4: 1 SD32 SD32:TRUMPET
# 5: 2 SD12 SD12:PAINTING
# 6: 2 SD315 SD315:MONEY31
# 7: 2 SD387 SD387:SPRING
# 8: 3 SD23 SD23:SWITCH
# 9: 3 SD1 SD1:LUSH
# 10: 3 SD567 SD567:TREK
# 11: 4 SD32 SD32:TRUMPET
# 12: 4 SD1 SD1:FIELD
# 13: 4 SD40 SD40:GREEN, SD40:PARK
# 14: 5 SD345 SD345:HOLE
# 15: 5 SD340 SD340:DUST
# 16: 5 SD387 SD387:ROLL
# 17: 6 SD42 SD42:CRAYON
# 18: 6 SD345 SD345:FOX
# 19: 6 SD183 SD183:WIRE
# 20: 7 SD455 SD455:TOMATO
# 21: 7 SD86 SD86:RAY, SD86:COSMIC
# 22: 7 SD39 SD39:MATURE
# GNO sd sdw
# Check the computed `temp` against the expected `out`. The result printed
# below reports TRUE after coercing GNO and sdw and dropping attributes.
# NOTE(review): allowAll = TRUE presumably enables every transformation the
# compare package supports; confirm against the package documentation.
library(compare)
compare(out, temp, allowAll = TRUE)
# TRUE
# [GNO] coerced from <integer> to <numeric>
# [sdw] coerced from <character> to <factor>
# dropped attributes
# [3] dropped attributes
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.