One of my data sets is as follows
name alias (list of alias)
x c("R","V","Q")
y "Z"
q c("A", "R", "M")
w c("C","A","R")
I would like to first simplify the table as follows
name alias
x "R"
x "V"
x "Q"
y "Z"
q "A"
q "R"
q "M"
w "C"
w "A"
w "R"
and later modify the data to get
alias name
"R" c(x,q,w)
"V" x
"Q" x
"Z" y
"A" c(q,w)
"M" q
"C" w
How can I achieve this in R?
Here is the actual data set
> dput(head(cases))
structure(list(caseid = c(7703415, 7758128, 7858259, 8802954,
8829620, 8847200), tcount = c(2L, 2L, 3L, 10L, 4L, 2L), helplinks = c("character(0",
"c(\"60107\", \"56085\", \"57587\", \"3000020\"", "character(0",
"character(0", "c(\"60107\", \"3000023\", \"3000020\", \"60107\", \"56085\", \"57587\"",
"character(0")), .Names = c("caseid", "tcount", "helplinks"), row.names = c(NA,
6L), class = "data.frame")
> head(cases)
caseid tcount helplinks
1 7703415 2 character(0
2 7758128 2 c("60107", "56085", "57587", "3000020"
3 7858259 3 character(0
4 8802954 10 character(0
5 8829620 4 c("60107", "3000023", "3000020", "60107", "56085", "57587"
6 8847200 2 character(0
Use cSplit from my "splitstackshape" package:
# 1. Split the comma-separated helplinks strings into long format.
# 2. Strip the leftover `character(0`, `c(` and double-quote characters.
# 3. Collect every caseid that shares a helplinks value into one list cell.
cSplit(cases, "helplinks", ",", "long")[
  , helplinks := gsub('character\\(0|c\\(|\\"', "", helplinks)
][
  , list(caseid = list(caseid)), by = helplinks
]
# helplinks caseid
# 1: 7703415,7858259,8802954,8847200
# 2: 60107 7758128,8829620,8829620
# 3: 56085 7758128,8829620
# 4: 57587 7758128,8829620
# 5: 3000020 7758128,8829620
# 6: 3000023 8829620
I'm assuming you're starting with something like this:
# Example input: one row per name, with its aliases held in a list column.
df <- data.frame(name = c("x", "y", "q", "w"))
# I() keeps the list as-is, so it is stored as a single list column.
df$alias <- I(list(
  c("R", "V", "Q"),
  "Z",
  c("A", "R", "M"),
  c("C", "A", "R")
))
df
# name alias
# 1 x R, V, Q
# 2 y Z
# 3 q A, R, M
# 4 w C, A, R
If that's the case, here's one approach using listCol_l
from my "splitstackshape" package in conjunction with "data.table".
library(splitstackshape)
# listCol_l() unlists the "alias" list column into long form (column
# `alias_ul`); the names are then regrouped into one list cell per alias.
listCol_l(df, "alias")[
  , list(name = list(name)), by = alias_ul
]
# alias_ul name
# 1: R x,q,w
# 2: V x
# 3: Q x
# 4: Z y
# 5: A q,w
# 6: M q
# 7: C w
You don't really need "splitstackshape" for that, so if you want to remove the self-promotion part of my answer and just use "data.table", you could do:
library(data.table)
# Step 1: unlist each row's alias vector, giving one (name, alias) row per alias.
# Step 2: regroup by alias, gathering every matching name into a list cell.
as.data.table(df)[
  , list(alias = unlist(alias)), by = name
][
  , list(name = list(name)), by = alias
]
First we clean up the "character(0"'s. Then we read in those character values that were once lists but now need to be scan-ned. Then we apply a function that makes a data frame from every line:
# Keep only the rows whose helplinks string contains a "c(" vector.
good.case <- cases[grepl("c\\(", cases$helplinks), ]
# One single-row data frame per case, keyed by its row name.
per_case <- split(good.case, row.names(good.case))
lapply(per_case, function(d) {
  # Drop "c(", commas and spaces so only the quoted tokens remain for scan().
  stripped <- gsub("c\\(|[, ]", "", d$helplinks)
  vec <- scan(text = stripped, what = "")
  # The single caseid is recycled against every alias read from the string.
  data.frame(caseid = d$caseid, alias = vec)
})
#-------
#Read 4 items
#Read 6 items
$`2`
caseid alias
1 7758128 60107
2 7758128 56085
3 7758128 57587
4 7758128 3000020
$`5`
caseid alias
1 8829620 60107
2 8829620 3000023
3 8829620 3000020
4 8829620 60107
5 8829620 56085
6 8829620 57587
# Build one (caseid, alias) data frame per case, keyed by the case's row name.
expanded <- lapply(split(good.case, row.names(good.case)), function(d) {
  # Drop "c(", commas and spaces so only the quoted tokens remain for scan().
  stripped <- gsub("c\\(|[, ]", "", d$helplinks)
  vec <- scan(text = stripped, what = "")
  # Repeat the caseid explicitly so both columns have the same length.
  data.frame(caseid = rep(d$caseid, length(vec)), alias = vec)
})
#Read 4 items
#Read 6 items
Now we bind the dataframes together:
# Stack the per-case data frames from `expanded` into a single data frame.
do.call(rbind, expanded)
#---------------
caseid alias
2.1 7758128 60107
2.2 7758128 56085
2.3 7758128 57587
2.4 7758128 3000020
5.1 8829620 60107
5.2 8829620 3000023
5.3 8829620 3000020
5.4 8829620 60107
5.5 8829620 56085
5.6 8829620 57587
But only half the way there I suppose. No point in pursuing further with Ananda's 5 caret answer sitting there.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.