简体   繁体   中英

Simplify and summarize data table in R

One of my data sets is as follows:

name  alias (list of alias)
x     c("R","V","Q")
y     "Z"
q     c("A", "R", "M")
w     c("C","A","R")

I would like to first simplify the table as follows

name alias 
x  "R"
x  "V"
x  "Q"
y  "Z"
q  "A"
q  "R"
q  "M"
w  "C"
w  "A"
w  "R"

and later modify the data to get

alias name 
"R"   c(x,q,w)
"V"   x
"Q"   x
"Z"   y
"A"  c(q,w)
"M"  q
"C"  w

How can I achieve this in R?

Here is the actual data set

> dput(head(cases))
structure(list(caseid = c(7703415, 7758128, 7858259, 8802954, 
8829620, 8847200), tcount = c(2L, 2L, 3L, 10L, 4L, 2L), helplinks = c("character(0", 
"c(\"60107\", \"56085\", \"57587\", \"3000020\"", "character(0", 
"character(0", "c(\"60107\", \"3000023\", \"3000020\", \"60107\", \"56085\", \"57587\"", 
"character(0")), .Names = c("caseid", "tcount", "helplinks"), row.names = c(NA, 
6L), class = "data.frame")

> head(cases)
   caseid tcount                                                  helplinks
1 7703415      2                                                character(0
2 7758128      2                     c("60107", "56085", "57587", "3000020"
3 7858259      3                                                character(0
4 8802954     10                                                character(0
5 8829620      4 c("60107", "3000023", "3000020", "60107", "56085", "57587"
6 8847200      2                                                character(0

New answer

Use cSplit from my "splitstackshape" package:

# Three chained steps:
#   1. split "helplinks" on commas into long form (one row per piece);
#   2. strip the leftover "character(0", "c(" and double-quote characters;
#   3. group by the cleaned helplink and collect the matching case ids in a list.
cSplit(cases, "helplinks", ",", "long")[, helplinks := gsub(
  'character\\(0|c\\(|\\"', "", helplinks)][, list(
    caseid = list(caseid)), by = helplinks]
#    helplinks                          caseid
# 1:           7703415,7858259,8802954,8847200
# 2:     60107         7758128,8829620,8829620
# 3:     56085                 7758128,8829620
# 4:     57587                 7758128,8829620
# 5:   3000020                 7758128,8829620
# 6:   3000023                         8829620

Old answer

I'm assuming you're starting with something like this:

# Sample input: one row per name, with that name's aliases stored in a
# list column (wrapped in I() so the list is kept as a single column).
df <- data.frame(name = c("x", "y", "q", "w"))
df$alias <- I(list(
  c("R", "V", "Q"),
  "Z",
  c("A", "R", "M"),
  c("C", "A", "R")
))
df
#   name   alias
# 1    x R, V, Q
# 2    y       Z
# 3    q A, R, M
# 4    w C, A, R

If that's the case, here's one approach using listCol_l from my "splitstackshape" package in conjunction with "data.table".

library(splitstackshape)
# Flatten the "alias" list column into long form (the flattened column comes
# back named alias_ul), then gather the names that share each alias.
listCol_l(df, "alias")[, list(name = list(name)), by = alias_ul]
#    alias_ul  name
# 1:        R x,q,w
# 2:        V     x
# 3:        Q     x
# 4:        Z     y
# 5:        A   q,w
# 6:        M     q
# 7:        C     w

You don't really need "splitstackshape" for that, so if you want to remove the self-promotion part of my answer and just use "data.table", you could do:

library(data.table)
# Step 1: one row per (name, alias) pair, by unlisting the alias list column.
# Step 2: regroup by alias and collect the names that share it.
as.data.table(df)[
  , .(alias = unlist(alias)), by = name
][
  , .(name = list(name)), by = alias
]

First we clean up the "character(0" entries. Then we read in those character values that were once lists but now need to be parsed with scan(). Then we apply a function that makes a data frame from every row:

# Retain only the rows whose helplinks text contains a deparsed c(...) vector.
good.case <- cases[grep("c\\(", cases$helplinks), ]
 # Handle the rows one at a time, keyed by row name, and build a small
 # caseid/alias data frame for each of them.
 lapply(split(good.case, row.names(good.case)), function(case_row) {
   # Remove the "c(" prefix plus commas and spaces, leaving only quoted ids.
   stripped <- gsub("c\\(|[, ]", "", case_row$helplinks)
   ids <- scan(text = stripped, what = "")
   # The single caseid is recycled to the length of ids.
   data.frame(caseid = case_row$caseid, alias = ids)
 })
#-------
#Read 4 items
#Read 6 items
$`2`
   caseid   alias
1 7758128   60107
2 7758128   56085
3 7758128   57587
4 7758128 3000020

$`5`
   caseid   alias
1 8829620   60107
2 8829620 3000023
3 8829620 3000020
4 8829620   60107
5 8829620   56085
6 8829620   57587

 # Same per-row expansion as before, this time stored in `expanded` and with
 # the caseid repeated explicitly to the number of ids read.
 expanded <- lapply(split(good.case, row.names(good.case)), function(case_row) {
    # Remove the "c(" prefix plus commas and spaces, leaving only quoted ids.
    stripped <- gsub("c\\(|[, ]", "", case_row$helplinks)
    ids <- scan(text = stripped, what = "")
    data.frame(caseid = rep(case_row$caseid, length(ids)), alias = ids)
  })
#Read 4 items
#Read 6 items

Now we bind the dataframes together:

 # Stack the per-case data frames into a single one; the resulting row names
 # combine each list element's name with the row index inside it.
 do.call(rbind, expanded)
#---------------
     caseid   alias
2.1 7758128   60107
2.2 7758128   56085
2.3 7758128   57587
2.4 7758128 3000020
5.1 8829620   60107
5.2 8829620 3000023
5.3 8829620 3000020
5.4 8829620   60107
5.5 8829620   56085
5.6 8829620   57587

But only half the way there I suppose. No point in pursuing further with Ananda's 5 caret answer sitting there.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM