简体   繁体   English

r data.frame创建新变量

[英]r data.frame create new variable

I have a dataframe with around 1.5 million rows and 5 cols. 我有一个大约150万行和5列的数据帧。 One variable (VARIABLE) is of this type NATIONALITY_YEAR (eg SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. 一个变量(VARIABLE)属于这种类型NATIONALITY_YEAR(例如SPAIN_1998),我想将它分成两列,一列包含国籍,这是在下划线之前的名称的左侧,另一列包含年份,右侧下划线。 I have tried with concat.split which should be the easiest way: 我尝试过concat.split,这应该是最简单的方法:

# Split VARIABLE on "_" while keeping the original column (drop = FALSE).
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = FALSE)

but after running for 2 hours it did not produce any output. 但运行2小时后它没有产生任何输出。 I am not sure if I should leave it running for a longer period of time or if there is a less time-consuming way to do this. 我不确定是否应该让它运行更长时间,或者是否有更省时的方法来执行此操作。

Any help on the issue would be very much appreciated! 对此问题的任何帮助将非常感谢!

Here is a reproducible (subset!) sample: 这是一个可重现的(子集!)样本:

mydata<-  structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L, 
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L, 
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L, 
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L, 
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L, 
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L, 
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L, 
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L, 
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L, 
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L, 
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L, 
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L, 
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L, 
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L, 
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L, 
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L, 
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L, 
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L, 
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L, 
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L, 
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L, 
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L, 
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L, 
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L, 
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L, 
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L, 
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L, 
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L, 
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24", 
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"), 
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L, 
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L, 
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L, 
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L, 
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L, 
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L, 
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L, 
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L, 
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L, 
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L, 
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L, 
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L, 
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L, 
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L, 
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L, 
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L, 
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L, 
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L, 
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L, 
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L, 
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L, 
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L, 
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L, 
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L, 
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998", 
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998", 
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999", 
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999", 
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000", 
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000", 
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001", 
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001", 
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002", 
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003", 
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003", 
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004", 
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004", 
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005", 
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005", 
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006", 
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006", 
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007", 
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008", 
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008", 
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009", 
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009", 
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010", 
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010", 
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011", 
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011", 
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012", 
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998", 
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002", 
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006", 
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010", 
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999", 
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003", 
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007", 
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011", 
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001", 
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006", 
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011", 
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943, 
59, 201, 188, 10859, 93, 
1494, 60, 1001, 1000, 689, 675, 934, 51, 
1240, 165, 13, 0, 14, 2, 2, 
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0, 
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5, 
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0, 
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839, 
4303, 100, 1727, 663, 694, 1210, 623, 
1261, 772, 697, 490, 1031, 490, 956, 704, 
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528, 
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3, 
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0, 
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK", 
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L, 
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L, 
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L, 
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L, 
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L, 
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L, 
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L, 
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L, 
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L, 
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L, 
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L, 
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L, 
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L, 
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L, 
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L, 
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L, 
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L, 
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L, 
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L, 
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L, 
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L, 
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L, 
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L, 
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L, 
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L, 
99001L, 99501L), class = "data.frame")

Try this instead: 试试这个:

library(data.table)

# Work on a data.table built from the sample data so that the two new
# columns can be added by reference.
dt <- data.table(mydata)

# The first capture group is greedy, so the split happens at the last
# underscore: group 1 becomes NATIONALITY, group 2 becomes YEAR.
dt[, c("NATIONALITY", "YEAR") := list(
  sub("(.*)_(.*)", "\\1", VARIABLE),
  sub("(.*)_(.*)", "\\2", VARIABLE)
)]

It seems like I need to look into updating my concat.split functions! 好像我需要考虑更新我的concat.split函数!

The version of the function that you tried to use makes use of read.table, which does tend to struggle with large datasets. 您尝试使用的函数版本使用了read.table,它在处理大型数据集时确实容易力不从心。 I had used read.table because it has a convenient text argument that lets you specify a column in a data.frame as the input. 我使用了read.table因为它有一个方便的text参数,允许您指定data.frame中的列作为输入。 This is really convenient when working with small-ish datasets, but evidently not with larger ones :) 处理小型数据集时这非常方便,但显然不适合较大的数据集:)

As far as I can tell, fread from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying an approach similar to the one I used in concat.split, with fread instead of read.table. 据我所知,来自“data.table”包的fread没有类似的功能,但由于R写入文件通常很快,我认为值得尝试一种与我在concat.split中所用类似的方法,只是用fread代替read.table。

Here's the concept: 这是概念:

  1. Write the variable that needs to be split to a new file. 将需要拆分的变量写入一个新文件。
  2. Use the blazing fast fread to read it back in. 使用速度极快的fread将其读回。
  3. Wait for fread to get a text argument somewhere down the line? 等待fread在某处获得text参数?

Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments): 这是作为一个函数的概念(根据@ eddi在评论中的建议进行编辑更新):

#' Split a delimited column into separate columns via a temp file and fread
#'
#' The column is written to a temporary file, read back with `fread()` using
#' `sep` as the field separator, and the resulting columns (named
#' `<splitcol>_1`, `<splitcol>_2`, ...) are bound to the data.
#'
#' @param dataset A data.frame or data.table. It is never modified; the
#'   function works on its own copy.
#' @param splitcol Name, or numeric position, of the column to split.
#' @param sep Single string used as the delimiter inside `splitcol`.
#' @param drop If `TRUE`, the original `splitcol` is removed from the result.
#' @return A data.table holding the columns of `dataset` (with `splitcol`
#'   converted to character, or removed when `drop = TRUE`) followed by the
#'   split columns.
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
  if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
  stopifnot(
    is.character(splitcol), length(splitcol) == 1L,
    splitcol %in% names(dataset),
    is.character(sep), length(sep) == 1L
  )

  # `:=` updates by reference, so always operate on a private copy; without
  # this a data.table supplied by the caller would be altered in place.
  dataset <- if (is.data.table(dataset)) copy(dataset) else data.table(dataset)

  if (sep == ".") {
    # Swap "." for "|" before reading the values back.
    # NOTE(review): presumably needed because fread cannot use "." as both
    # field separator and decimal mark -- confirm against the fread docs.
    dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
    sep <- "|"
  }
  if (!is.character(dataset[[splitcol]])) {
    # writeLines() needs character input (e.g. factors must be converted).
    dataset[, (splitcol) := as.character(get(splitcol))]
  }

  x <- tempfile()
  # Remove the intermediate file on exit, including when fread() fails.
  on.exit(unlink(x), add = TRUE)
  writeLines(dataset[[splitcol]], x)
  Split <- fread(x, sep = sep, header = FALSE)
  setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))

  if (isTRUE(drop)) dataset[, (splitcol) := NULL]
  cbind(dataset, Split)
}

Here's the function in action: 这是行动中的功能:

## Replicate the sample rows until the test table has 1.5 million rows
out <- mydata[rep(rownames(mydata), times = 1500000 / nrow(mydata)), ]

## Split VARIABLE on "_"; the printed result is shown below
csDataTable(out, "VARIABLE", "_")
#          PROVINCE  AGE5 ZONA91OK    VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
#       1:        1 10-14      101  SPAIN_1998       614      SPAIN       1998
#       2:        4 30-34     4079  SPAIN_1998      1943      SPAIN       1998
#       3:        7 50-54      712  SPAIN_1998        59      SPAIN       1998
#       4:        8 40-44     8205  SPAIN_1998       201      SPAIN       1998
#       5:       11 35-39    11022  SPAIN_1998       188      SPAIN       1998
#      ---                                                                    
# 1499996:       44 35-39     4401    ROE_1999         0        ROE       1999
# 1499997:       46 35-39     4621    ROE_1999         0        ROE       1999
# 1499998:       49 10-14   490499    ROE_1999         0        ROE       1999
# 1499999:        3 30-34     3059 MAGREB_1999         5     MAGREB       1999
# 1500000:        6 40-44     6153 MAGREB_1999         2     MAGREB       1999

In this test, at least, the solution fares much better than I expected: 在这个测试中,至少,解决方案比我预期的要好得多:

# Benchmark candidate: add NATIONALITY and YEAR to a data.table built from
# the global `out`, using one regex substitution per new column.
subFun <- function() {
  dt <- data.table(out)
  dt[, c("NATIONALITY", "YEAR") := list(
    sub("(.*)_(.*)", "\\1", VARIABLE),
    sub("(.*)_(.*)", "\\2", VARIABLE)
  )]
}
# Benchmark candidate: split VARIABLE in the global `out` on "_" with the
# fread-based csDataTable() helper.
freadFun <- function() csDataTable(out, "VARIABLE", "_")

library(microbenchmark)
# Time both approaches, evaluating each expression 20 times.
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
#        expr      min       lq   median       uq      max neval
#    subFun() 3.814174 4.244820 4.273834 4.345358 4.480520    20
#  freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886    20

Here is a solution that splits the factor labels instead of every row: 这是一种只拆分因子标签(而不是逐行拆分)的解决方案:

# Split each distinct level of VARIABLE once (instead of once per row) and
# build a lookup table with one row per level: columns VARIABLE, V1, V2.
VARIABLE_LEVELS <- cbind(
  "VARIABLE" = levels(mydata$VARIABLE),
  as.data.frame(
    do.call("rbind", strsplit(levels(mydata$VARIABLE), split = "_"))
  )
)

# Option 1: join the lookup table onto the data by the shared VARIABLE
# column. merge() sorts its result by the key, so the row order changes.
# Left commented out: running it together with option 2 would add V1 and V2
# twice.
# mydata <- merge(mydata, VARIABLE_LEVELS)

# Option 2 (used here): instead of merging, use the integer codes of the
# VARIABLE factor as a row index into the lookup table. The lookup rows are
# in level order, so code i selects the parts of level i.
mydata <- cbind(
  mydata,
  VARIABLE_LEVELS[as.integer(mydata$VARIABLE), c("V1", "V2")]
)

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM