[英]Reading dat file with two delimiters using R
我正在嘗試讀取如下數據示例:
1344428:-1,1,-1,415,-649,0.00; -1,2,-1,1090,-2167,0.00; -1,3,-1,-881,-3164,0.00; -1,4 ,-1,-624,1529,0.00; -1,5,-1,-849,-2875,0.00; -1,6,-1,856,-2341,0.00; -1,7,-1,758,-2408 ,0.00; -1,8,-1,-201,-2307,0.00; -1,9,-1,-963,-2807,0.00; -1,10,-1,-460,-3309,0.00 ; -1,11,-1,-1645,-1773,0.00; -1,12,-1,1487,-518,0.00; -1,13,-1,685,-3113,0.00; -1,14, -1,-935,-3217,0.00; -1,15,-1,-1101,-2430,0.00; -1,16,-1,754,-2946,0.00; -1,17,-1,823,-2497 ,0.00; -1,18,-1,-948,-2431,0.00; -1,19,-1,774,-2242,0.00; -1,20,-1,861,-2192,0.00; -1,21, -1,433,-3391,0.00; -1,22,-1,133,-2190,0.00; -1,23,-1,-977,-2585,0.00; -1,24,-1,-968,-2107 ,0.00; -1,25,-1,175,-3062,0.00; -1,26,-1,265,-2736,0.00; -1,27,-1,67,-2735,0.00; -1,28,- 1,-281,-2752,0.00; 4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway ;: 1344429:-1,1,-1,415, -649,0.00; -1,2,-1,1090,-2167,0.00; -1,3,-1,-885,-3169,0.00; -1,4,-1,-626,1527,0.00 ; -1,5,-1,-852,-2887,0.00; -1,6,-1,854,-2340,0.00; -1,7,-1,761,-2411,0.00; -1,8,-1 ,-201,-2307,0.00; -1,9,-1,-967,-2808,0.00; -1,10,-1,-460,-3309,0.00; -1,11,-1,- 16 47,-1777,0.00; -1,12,-1,1485,-518,0.00; -1,13,-1,687,-3118,0.00; -1,14,-1,-938,-3222,0.00 ; -1,15,-1,-1100,-2430,0.00; -1,16,-1,744,-2946,0.00; -1,17,-1,815,-2505,0.00; -1,18,-1 ,-950,-2429,0.00; -1,19,-1,773,-2237,0.00; -1,20,-1,861,-2190,0.00; -1,21,-1,433,-3392,0.00; -1 ,22,-1,133,-2189,0.00; -1,23,-1,-980,-2593,0.00; -1,24,-1,-961,-2109,0.00; -1,25,-1,176 ,-3056,0.00; -1,26,-1,265,-2731,0.00; -1,27,-1,67,-2736,0.00; -1,28,-1,-283,-2746,0.00; 4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway ;:
數據分為3個塊(以冒號分隔):時間戳;一組以分號分隔、每條含6個逗號分隔字段的記錄;以及一條混合數字/字符的記錄(例如 174,-2563,11,28.67,A,Dead,SetAway)。
是否有一種優雅的方法將此數據讀入R數據幀? 我試過了
# Attempt from the question: parse the raw file using ";" as the only
# field separator and no header row.
read.table(file = "855360.dat", sep = ";", header = FALSE)
但是這樣還需要大量的后續操作,才能把各元素整理成3個塊。有沒有辦法讓我可以把它們連接起來並進行處理?
如果單個數據幀結果正常,則只需用逗號替換冒號和分號,然后將其讀取:
# Read the raw lines, collapse every run of ":" / ";" into a single comma,
# then parse the result as plain comma-separated text (strings kept as-is).
L <- readLines("myfile")
read.table(
  text = gsub(pattern = "[:;]+", replacement = ",", x = L),
  sep = ",",
  as.is = TRUE
)
或者,如果您想生成一個嵌套列表結構,則使用上面的 L:
# Nested list: each line is split at ":" into chunks, each chunk at ";" into
# records, and each record at "," into fields.
lapply(strsplit(L, ":"), function(chunks) {
  lapply(strsplit(chunks, ";"), function(records) strsplit(records, ","))
})
如果將其轉換為多遍處理,則可能會花費更長的時間,但從長遠來看,可能更易於修改和維護。
如果先將字符串用 ";:" 分隔,您會注意到奇數索引是主要數據(包括時間戳),偶數索引是您的“第三塊”,其中包含混合的數字/字符項。 分解后,您可能會意識到我們仍然有一個解析問題,但這要簡單一些。
# Sample input: two "timestamp:records;:third-chunk;:" groups in one string.
txt <- "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1,6,-1,856,-2341,0.00;-1,7,-1,758,-2408,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-963,-2807,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1645,-1773,0.00;-1,12,-1,1487,-518,0.00;-1,13,-1,685,-3113,0.00;-1,14,-1,-935,-3217,0.00;-1,15,-1,-1101,-2430,0.00;-1,16,-1,754,-2946,0.00;-1,17,-1,823,-2497,0.00;-1,18,-1,-948,-2431,0.00;-1,19,-1,774,-2242,0.00;-1,20,-1,861,-2192,0.00;-1,21,-1,433,-3391,0.00;-1,22,-1,133,-2190,0.00;-1,23,-1,-977,-2585,0.00;-1,24,-1,-968,-2107,0.00;-1,25,-1,175,-3062,0.00;-1,26,-1,265,-2736,0.00;-1,27,-1,67,-2735,0.00;-1,28,-1,-281,-2752,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;: 1344429:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-885,-3169,0.00;-1,4,-1,-626,1527,0.00;-1,5,-1,-852,-2887,0.00;-1,6,-1,854,-2340,0.00;-1,7,-1,761,-2411,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-967,-2808,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1647,-1777,0.00;-1,12,-1,1485,-518,0.00;-1,13,-1,687,-3118,0.00;-1,14,-1,-938,-3222,0.00;-1,15,-1,-1100,-2430,0.00;-1,16,-1,744,-2946,0.00;-1,17,-1,815,-2505,0.00;-1,18,-1,-950,-2429,0.00;-1,19,-1,773,-2237,0.00;-1,20,-1,861,-2190,0.00;-1,21,-1,433,-3392,0.00;-1,22,-1,133,-2189,0.00;-1,23,-1,-980,-2593,0.00;-1,24,-1,-961,-2109,0.00;-1,25,-1,176,-3056,0.00;-1,26,-1,265,-2731,0.00;-1,27,-1,67,-2736,0.00;-1,28,-1,-283,-2746,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;:"
# Split at every ";:" terminator; `txt` is a single string, so only the first
# element of the strsplit() result is needed.
x <- strsplit(txt, ";:")[[1]]
# trimws() is already vectorised, so no sapply() wrapper is needed. Unlike
# sapply(), it also keeps `x` a character vector when there are no chunks.
x <- trimws(x)
x[1]
# [1] "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1,6,-1,856,-2341,0.00;-1,7,-1,758,-2408,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-963,-2807,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1645,-1773,0.00;-1,12,-1,1487,-518,0.00;-1,13,-1,685,-3113,0.00;-1,14,-1,-935,-3217,0.00;-1,15,-1,-1101,-2430,0.00;-1,16,-1,754,-2946,0.00;-1,17,-1,823,-2497,0.00;-1,18,-1,-948,-2431,0.00;-1,19,-1,774,-2242,0.00;-1,20,-1,861,-2192,0.00;-1,21,-1,433,-3391,0.00;-1,22,-1,133,-2190,0.00;-1,23,-1,-977,-2585,0.00;-1,24,-1,-968,-2107,0.00;-1,25,-1,175,-3062,0.00;-1,26,-1,265,-2736,0.00;-1,27,-1,67,-2735,0.00;-1,28,-1,-281,-2752,0.00;4,29,-1,5550,4400,0.00"
x[2]
# [1] "174,-2563,11,28.67,A,Dead,SetAway"
這里的一個重要假設是,我們將始終有一對時間戳/數據和后續塊:
# Chunks must alternate "timestamp:records" / "third chunk", so an odd count
# means the pairing assumption does not hold for this input.
if (length(x) %% 2 != 0) stop("oops, uneven pairs")
# Positions 1, 3, 5, ... hold the timestamp/records chunks. Building the index
# with which() gives an empty index for empty input, where
# seq(1, length(x), by = 2) would fail with "wrong sign in 'by' argument".
odds <- which(seq_along(x) %% 2L == 1L)
str(x[odds])
# chr [1:2] "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1"| __truncated__ ...
x[-odds]
# [1] "174,-2563,11,28.67,A,Dead,SetAway" "174,-2563,11,28.67,A,Dead,SetAway"
從這里開始,我們可以輕松地用另一個 strsplit 提取時間戳,然后把其余部分中的 ";" 替換為換行符,使其成為 read.csv 可以直接讀取的格式(第三塊的處理方式相同):
# `firstsplit` / `secondsplit` are derived from the chunks computed earlier:
# each odd-indexed chunk is split at ":" into c(timestamp, records), and the
# even-indexed chunks are the standalone "third chunk" rows.
firstsplit <- strsplit(x[odds], ":")
secondsplit <- x[-odds]
# One-row data.frame per timestamp so it can be column-bound later on.
timestamps <- lapply(firstsplit, function(z) {
  data.frame(timestamp = as.numeric(z[1]))
})
# Turning ";" into newlines makes every record one CSV row. read.csv(text = )
# is used instead of textConnection() so that no connection is left open.
data1 <- lapply(firstsplit, function(lst) {
  read.csv(text = gsub(";", "\n", lst[[2]]), header = FALSE)
})
data2 <- lapply(secondsplit, function(z) read.csv(text = z, header = FALSE))
看一下其中一對數據:
# Zip the three parallel lists: element i holds the timestamp, the records and
# the third chunk that belong to pair i.
bothlst <- Map(list, timestamps, data1, data2)
str(bothlst[[1]])
# List of 3
# $ :'data.frame': 1 obs. of 1 variable:
# ..$ timestamp: num 1344428
# $ :'data.frame': 29 obs. of 6 variables:
# ..$ V1: int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V2: int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
# ..$ V3: int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V4: int [1:29] 415 1090 -881 -624 -849 856 758 -201 -963 -460 ...
# ..$ V5: int [1:29] -649 -2167 -3164 1529 -2875 -2341 -2408 -2307 -2807 -3309 ...
# ..$ V6: num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
# $ :'data.frame': 1 obs. of 7 variables:
# ..$ V1: int 174
# ..$ V2: int -2563
# ..$ V3: int 11
# ..$ V4: num 28.7
# ..$ V5: Factor w/ 1 level "A": 1
# ..$ V6: Factor w/ 1 level "Dead": 1
# ..$ V7: Factor w/ 1 level "SetAway": 1
這是一個很好的嵌套列表數據描述。 我故意將 timestamp 做成 data.frame 以簡化以后的步驟,盡管它當然不必是 data.frame。
如果要用包含所有數據的單個 data.frame 來表示,請記住兩件事:
1. timestamp 和“第三塊數據”將在數據的所有行中重復。 根據您打算如何使用數據,這可能不是問題。 如果“第三塊”只有單行數據的假設不成立,則此方法會失效。
2. 兩個 data.frame 的列名會沖突(都是 V1、V2、…),因此把第三塊的列從 V1 式命名改為 X1 式命名。
考慮到第2點:
# Relabel the third-chunk columns X1, X2, ... by position, then zip the three
# parallel lists again using the renamed frames.
data2mod <- lapply(data2, function(df) {
  names(df) <- paste0("X", seq_along(df))
  df
})
bothlst2 <- Map(list, timestamps, data1, data2mod)
現在,對於每個元素,我們可以將其中的各部分“按列綁定”成單個 data.frame:
# Column-bind the three data.frames of each pair into one. do.call() splices
# the list so each data.frame is passed as its own argument; the one-row
# timestamp and third-chunk frames are recycled to the number of record rows.
bothdf <- lapply(bothlst2, function(parts) do.call(cbind.data.frame, parts))
str(bothdf)
# List of 2
# $ :'data.frame': 29 obs. of 14 variables:
# ..$ timestamp: num [1:29] 1344428 1344428 1344428 1344428 1344428 ...
# ..$ V1 : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V2 : int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
# ..$ V3 : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V4 : int [1:29] 415 1090 -881 -624 -849 856 758 -201 -963 -460 ...
# ..$ V5 : int [1:29] -649 -2167 -3164 1529 -2875 -2341 -2408 -2307 -2807 -3309 ...
# ..$ V6 : num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
# ..$ X1 : int [1:29] 174 174 174 174 174 174 174 174 174 174 ...
# ..$ X2 : int [1:29] -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 ...
# ..$ X3 : int [1:29] 11 11 11 11 11 11 11 11 11 11 ...
# ..$ X4 : num [1:29] 28.7 28.7 28.7 28.7 28.7 ...
# ..$ X5 : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
# ..$ X6 : Factor w/ 1 level "Dead": 1 1 1 1 1 1 1 1 1 1 ...
# ..$ X7 : Factor w/ 1 level "SetAway": 1 1 1 1 1 1 1 1 1 1 ...
# $ :'data.frame': 29 obs. of 14 variables:
# ..$ timestamp: num [1:29] 1344429 1344429 1344429 1344429 1344429 ...
# ..$ V1 : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V2 : int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
# ..$ V3 : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ V4 : int [1:29] 415 1090 -885 -626 -852 854 761 -201 -967 -460 ...
# ..$ V5 : int [1:29] -649 -2167 -3169 1527 -2887 -2340 -2411 -2307 -2808 -3309 ...
# ..$ V6 : num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
# ..$ X1 : int [1:29] 174 174 174 174 174 174 174 174 174 174 ...
# ..$ X2 : int [1:29] -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 ...
# ..$ X3 : int [1:29] 11 11 11 11 11 11 11 11 11 11 ...
# ..$ X4 : num [1:29] 28.7 28.7 28.7 28.7 28.7 ...
# ..$ X5 : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
# ..$ X6 : Factor w/ 1 level "Dead": 1 1 1 1 1 1 1 1 1 1 ...
# ..$ X7 : Factor w/ 1 level "SetAway": 1 1 1 1 1 1 1 1 1 1 ...
從這里開始,直接處理它們或以類似的方式將它們組合起來是很直接的:
# Stack the per-pair data.frames row-wise and preview the first six rows.
head(do.call(what = "rbind", args = bothdf), n = 6L)
# timestamp V1 V2 V3 V4 V5 V6 X1 X2 X3 X4 X5 X6 X7
# 1 1344428 -1 1 -1 415 -649 0 174 -2563 11 28.67 A Dead SetAway
# 2 1344428 -1 2 -1 1090 -2167 0 174 -2563 11 28.67 A Dead SetAway
# 3 1344428 -1 3 -1 -881 -3164 0 174 -2563 11 28.67 A Dead SetAway
# 4 1344428 -1 4 -1 -624 1529 0 174 -2563 11 28.67 A Dead SetAway
# 5 1344428 -1 5 -1 -849 -2875 0 174 -2563 11 28.67 A Dead SetAway
# 6 1344428 -1 6 -1 856 -2341 0 174 -2563 11 28.67 A Dead SetAway
基於上面的第一點,您會注意到 timestamp 列和所有 X* 列的值都是重復的,類似於表聯接的結果。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.