简体   繁体   English

合并列表中的数据框

[英]Merge data frames within a list

I have a list which looks like, 我有一个看起来像的清单,

lapply(sample_list, head, 3)

$`2016-04-24 00:00:00.tcp`
   ports freq
8    443  296
12    80  170
5     23   92

$`2016-04-24 00:00:00.udp`
  ports freq
4   161  138
7    53   45
1   123   28

$`2016-04-24 01:00:00.tcp`
   ports freq
13   443  342
20    80  215
10    25   60

$`2016-04-24 01:00:00.udp`
   ports freq
4    161   85
8     53   42
12   902   27

I want to merge the data frames that come from the same protocol (i.e. the tcp ones together and the udp ones together). So the final result would be a new list with 2 data frames; 我想合并来自同一协议的数据帧(即tcp在一起,udp在一起),所以最终结果将是一个包含2个数据帧的新列表。 One for tcp and one for udp such that, 一个用于tcp ,另一个用于udp这样,

lapply(final_list, head, 3)

$tcp
  ports freq.00:00:00 freq.01:00:00
1   443           296           342
2    80           170           215
3    23            92            51

$udp
  ports freq.00:00:00 freq.01:00:00
1   161           138            85
2    53            45            42
3   123            28            19

DATA 数据

dput(sample_list)
structure(list(`2016-04-24 00:00:00.tcp` = structure(list(ports = c("443", 
"80", "23", "21", "22", "25", "445", "110", "389", "135", "465", 
"514", "91", "995", "84", "902"), freq = structure(c(296L, 170L, 
92L, 18L, 16L, 15L, 14L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Dim = 16L)), .Names = c("ports", 
"freq"), row.names = c(8L, 12L, 5L, 3L, 4L, 6L, 9L, 1L, 7L, 2L, 
10L, 11L, 15L, 16L, 13L, 14L), class = "data.frame"), `2016-04-24 00:00:00.udp` = structure(list(
    ports = c("161", "53", "123", "902", "137", "514", "138", 
    "623", "69", "88", "500"), freq = structure(c(138L, 45L, 
    28L, 26L, 24L, 24L, 6L, 6L, 5L, 4L, 1L), .Dim = 11L)), .Names = c("ports", 
"freq"), row.names = c(4L, 7L, 1L, 11L, 2L, 6L, 3L, 8L, 9L, 10L, 
5L), class = "data.frame"), `2016-04-24 01:00:00.tcp` = structure(list(
    ports = c("443", "80", "25", "23", "88", "21", "161", "22", 
    "445", "135", "389", "993", "548", "110", "143", "502", "514", 
    "81", "995", "102", "111", "311", "444", "789", "902", "91"
    ), freq = structure(c(342L, 215L, 60L, 51L, 42L, 32L, 31L, 
    18L, 18L, 6L, 5L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Dim = 26L)), .Names = c("ports", "freq"
), row.names = c(13L, 20L, 10L, 9L, 22L, 7L, 6L, 8L, 15L, 4L, 
12L, 25L, 18L, 2L, 5L, 16L, 17L, 21L, 26L, 1L, 3L, 11L, 14L, 
19L, 23L, 24L), class = "data.frame"), `2016-04-24 01:00:00.udp` = structure(list(
    ports = c("161", "53", "902", "514", "123", "137", "69", 
    "138", "389", "443", "88", "623"), freq = structure(c(85L, 
    42L, 27L, 24L, 19L, 15L, 15L, 4L, 2L, 2L, 2L, 1L), .Dim = 12L)), .Names = c("ports", 
"freq"), row.names = c(4L, 8L, 12L, 7L, 1L, 2L, 10L, 3L, 5L, 
6L, 11L, 9L), class = "data.frame")), .Names = c("2016-04-24 00:00:00.tcp", 
"2016-04-24 00:00:00.udp", "2016-04-24 01:00:00.tcp", "2016-04-24 01:00:00.udp"
))

Bonus question: What is the structure of freq ? 奖金问题: freq的结构是什么? I never saw int [1:16(1d)] before. 我以前从未见过int [1:16(1d)]

str(sample_list$`2016-04-24 00:00:00.tcp`)
'data.frame':   16 obs. of  2 variables:
 $ ports: chr  "443" "80" "23" "21" ...
 $ freq : int [1:16(1d)] 296 170 92 18 16 15 14 4 3 2 ...

The code I used to create the list (in this case called try1): 我用来创建列表的代码(在本例中为try1):

protocol_list <- lapply(per_hour1, function(i) split(i, i$protocol))
Analytic_Protocol_List <- lapply(protocol_list, function(i) lapply(i, dest.ports))
try1 <- lapply(unlist(Analytic_Protocol_List, recursive = FALSE), `[[`, 1)

Note that solutions from similar questions do not work for this case. 请注意, 类似问题的解决方案不适用于这种情况。 Maybe because of the structure? 也许是因为结构?

For the rbind-ing you can try the following: 对于rbind您可以尝试以下操作:

do.call(rbind, sample_list[grep("tcp", names(sample_list))])

and: 和:

do.call(rbind, sample_list[grep("udp", names(sample_list))])

and as refined by Marat below (dcast comes from the reshape2 package, so load it first): 并由以下Marat完善(dcast来自reshape2包,需先加载):

d <- do.call(rbind, sample_list)
d2 <- data.frame(d,do.call(rbind,strsplit(rownames((d)),'[.]')))
lapply(split(d2,d2$X2),dcast,ports~X1,value.var='freq')

Another alternative: 另一种选择:

library(dplyr)
library(tidyr)

data.table::melt(sample_list) %>%
  separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
  unite(f, variable, time) %>%
  spread(f, value) %>%
  split(.$protocol)

Which, using your data, gives: 使用您的数据可以得出:

$tcp
   ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
1    102      tcp                       NA                        1
2    110      tcp                        4                        2
3    111      tcp                       NA                        1
5    135      tcp                        2                        6
8    143      tcp                       NA                        2
9    161      tcp                       NA                       31
11    21      tcp                       18                       32
12    22      tcp                       16                       18
13    23      tcp                       92                       51
14    25      tcp                       15                       60
15   311      tcp                       NA                        1
16   389      tcp                        3                        5
18   443      tcp                      296                      342
20   444      tcp                       NA                        1
21   445      tcp                       14                       18
22   465      tcp                        2                       NA
24   502      tcp                       NA                        2
25   514      tcp                        2                        2
28   548      tcp                       NA                        3
31   789      tcp                       NA                        1
32    80      tcp                      170                      215
33    81      tcp                       NA                        2
34    84      tcp                        1                       NA
35    88      tcp                       NA                       42
37   902      tcp                        1                        1
39    91      tcp                        2                        1
40   993      tcp                       NA                        4
41   995      tcp                        2                        2

$udp
   ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
4    123      udp                       28                       19
6    137      udp                       24                       15
7    138      udp                        6                        4
10   161      udp                      138                       85
17   389      udp                       NA                        2
19   443      udp                       NA                        2
23   500      udp                        1                       NA
26   514      udp                       24                       24
27    53      udp                       45                       42
29   623      udp                        6                        1
30    69      udp                        5                       15
36    88      udp                        4                        2
38   902      udp                       26                       27

Update : 更新

If you want to sort by freq , you could do: 如果要按freq排序,可以执行以下操作:

data.table::melt(sample_list) %>%
  separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
  unite(f, variable, time) %>%
  spread(f, value) %>%
  arrange(protocol, desc(`freq_2016-04-24 00:00:00`)) %>%
  split(.$protocol) 

You can just merge by ID: create an ID for each row of the data frame. Let lapply(X) = x 您只需按ID合并即可:为数据帧的每一行创建一个ID,让lapply(X)= x

  x$1 <- cbind(ID=1:nrow(x$1)) 

same for x1,x2,x3....,xN 对于x1,x2,x3 ....,xN相同

  newx <- merge(x$1,x$2,...,x$N, by=ID)

Since ID merging is used, overlapping won't occur; just treat each list$(X) as a data frame itself 由于使用了ID合并,因此不会发生重叠,只需将每个list $(X)当作数据框本身

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM