![](/img/trans.png)
[英]Error in grouping variable trying to create a function to get frequency counts using dplyr in R
[英]Using R - frequency counts with variable binwidths and factors
我有一個很大的數據集(超過一百萬行),這里有一個小樣本:
structure(list(Feret = c(0.017, 0.016, 2.12, 0.016, 0.02, 0.023,
0.017, 0.021, 0.02, 0.016, 0.027, 0.052, 0.061, 0.033, 0.041,
0.017, 6.561, 7.123, 0.027, 0.018, 0.024, 4.099, 0.022, 0.025,
0.037, 0.037, 0.018, 0.039, 0.027, 0.053, 0.016, 0.107, 0.52,
0.041, 0.038, 0.039, 0.03, 0.071, 0.022, 0.118, 0.032, 0.018,
0.027, 0.035, 8.113, 0.078, 4.089, 0.035, 0.057, 6.905, 2.5,
0.282, 0.045, 0.039, 0.071, 0.037, 0.029, 0.027, 0.016, 0.02,
0.026, 0.025, 0.026, 0.016, 0.016, 0.021), sample.type = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("flower", "leaf"), class = "factor"), leaf.side = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("lower", "upper"), class = "factor"), canopy = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("bottom", "top"), class = "factor"), treatment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("blue", "green", "grey", "white", "yel-green"
), class = "factor")), .Names = c("Feret", "sample.type", "leaf.side",
"canopy", "treatment"), row.names = c(500000L, 500001L, 500002L,
500003L, 500004L, 500005L, 500006L, 500007L, 500008L, 500009L,
500010L, 800000L, 800001L, 800002L, 800003L, 800004L, 800005L,
800006L, 800007L, 800008L, 800009L, 800010L, 1000L, 1001L, 1002L,
1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 1010L, 10000L,
10001L, 10002L, 10003L, 10004L, 10005L, 10006L, 10007L, 10008L,
10009L, 10010L, 100000L, 100001L, 100002L, 100003L, 100004L,
100005L, 100006L, 100007L, 100008L, 100009L, 100010L, 1160000L,
1160001L, 1160002L, 1160003L, 1160004L, 1160005L, 1160006L, 1160007L,
1160008L, 1160009L, 1160010L), class = "data.frame")
我一直在嘗試使用以下不等寬的區間(bin)邊界創建'Feret'變量的頻率計數:
# Break points delimiting six variable-width bins for Feret (0.01 up to 10).
bins <- c(0.01, 0.03, 0.1, 0.3, 1, 3, 10)
然后使用:
# Count the Feret values falling into each bin. plot = FALSE returns the
# "histogram" object without drawing a figure, since only the counts are needed.
freq <- hist(df_temp$Feret, breaks = bins, plot = FALSE)
# Label each bin as "lower - upper" by pairing consecutive break points.
ranges <- paste(head(bins, -1), bins[-1], sep = " - ")
freq$counts
# One row per bin: its range label and its frequency count.
df5 <- data.frame(ranges = ranges, frequency = freq$counts)
df5
但是我真正需要做的是按各種因子(“sample.type”,“leaf.side”,“canopy”,“treatment”)將data.frame拆分,並提取每個子集的頻率計數。 我可以用笨辦法——手動創建每個子集——來做到這一點,但是我想找到一個更好的方法。 我嘗試使用循環創建子集,然后將hist()函數應用於每個子集,但是這花費了很長時間。 有沒有使用dplyr或apply的更好方法? 我只希望將結果存儲在表格中,然后可以根據需要繪制它們。
以下代碼片段應執行您想要的操作:
我將您的樣本加載到df
。
library("dplyr")
# For every combination of the four factors, bin Feret with cut() and count
# the values per bin with table(); do() combines the per-group tables.
# (do() is superseded in recent dplyr releases; reframe() is the newer form.)
df %>%
  group_by(sample.type, leaf.side, canopy, treatment) %>%
  dplyr::select(Feret) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  do(data.frame(table(cut(.$Feret, breaks = bins, include.lowest = TRUE))))
我請您參考dplyr文檔 。 簡而言之, x %>% f
是f(x)
, x %>% f(a)
是f(x,a)
。
請注意, dplyr::select
只是select
,但是我遇到了很多次命名空間問題,以至於現在我總是指定包。
table(cut(df$Feret, breaks=bins))
只是實現hist
功能的更好方法。 使用cut
,您可以創建一個因子變量(如果您的值可能恰好等於最小的斷點,請記住添加include.lowest = TRUE),使用table
,您可以計算每個水平的頻率。
這給出:
sample.type leaf.side canopy treatment Var1 Freq
1 flower upper top green (0.01,0.03] 0
2 flower upper top green (0.03,0.1] 6
3 flower upper top green (0.1,0.3] 1
4 flower upper top green (0.3,1] 0
5 flower upper top green (1,3] 1
6 flower upper top green (3,10] 3
7 flower upper top white (0.01,0.03] 4
8 flower upper top white (0.03,0.1] 4
9 flower upper top white (0.1,0.3] 0
10 flower upper top white (0.3,1] 0
11 flower upper top white (1,3] 0
12 flower upper top white (3,10] 3
13 leaf lower bottom white (0.01,0.03] 5
14 leaf lower bottom white (0.03,0.1] 4
15 leaf lower bottom white (0.1,0.3] 1
16 leaf lower bottom white (0.3,1] 1
17 leaf lower bottom white (1,3] 0
18 leaf lower bottom white (3,10] 0
19 leaf lower top grey (0.01,0.03] 10
20 leaf lower top grey (0.03,0.1] 1
21 leaf lower top grey (0.1,0.3] 0
22 leaf lower top grey (0.3,1] 0
23 leaf lower top grey (1,3] 0
24 leaf lower top grey (3,10] 0
25 leaf upper bottom white (0.01,0.03] 4
26 leaf upper bottom white (0.03,0.1] 6
27 leaf upper bottom white (0.1,0.3] 1
28 leaf upper bottom white (0.3,1] 0
29 leaf upper bottom white (1,3] 0
30 leaf upper bottom white (3,10] 0
31 leaf upper top blue (0.01,0.03] 10
32 leaf upper top blue (0.03,0.1] 0
33 leaf upper top blue (0.1,0.3] 0
34 leaf upper top blue (0.3,1] 0
35 leaf upper top blue (1,3] 1
36 leaf upper top blue (3,10] 0
(實際上,由於這是tbl,因此不會像這樣打印,但是您可以使用print.data.frame以舊方式打印tbl。)
從這里可以輕松提取所需的信息。
首先定義一個具有因子名稱的字符向量:
# Names of the grouping columns, kept as a character vector.
factors <- c("sample.type", "leaf.side", "canopy", "treatment")
然后使用此向量將hist()
函數應用於每個因子(假定數據存儲在名為df
的數據幀對象中):
# Build a two-level list: one element per factor name, each holding one
# "histogram" object (computed, not drawn) per level of that factor.
# Naming the input vector by itself makes lapply() carry the factor names over.
res <- lapply(stats::setNames(factors, factors), function(factor_name) {
  # Split the Feret column (plus the factor column) by the factor's levels.
  subsets <- split(df[, c("Feret", factor_name)], df[[factor_name]])
  lapply(subsets, function(group) {
    hist(group$Feret, breaks = bins, plot = FALSE)
  })
})
現在,您有了一個列表,其中每個因子都有一個元素,而每個元素又是一個列表,其中該因子的每個水平都有一個元素:
> names(res)
[1] "sample.type" "leaf.side" "canopy" "treatment"
> names(res$sample.type)
[1] "flower" "leaf"
> res$sample.type$flower
$breaks
[1] 0.01 0.03 0.10 0.30 1.00 3.00 10.00
$counts
[1] 4 10 1 0 1 6
$density
[1] 9.09090909 6.49350649 0.22727273 0.00000000 0.02272727 0.03896104
$mids
[1] 0.020 0.065 0.200 0.650 2.000 6.500
$xname
[1] "group$Feret"
$equidist
[1] FALSE
attr(,"class")
[1] "histogram"
>
您可以將其格式化為適合繪圖的格式。
如果我們對沒有出現的區間(bin)不感興趣,我們只需要:
# Assign each Feret value to its bin, then count rows per factor combination
# and bin; combinations with no observations simply produce no row.
df %>%
  mutate(groups = cut(Feret, bins)) %>%
  group_by(sample.type, leaf.side, canopy, treatment, groups) %>%
  summarise(freq = n())
輸出:
sample.type leaf.side canopy treatment groups freq
1 flower upper top green (0.03,0.1] 6
2 flower upper top green (0.1,0.3] 1
3 flower upper top green (1,3] 1
4 flower upper top green (3,10] 3
5 flower upper top white (0.01,0.03] 4
6 flower upper top white (0.03,0.1] 4
7 flower upper top white (3,10] 3
8 leaf lower bottom white (0.01,0.03] 5
9 leaf lower bottom white (0.03,0.1] 4
10 leaf lower bottom white (0.1,0.3] 1
11 leaf lower bottom white (0.3,1] 1
12 leaf lower top grey (0.01,0.03] 10
13 leaf lower top grey (0.03,0.1] 1
14 leaf upper bottom white (0.01,0.03] 4
15 leaf upper bottom white (0.03,0.1] 6
16 leaf upper bottom white (0.1,0.3] 1
17 leaf upper top blue (0.01,0.03] 10
18 leaf upper top blue (1,3] 1
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.