[英]Apply a match function repeatedly to multiple dataframes in R
我需要总结三个数据帧之间的重叠并多次重复此操作。
更具体地说,我想计算昆虫觅食的地点、它们觅食的植物以及发现这些植物的地点之间的重叠。
到目前为止,我可以使用索引对物种列表中的一个昆虫物种执行此操作,但我需要对多个物种执行此操作(稍后将针对更多物种和植物科执行此操作,所以我不想要手动完成)。
示例数据框:
植物栖息地(位置):
# Presence (1) / absence (0) of each plant family in each habitat type.
plantloc <- data.frame(
  plantfamily = c("Aceraceae", "Boraginaceae", "Dipsacaceae", "Lamiaceae", "Umbelliferae"),
  hedge = c(1, 1, 1, 1, 1),
  margin = c(0, 1, 0, 1, 1),
  arablefields = c(0, 1, 0, 1, 1),
  grassfields = c(0, 1, 1, 1, 1),
  stringsAsFactors = FALSE
)
大黄蜂的食源植物(觅食的植物科):
# Forage plants: 1 if the bumblebee species forages on the plant family, else 0.
bbfpl <- data.frame(
  species = c(
    "Bombus_distinguendus", "Bombus_hortorum", "Bombus_humilis", "Bombus_jonellus",
    "Bombus_lapidarius", "Bombus_lucorum", "Bombus_muscorum", "Bombus_pascuorum",
    "Bombus_pratorum", "Bombus_ruderarius", "Bombus_ruderatus", "Bombus_soroeensis",
    "Bombus_sylvarum", "Bombus_terrestris"
  ),
  Aceraceae = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
  Boraginaceae = c(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1),
  Dipsacaceae = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1),
  Lamiaceae = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1),
  Umbelliferae = c(1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
  stringsAsFactors = FALSE
)
大黄蜂觅食栖息地:
# Foraging habitats: 1 if the bumblebee species forages in the habitat, else 0.
bbfhab <- data.frame(
  species = c(
    "Bombus_distinguendus", "Bombus_hortorum", "Bombus_humilis", "Bombus_jonellus",
    "Bombus_lapidarius", "Bombus_lucorum", "Bombus_muscorum", "Bombus_pascuorum",
    "Bombus_pratorum", "Bombus_ruderarius", "Bombus_ruderatus", "Bombus_soroeensis",
    "Bombus_sylvarum", "Bombus_terrestris"
  ),
  hedge = c(0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1),
  margin = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  arablefields = c(0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1),
  grassfields = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  stringsAsFactors = FALSE
)
以 Bombus_sylvarum 为例:
# Row 13 is Bombus_sylvarum in both bbfpl and bbfhab.
sp_row <- 13
# Rows of plantloc for the plant families this species forages on.
fam_rows <- match(colnames(bbfpl)[which(bbfpl[sp_row, ] == 1)], plantloc$plantfamily)
fam_rows
# Columns of plantloc for the habitats this species forages in.
hab_cols <- match(colnames(bbfhab)[which(bbfhab[sp_row, ] == 1)], colnames(plantloc))
hab_cols
# Overlap: number of selected (plant family, habitat) cells flagged 1.
sum(plantloc[fam_rows, hab_cols])
我可以为所有 14 个物种做每个单独的步骤,例如:
# For each row (species) of bbfpl, look up the plantloc rows of the plant
# families flagged 1. apply() converts bbfpl to a matrix before iterating.
apply(
  X = bbfpl,
  MARGIN = 1,
  FUN = function(x) {
    foraged <- names(bbfpl)[which(x == 1)]
    match(foraged, plantloc$plantfamily)
  }
)
但我不知道如何将这两个步骤放在一起。
问题:
如何对所有 14 种大黄蜂执行此操作并将结果放入新的 dataframe(第一列 = 大黄蜂种类,第二列 = 结果)?
apply?循环(loop)?还是 purrr?
我已经搜索了类似的问题,但没有设法解决这个问题。
谢谢你。
也许我误解了这个问题……我看不出有什么会妨碍您把第 3 步包装成一个函数,让它遍历 seq_len(nrow(bbfpl)),用 1:14 代替 13 来得到这些总和。不过,这里给出一种可能更高效的 data.table 方法:
# Count, per bumblebee species, the (plant family, habitat) combinations where
# the species forages on the family, forages in the habitat, and the family
# occurs in that habitat.
library(data.table)
# Long form of bbfpl: one row per (species, plantfamily), 0/1 flag in fam.value.
BBFPL <- melt(data.table(bbfpl), id.vars = "species", variable.name = "plantfamily", value.name = "fam.value")
# Long form of bbfhab: habitat name lands in `variable`, 0/1 flag in `value`.
BBFHAB <- melt(data.table(bbfhab), id.vars = "species")
# Pair every plant-family row with every habitat row of the same species;
# allow.cartesian=TRUE permits this many-to-many expansion.
BBF <- merge(BBFPL, BBFHAB, by="species", allow.cartesian=TRUE)
BBF[, idx := .I] # to preserve order
# Long form of plantloc, keeping only (plantfamily, habitat) pairs flagged 1.
PL <- melt(data.table(plantloc), id.vars = "plantfamily")[value==1]
# Key both tables on (plantfamily, variable) so BBF[PL] joins on those columns.
setkey(PL, plantfamily, variable)
setkey(BBF, plantfamily, variable)
# Join, then keep rows where all three flags are 1 (i.value is PL's value column).
out <- BBF[PL][fam.value*value*i.value==1]
setkey(out, idx) # reorder result
# Rows per species = overlap count; the trailing [] makes the result print.
out[, list(sum=.N), by="species"][]
#> species sum
#> 1: Bombus_distinguendus 3
#> 2: Bombus_hortorum 10
#> 3: Bombus_humilis 6
#> 4: Bombus_jonellus 5
#> 5: Bombus_lapidarius 10
#> 6: Bombus_lucorum 10
#> 7: Bombus_muscorum 3
#> 8: Bombus_pascuorum 10
#> 9: Bombus_pratorum 9
#> 10: Bombus_ruderarius 5
#> 11: Bombus_ruderatus 4
#> 12: Bombus_soroeensis 3
#> 13: Bombus_sylvarum 8
#> 14: Bombus_terrestris 10
我建议使用 tidyverse 中的 dplyr 和 tidyr 包,先把数据转换成便于连接的形式,再把它们连接在一起。
library(tidyverse) # Load tidyverse packages (including dplyr and tidyr)
# Create three matching tables:
## Long table of (plantfamily, habitat) pairs where the plant family occurs
pf_hab <- plantloc %>%
  # One row per plantfamily x habitat column: name in `habitat`, flag in `value`
  pivot_longer(cols = -plantfamily, names_to = "habitat") %>%
  # Keep the pairs flagged 1, discard the 0's
  filter(value == 1) %>%
  # `value` is all 1's now, so keep only the two key columns
  select(plantfamily, habitat)
## Long table of (species, plantfamily) pairs the bumblebee forages on
bb_pf <- bbfpl %>%
  # One row per species x plant-family column: name in `plantfamily`, flag in `value`
  pivot_longer(cols = -species, names_to = "plantfamily") %>%
  # Keep the pairs flagged 1, discard the 0's
  filter(value == 1) %>%
  # `value` is all 1's now, so keep only the two key columns
  select(species, plantfamily)
## Long table of (species, habitat) pairs the bumblebee forages in
bb_hab <- bbfhab %>%
  # One row per species x habitat column: name in `habitat`, flag in `value`
  pivot_longer(cols = -species, names_to = "habitat") %>%
  # Keep the pairs flagged 1, discard the 0's
  filter(value == 1) %>%
  # `value` is all 1's now, so keep only the two key columns
  select(species, habitat)
# Join them into a single table: one row per (species, plantfamily, habitat)
# where the species forages on the family, forages in the habitat, and the
# family occurs in that habitat.
bb_pf_hab <-
  bb_pf %>%
  # Each species has several plant families AND several habitats, so this join
  # is many-to-many on purpose. Declaring it avoids the many-to-many warning
  # that dplyr >= 1.1.0 raises; `by` makes the key explicit instead of relying
  # on whichever columns happen to share a name.
  inner_join(bb_hab, by = "species", relationship = "many-to-many") %>%
  # Keep only the combinations where the plant family is present in the habitat.
  inner_join(pf_hab, by = c("plantfamily", "habitat"))
# Optional: list every (plantfamily, habitat) match for one species,
# here Bombus sylvarum
filter(bb_pf_hab, species == "Bombus_sylvarum")
# Output:
# species plantfamily habitat
# <chr> <chr> <chr>
# 1 Bombus_sylvarum Dipsacaceae hedge
# 2 Bombus_sylvarum Dipsacaceae grassfields
# 3 Bombus_sylvarum Lamiaceae hedge
# 4 Bombus_sylvarum Lamiaceae margin
# 5 Bombus_sylvarum Lamiaceae grassfields
# 6 Bombus_sylvarum Umbelliferae hedge
# 7 Bombus_sylvarum Umbelliferae margin
# 8 Bombus_sylvarum Umbelliferae grassfields
# Number of matching (plantfamily, habitat) rows per species, in column `Count`
count(bb_pf_hab, species, name = "Count")
# Output:
# species Count
# <chr> <int>
# 1 Bombus_distinguendus 3
# 2 Bombus_hortorum 10
# 3 Bombus_humilis 6
# 4 Bombus_jonellus 5
# 5 Bombus_lapidarius 10
# 6 Bombus_lucorum 10
# 7 Bombus_muscorum 3
# 8 Bombus_pascuorum 10
# 9 Bombus_pratorum 9
# 10 Bombus_ruderarius 5
# 11 Bombus_ruderatus 4
# 12 Bombus_soroeensis 3
# 13 Bombus_sylvarum 8
# 14 Bombus_terrestris 10
这是相同方法的更简洁版本:
library(tidyverse)
#' Create a matching table from the non-zero cells of the numeric columns.
#'
#' @param data A data frame whose numeric columns are indicator flags; all
#'   other columns are kept as identifier columns.
#' @param names_to Name of the output column that receives the names of the
#'   former numeric columns.
#' @return A long data frame with the identifier columns plus `names_to`,
#'   one row per cell whose value was non-zero.
get_matching_table <- function(data, names_to) {
  data %>%
    # `names_to` must be passed by name: in current tidyr it comes after `...`
    # in pivot_longer()'s signature, so a positional value never reaches it and
    # the column would silently fall back to the default "name".
    pivot_longer(cols = where(is.numeric), names_to = names_to) %>%
    filter(value != 0) %>%
    select(-value)
}
# Join matching tables left to right, each time on the columns that share a
# name ("species" first, then "plantfamily" and "habitat").
bb_pf_hab <-
  list(
    get_matching_table(bbfpl, names_to = "plantfamily"),
    get_matching_table(bbfhab, names_to = "habitat"),
    get_matching_table(plantloc, names_to = "habitat")
  ) %>%
  # The species join is many-to-many on purpose (several families x several
  # habitats per species); declaring it avoids the warning dplyr >= 1.1.0
  # raises for undeclared many-to-many joins.
  reduce(function(x, y) inner_join(x, y, relationship = "many-to-many"))
# One row per species with the number of matching rows in `Count`
bb_pf_hab %>%
  group_by(species) %>%
  summarise(Count = n())
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.