在 R 中对多个数据框重复应用匹配函数

Question

我需要汇总三个数据框之间的重叠情况，并多次重复此操作。

更具体地说，我想计算昆虫觅食的地点、它们觅食的植物以及发现这些植物的地点之间的重叠。

到目前为止，我可以通过索引对物种列表中的某一个昆虫物种执行此操作，但我需要对多个物种执行此操作（之后还会针对更多的物种和植物科执行，所以我不想手动完成）。

示例数据框：

植物栖息地（位置）：

# Plant-family-by-habitat table: one row per plant family and one 0/1 column
# per habitat (hedge, margin, arablefields, grassfields).
plantloc <- data.frame(
  plantfamily = c(
    "Aceraceae", "Boraginaceae", "Dipsacaceae", "Lamiaceae", "Umbelliferae"
  ),
  hedge = rep(1, 5),
  margin = c(0, 1, 0, 1, 1),
  arablefields = c(0, 1, 0, 1, 1),
  grassfields = c(0, 1, 1, 1, 1),
  stringsAsFactors = FALSE  # keep plantfamily as character on R < 4.0 too
)

大黄蜂的觅食植物：

# Species-by-plant-family table: one row per bumblebee species and one 0/1
# column per plant family.
bbfpl <- data.frame(
  species = paste0(
    "Bombus_",
    c(
      "distinguendus", "hortorum", "humilis", "jonellus", "lapidarius",
      "lucorum", "muscorum", "pascuorum", "pratorum", "ruderarius",
      "ruderatus", "soroeensis", "sylvarum", "terrestris"
    )
  ),
  Aceraceae    = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
  Boraginaceae = c(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1),
  Dipsacaceae  = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1),
  Lamiaceae    = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1),
  Umbelliferae = c(1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
  stringsAsFactors = FALSE  # keep species as character on R < 4.0 too
)

大黄蜂觅食栖息地：

# Species-by-habitat table: one row per bumblebee species and one 0/1 column
# per foraging habitat.
bbfhab <- data.frame(
  species = paste0(
    "Bombus_",
    c(
      "distinguendus", "hortorum", "humilis", "jonellus", "lapidarius",
      "lucorum", "muscorum", "pascuorum", "pratorum", "ruderarius",
      "ruderatus", "soroeensis", "sylvarum", "terrestris"
    )
  ),
  hedge        = c(0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1),
  margin       = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  arablefields = c(0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1),
  grassfields  = rep(1, 14),
  stringsAsFactors = FALSE  # keep species as character on R < 4.0 too
)

以 Bombus_sylvarum 为例：

将 Bombus_sylvarum 使用的植物科（即 B_sylv 为 1 的bbfpl的列名）与plantloc的植物科列匹配，并返回这些植物科的行索引

    # Row indices in plantloc of the plant families flagged 1 for species row 13.
    # local() keeps the intermediate name out of the workspace.
    local({
      foraged_families <- names(bbfpl)[which(bbfpl[13, ] == 1)]
      match(foraged_families, plantloc$plantfamily)
    })

将 Bombus_sylvarum 使用的觅食栖息地（即 B_sylv 为 1 的bbfhab的列名）与plantloc的栖息地列匹配，并返回这些栖息地的列索引

    # Column indices in plantloc of the habitats flagged 1 for species row 13.
    # local() keeps the intermediate name out of the workspace.
    local({
      foraged_habitats <- names(bbfhab)[which(bbfhab[13, ] == 1)]
      match(foraged_habitats, names(plantloc))
    })

使用上面的两个匹配结果，在 plantloc 中索引 Bombus_sylvarum 所使用的栖息地（列）和植物科（行），并将这些索引位置上的所有值相加，即得到重叠的总数。在本例中，结果为 8。

    # Overlap total for species row 13: sum the plantloc cells at the
    # intersection of its plant-family rows and its habitat columns.
    local({
      family_rows <- match(names(bbfpl)[which(bbfpl[13, ] == 1)], plantloc$plantfamily)
      habitat_cols <- match(names(bbfhab)[which(bbfhab[13, ] == 1)], names(plantloc))
      sum(plantloc[family_rows, habitat_cols])
    })

我可以为所有 14 个物种做每个单独的步骤，例如：

# For every species (row of bbfpl), return the row indices in plantloc of the
# plant families flagged 1 for that species.
#
# Rows are visited by index rather than with apply(): apply() first coerces
# the mixed character/numeric data frame to a character matrix, so the
# `== 1` test would be a string comparison. lapply() also always returns a
# list with one element per species, regardless of how many families match.
lapply(seq_len(nrow(bbfpl)),
       function(i) {
         match(colnames(bbfpl)[which(bbfpl[i, ] == 1)], plantloc$plantfamily)
       })

但我不知道如何将这两个步骤放在一起。

问题：

如何对所有 14 种大黄蜂执行此操作并将结果放入新的 dataframe（第一列 = 大黄蜂种类，第二列 = 结果）？

用 apply？循环？还是 purrr？

我已经搜索了类似的问题，但没有设法解决这个问题。

谢谢你。

Answer 1

也许我误解了这个问题……我不清楚是什么妨碍了您把第 3 步包装成一个函数，并让该函数遍历 seq_len(nrow(bbfpl))（即把 13 换成 1:14）来得到这些总和。不过，下面是一种可能更高效的 data.table 方法：

library(data.table)
# Reshape the species-by-plant-family table to long form: one row per
# (species, plantfamily) pair, with the 0/1 flag stored in `fam.value`.
BBFPL <- melt(data.table(bbfpl), id.vars = "species", variable.name = "plantfamily", value.name = "fam.value")
# Reshape the species-by-habitat table to long form; the habitat name and its
# flag end up in the `variable` and `value` columns used below.
BBFHAB <- melt(data.table(bbfhab), id.vars = "species")
# Pair every plant-family row with every habitat row of the same species.
BBF <- merge(BBFPL, BBFHAB, by="species", allow.cartesian=TRUE)
BBF[, idx := .I] # to preserve order
# Long form of plantloc, keeping only (plantfamily, habitat) pairs flagged 1.
PL <- melt(data.table(plantloc), id.vars = "plantfamily")[value==1]
# Key both tables on (plantfamily, variable) so that BBF[PL] joins on them.
setkey(PL, plantfamily, variable)
setkey(BBF, plantfamily, variable)
# Join, then keep only rows where all three flags are 1: species uses the
# family (fam.value), species uses the habitat (value), and the family occurs
# in the habitat (i.value, the `value` column coming from PL).
out <- BBF[PL][fam.value*value*i.value==1]
setkey(out, idx) # reorder result
# Number of surviving (plantfamily, habitat) combinations per species.
out[, list(sum=.N), by="species"][]
#>                  species sum
#>  1: Bombus_distinguendus   3
#>  2:      Bombus_hortorum  10
#>  3:       Bombus_humilis   6
#>  4:      Bombus_jonellus   5
#>  5:    Bombus_lapidarius  10
#>  6:       Bombus_lucorum  10
#>  7:      Bombus_muscorum   3
#>  8:     Bombus_pascuorum  10
#>  9:      Bombus_pratorum   9
#> 10:    Bombus_ruderarius   5
#> 11:     Bombus_ruderatus   4
#> 12:    Bombus_soroeensis   3
#> 13:      Bombus_sylvarum   8
#> 14:    Bombus_terrestris  10

Answer 2

我建议使用 tidyverse 中的 dplyr 和 tidyr 包，将数据转换成便于连接的形式。

library(tidyverse)    # Load tidyverse packages (including dplyr and tidyr)

# Create three matching tables:

## Lookup table of (plantfamily, habitat) pairs flagged 1 in plantloc
pf_hab <- plantloc %>%
    # One row per (plantfamily, habitat); the flag lands in `value`
    pivot_longer(cols = -plantfamily, names_to = "habitat") %>%
    # Keep only the flagged pairs
    filter(value == 1) %>%
    # Keep the two key columns; `value` is all 1's by now
    select(plantfamily, habitat)

## Lookup table of (species, plantfamily) pairs flagged 1 in bbfpl
bb_pf <- bbfpl %>%
    # One row per (species, plantfamily); the flag lands in `value`
    pivot_longer(cols = -species, names_to = "plantfamily") %>%
    # Keep only the flagged pairs
    filter(value == 1) %>%
    # Keep the two key columns; `value` is all 1's by now
    select(species, plantfamily)

## Lookup table of (species, habitat) pairs flagged 1 in bbfhab
bb_hab <- bbfhab %>%
    # One row per (species, habitat); the flag lands in `value`
    pivot_longer(cols = -species, names_to = "habitat") %>%
    # Keep only the flagged pairs
    filter(value == 1) %>%
    # Keep the two key columns; `value` is all 1's by now
    select(species, habitat)

# Join the three lookup tables into one row per (species, plantfamily, habitat) match.
# The join keys are spelled out so the result does not depend on which column
# names happen to coincide, and no "Joining with `by = ...`" message is emitted.
bb_pf_hab <-
    bb_pf %>%
    # Pair every plant family with every habitat of the same species
    # (this join is intentionally many-to-many)
    inner_join(bb_hab, by = "species") %>%
    # Keep only the pairs where that plant family occurs in that habitat
    inner_join(pf_hab, by = c("plantfamily", "habitat"))

# The joined table can also be queried for a single species, e.g. Bombus sylvarum
filter(bb_pf_hab, species == "Bombus_sylvarum")

# Output:
#   species         plantfamily  habitat    
#   <chr>           <chr>        <chr>      
# 1 Bombus_sylvarum Dipsacaceae  hedge      
# 2 Bombus_sylvarum Dipsacaceae  grassfields
# 3 Bombus_sylvarum Lamiaceae    hedge      
# 4 Bombus_sylvarum Lamiaceae    margin     
# 5 Bombus_sylvarum Lamiaceae    grassfields
# 6 Bombus_sylvarum Umbelliferae hedge      
# 7 Bombus_sylvarum Umbelliferae margin     
# 8 Bombus_sylvarum Umbelliferae grassfields

# Number of (plantfamily, habitat) matches per species
summarize(
    group_by(bb_pf_hab, species),    # one group per species
    Count = n()                      # rows in each group
)

# Output:
#    species              Count
#    <chr>                <int>
#  1 Bombus_distinguendus     3
#  2 Bombus_hortorum         10
#  3 Bombus_humilis           6
#  4 Bombus_jonellus          5
#  5 Bombus_lapidarius       10
#  6 Bombus_lucorum          10
#  7 Bombus_muscorum          3
#  8 Bombus_pascuorum        10
#  9 Bombus_pratorum          9
# 10 Bombus_ruderarius        5
# 11 Bombus_ruderatus         4
# 12 Bombus_soroeensis        3
# 13 Bombus_sylvarum          8
# 14 Bombus_terrestris       10

这是相同方法的更简洁版本：

library(tidyverse)    

# Build a long-format matching table from a wide indicator table.
#
# Arguments:
#   data      Data frame with identifier column(s) plus numeric indicator
#             columns.
#   names_to  Name of the output column that receives the former numeric
#             column names.
#
# Returns `data` pivoted to one row per (identifier, former column name) pair
# whose value is non-zero, with the value column dropped.
get_matching_table <- function(data, names_to) {
    # names_to is passed by name: in current tidyr, `...` follows `cols` in
    # pivot_longer()'s signature, so a positional third argument is not
    # bound to names_to.
    data %>%
        pivot_longer(cols = where(is.numeric), names_to = names_to) %>%
        filter(value != 0) %>%
        select(-value)
}

# Fold inner_join() over the three matching tables; each join uses whatever
# column names the two tables have in common.
bb_pf_hab <- reduce(
    list(
        get_matching_table(bbfpl, names_to = 'plantfamily'),
        get_matching_table(bbfhab, names_to = 'habitat'),
        get_matching_table(plantloc, names_to = 'habitat')
    ),
    inner_join
)

# Number of matches per species
summarize(group_by(bb_pf_hab, species), Count = n())

将匹配 function 重复应用于 R 中的多个数据帧

问题描述

2 个解决方案

解决方案1
0 已采纳 2020-06-12 21:35:57

解决方案2
0 2020-06-12 22:00:29

将匹配 function 重复应用于 R 中的多个数据帧

问题描述

2 个解决方案

解决方案1 0 已采纳 2020-06-12 21:35:57

解决方案2 0 2020-06-12 22:00:29

解决方案1
0 已采纳 2020-06-12 21:35:57

解决方案2
0 2020-06-12 22:00:29