[英]R code to iteratively and randomly delete entire rows from a data frame based on a column value, and saving as a new data frame each time
[英]filtering entire data frame based on a unique value and create a new column in R
我花了好幾個小時嘗試解決這個問題,但始終得不到想要的結果。如何在 R 中針對某個特定值篩選整個數據框,然後新增一列,列出每一行中滿足該條件的列名?
我有這個數據框:
dput(head(df1,10))
structure(list(WMA = c("20", "19", "19", "19", "18", "19", "20",
"20", "20", "19"), Waterbody = c("02040201070010-01", "02040202060040-01",
"02040202060050-01", "02040202060060-01", "02040202150070-01",
"02040202030080-01", "02040201080010-01", "02040201080020-01",
"02040201080030-01", "02040202070010-01"), Name = c("Back Creek (above Yardville-H Sq Road)",
"Barton Run (above Kettle Run Road)", "Barton Run (below Kettle Run Road)",
"Bear Swamp River", "Birch Creek", "Bisphams Mill Creek (below McDonalds Br)",
"Blacks Creek (above 40d06m10s)", "Blacks Creek (Bacons Run to 40d06m10s)",
"Blacks Creek (below Bacons Run)", "Bobbys Run"), DO = c("Insufficient Data",
"Non-attaining", "Non-attaining", "Insufficient Data", "Attaining",
"Attaining", "Attaining", "Attaining", "Attaining", "Insufficient Data"
), `DO Trout` = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A",
"N/A", "N/A", "N/A"), pH = c("Insufficient Data", "Non-attaining",
"Non-attaining", "Insufficient Data", "Attaining", "Attaining",
"Attaining", "Attaining", "Attaining", "Insufficient Data"),
`Total Phosphorus` = c("Non-attaining", "Attaining", "Non-attaining",
"Insufficient Data", "Insufficient Data", "Non-attaining",
"Non-attaining", "Non-attaining", "Non-attaining", "Insufficient Data"
), Nitrate = c("Attaining", "Attaining", "Attaining", "Insufficient Data",
"Insufficient Data", "Attaining", "Attaining", "Attaining",
"Attaining", "Insufficient Data"), `Total Suspended Solids` = c("Attaining",
"Attaining", "Attaining", "Insufficient Data", "Insufficient Data",
"Insufficient Data", "Attaining", "Attaining", "Non-attaining",
"Insufficient Data"), `Total Dissolved Solids` = c("Insufficient Data",
"Attaining", "Attaining", "Insufficient Data", "Insufficient Data",
"Insufficient Data", "Attaining", "Attaining", "Attaining",
"Insufficient Data"), Turbidity = c("Insufficient Data",
"Attaining", "Attaining", "Insufficient Data", "Attaining",
"Insufficient Data", "Attaining", "Attaining", "Attaining",
"Insufficient Data"), `Unionized Ammonia` = c("Attaining",
"Attaining", "Attaining", "Insufficient Data", "Attaining",
"Insufficient Data", "Attaining", "Attaining", "Attaining",
"Insufficient Data"), `Unionized Ammonia Trout` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"
), E.coli = c("Insufficient Data", "Attaining", "Attaining",
"Attaining", "Attaining", "Attaining", "Attaining", "Non-attaining",
"Non-attaining", "Attaining"), Enterococcus = c("N/A", "N/A",
"N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"),
`Total Coliform` = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A",
"N/A", "N/A", "N/A", "N/A")), .Names = c("WMA", "Waterbody",
"Name", "DO", "DO Trout", "pH", "Total Phosphorus", "Nitrate",
"Total Suspended Solids", "Total Dissolved Solids", "Turbidity",
"Unionized Ammonia", "Unionized Ammonia Trout", "E.coli", "Enterococcus",
"Total Coliform"), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
我想對每一列進行篩選,找出哪些列的值為 "Non-attaining",然後據此新增一列,顯示滿足此條件的列名稱。
這是我想要的電子表格:
您可以應用如下自定義函數:
# Name the entries of one row that equal `target`, joined into a single string.
#
# Intended for row-wise use via apply(df, 1, myfun): apply() passes each row as
# a vector named after the data frame's columns, so the column names are read
# from names(x) rather than from a global `df`.
#
# x:      named vector holding one row's values.
# target: value(s) to look for (default "Non-attaining").
# sep:    separator placed between the matching names (default ", ").
#
# Returns a length-one character string; "" when nothing matches. `%in%` is
# used instead of `==` because it never yields NA, so missing cells are skipped
# instead of showing up as a literal "NA" in the result.
myfun <- function(x, target = "Non-attaining", sep = ", ") {
  paste(names(x)[x %in% target], collapse = sep)
}
# Row-wise application: MARGIN = 1 hands each row of `df` to myfun, and the
# resulting strings are stored in the new column `newcol`.
df$newcol <- apply(X = df, MARGIN = 1, FUN = myfun)
或者一舉做到這一切:
# One-liner variant: for every row, join the names of the columns whose value
# is exactly "Non-attaining". apply() names each row vector after the columns,
# so names(x) replaces a second lookup of `df`; `%in%` (unlike `==`) never
# yields NA, so a missing cell cannot inject a spurious "NA" into the result.
df$newcol <- apply(df, 1, function(x) paste(names(x)[x %in% "Non-attaining"], collapse = ", "))
如果您想要一個tidyverse解決方案,您可以:
library(dplyr)
library(tidyr)
# Reshape the status columns to long form, keep the cells that contain
# "Non-attaining" (case-insensitive), and collapse the matching column names
# into one comma-separated string per waterbody.
# pivot_longer() replaces the superseded gather(); within each waterbody the
# long rows stay in the original column order, so the pasted names do too.
impaired <- df1 %>%
  pivot_longer(
    cols = -c(WMA, Waterbody, Name),
    names_to = "ColName",
    values_to = "Value"
  ) %>%
  filter(grepl("Non-attaining", Value, ignore.case = TRUE, perl = TRUE)) %>%
  group_by(WMA, Waterbody, Name) %>%
  summarise(Imp = paste(ColName, collapse = ",")) %>%
  ungroup()

# Left-join back onto the identifier columns so that waterbodies without any
# "Non-attaining" value are kept, with Imp = NA.
df1 %>%
  select(WMA, Waterbody, Name) %>%
  left_join(impaired, by = c("WMA", "Waterbody", "Name"))
這給出了:
# A tibble: 10 x 4
WMA Waterbody Name Imp
<chr> <chr> <chr> <chr>
1 20 02040201070010-01 Back Creek (above Yardville-H Sq Road) Total Phosphorus
2 19 02040202060040-01 Barton Run (above Kettle Run Road) DO,pH
3 19 02040202060050-01 Barton Run (below Kettle Run Road) DO,pH,Total Phosphorus
4 19 02040202060060-01 Bear Swamp River NA
5 18 02040202150070-01 Birch Creek NA
6 19 02040202030080-01 Bisphams Mill Creek (below McDonalds Br) Total Phosphorus
7 20 02040201080010-01 Blacks Creek (above 40d06m10s) Total Phosphorus
8 20 02040201080020-01 Blacks Creek (Bacons Run to 40d06m10s) Total Phosphorus,E.coli
9 20 02040201080030-01 Blacks Creek (below Bacons Run) Total Phosphorus,Total Suspended Solids,E.coli
10 19 02040202070010-01 Bobbys Run NA
這是一個不需要重塑數據的 tidyverse 解決方案。我們改用 `pmap` 逐行遍歷數據框,然後把每一行得到的字符向量折疊成一個字符串。
library(tidyverse)
tbl <- structure(list(WMA = c("20", "19", "19", "19", "18", "19", "20", "20", "20", "19"), Waterbody = c("02040201070010-01", "02040202060040-01", "02040202060050-01", "02040202060060-01", "02040202150070-01", "02040202030080-01", "02040201080010-01", "02040201080020-01", "02040201080030-01", "02040202070010-01"), Name = c("Back Creek (above Yardville-H Sq Road)", "Barton Run (above Kettle Run Road)", "Barton Run (below Kettle Run Road)", "Bear Swamp River", "Birch Creek", "Bisphams Mill Creek (below McDonalds Br)", "Blacks Creek (above 40d06m10s)", "Blacks Creek (Bacons Run to 40d06m10s)", "Blacks Creek (below Bacons Run)", "Bobbys Run"), DO = c("Insufficient Data", "Non-attaining", "Non-attaining", "Insufficient Data", "Attaining", "Attaining", "Attaining", "Attaining", "Attaining", "Insufficient Data"), `DO Trout` = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"), pH = c("Insufficient Data", "Non-attaining", "Non-attaining", "Insufficient Data", "Attaining", "Attaining", "Attaining", "Attaining", "Attaining", "Insufficient Data"), `Total Phosphorus` = c("Non-attaining", "Attaining", "Non-attaining", "Insufficient Data", "Insufficient Data", "Non-attaining", "Non-attaining", "Non-attaining", "Non-attaining", "Insufficient Data"), Nitrate = c("Attaining", "Attaining", "Attaining", "Insufficient Data", "Insufficient Data", "Attaining", "Attaining", "Attaining", "Attaining", "Insufficient Data"), `Total Suspended Solids` = c("Attaining", "Attaining", "Attaining", "Insufficient Data", "Insufficient Data", "Insufficient Data", "Attaining", "Attaining", "Non-attaining", "Insufficient Data"), `Total Dissolved Solids` = c("Insufficient Data", "Attaining", "Attaining", "Insufficient Data", "Insufficient Data", "Insufficient Data", "Attaining", "Attaining", "Attaining", "Insufficient Data"), Turbidity = c("Insufficient Data", "Attaining", "Attaining", "Insufficient Data", "Attaining", "Insufficient Data", "Attaining", "Attaining", "Attaining", 
"Insufficient Data"), `Unionized Ammonia` = c("Attaining", "Attaining", "Attaining", "Insufficient Data", "Attaining", "Insufficient Data", "Attaining", "Attaining", "Attaining", "Insufficient Data"), `Unionized Ammonia Trout` = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"), E.coli = c("Insufficient Data", "Attaining", "Attaining", "Attaining", "Attaining", "Attaining", "Attaining", "Non-attaining", "Non-attaining", "Attaining"), Enterococcus = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"), `Total Coliform` = c("N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A")), .Names = c("WMA", "Waterbody", "Name", "DO", "DO Trout", "pH", "Total Phosphorus", "Nitrate", "Total Suspended Solids", "Total Dissolved Solids", "Turbidity", "Unionized Ammonia", "Unionized Ammonia Trout", "E.coli", "Enterococcus", "Total Coliform"), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
# Names of the arguments (one per column of the current row) whose value is
# "Non-attaining"; character(0) when the row has none.
non_attaining_cols <- function(...) {
  row_values <- c(...)
  names(list(...))[which(row_values == "Non-attaining")]
}

with_imp <- tbl %>%
  mutate(
    # List-column: one character vector of matching column names per row.
    Imp = pmap(., non_attaining_cols),
    # Collapse each vector to a comma-separated string; elements of length 0
    # are replaced by NA so the list can be flattened to a character column.
    Imp = Imp %>%
      map(~ str_c(.x, collapse = ",")) %>%
      map_if(~ length(.x) == 0, ~ NA_character_) %>%
      flatten_chr()
  )

# Show the three identifier columns (1-3) plus the new Imp column (17).
with_imp[, c(1:3, 17)]
#> # A tibble: 10 x 4
#> WMA Waterbody Name Imp
#> <chr> <chr> <chr> <chr>
#> 1 20 0204020107001~ Back Creek (above Yard~ Total Phosphorus
#> 2 19 0204020206004~ Barton Run (above Kett~ DO,pH
#> 3 19 0204020206005~ Barton Run (below Kett~ DO,pH,Total Phosphorus
#> 4 19 0204020206006~ Bear Swamp River <NA>
#> 5 18 0204020215007~ Birch Creek <NA>
#> 6 19 0204020203008~ Bisphams Mill Creek (b~ Total Phosphorus
#> 7 20 0204020108001~ Blacks Creek (above 40~ Total Phosphorus
#> 8 20 0204020108002~ Blacks Creek (Bacons R~ Total Phosphorus,E.coli
#> 9 20 0204020108003~ Blacks Creek (below Ba~ Total Phosphorus,Total Su~
#> 10 19 0204020207001~ Bobbys Run <NA>
由reprex包創建於2018-08-13(v0.2.0)。
你可以使用 `which(cond, arr.ind = TRUE)`。以下是 data.table 的做法:
library(data.table)
DT <- as.data.table(df1)

# (row, col) index pairs of every cell equal to "Non-attaining".
hits <- as.data.table(which(DT == "Non-attaining", arr.ind = TRUE))

# One entry per matching row: its column names joined with ", " by toString().
w <- hits[, .(cols = toString(names(DT)[col])), by = row]

# Assign by reference; rows absent from `w` keep NA in the new NAcols column.
DT[w$row, NAcols := w$cols]
結果如下:
> DT[, c(1, 17)]
WMA NAcols
1: 20 Total Phosphorus
2: 19 DO, pH
3: 19 DO, pH, Total Phosphorus
4: 19 <NA>
5: 18 <NA>
6: 19 Total Phosphorus
7: 20 Total Phosphorus
8: 20 Total Phosphorus, E.coli
9: 20 Total Phosphorus, Total Suspended Solids, E.coli
10: 19 <NA>
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.