
[英]using case_when inside dplyr's mutate to create a new column in dataframe based on conditions
[英]Create a new column based on vectors pre-determined using case_when
我有一个名为“ sim
”的大数据框,它有一个名为“ CAUSABAS
”的字符列,如下所示:
CAUSABAS OBITOGRAV OBITOPARTO OBITOPUERP ANO idade idade_dias
1 I110 <NA> <NA> <NA> 2013 95.58 34909
2 C349 <NA> <NA> <NA> 2013 80.70 29474
3 C490 <NA> <NA> <NA> 2013 97.90 35757
4 I219 <NA> <NA> <NA> 2013 87.60 31995
5 I259 <NA> <NA> <NA> 2013 62.57 22853
6 I678 <NA> <NA> <NA> 2013 51.99 18988
此 data.frame 是使用 package microdatasus
创建的,如下所示:
# Fetch the "SIM-DO" records for 2013-2018, one request per `uf` value.
sim_mg <- fetch_datasus(
  year_start = 2013,
  year_end = 2018,
  information_system = "SIM-DO",
  uf = "MG"
)
sim_es <- fetch_datasus(
  year_start = 2013,
  year_end = 2018,
  information_system = "SIM-DO",
  uf = "ES"
)

# Stack the two downloads (ES rows first, then MG), keep only the listed
# columns, and derive ANO from characters 5-8 of DTOBITO.
sim <- bind_rows(sim_es, sim_mg) %>%
  select(
    TIPOBITO, DTOBITO, DTNASC, CODMUNOCOR, CODMUNRES, TPMORTEOCO,
    TPOBITOCOR, LINHAA, LINHAB, LINHAC, LINHAD, LINHAII, CAUSABAS,
    OBITOGRAV, OBITOPARTO, OBITOPUERP
  ) %>%
  mutate(ANO = substr(DTOBITO, 5, 8))

# Overwrite CAUSABAS with the result of unfactor() so it is no longer a factor.
sim$CAUSABAS <- unfactor(sim$CAUSABAS)
我想创建以下列: grupo_causa_basica
,对这些CAUSABAS
进行分类。
有些观测值是 3 个字符的代码,如 B50,而另一些是 4 个字符的代码,如 B501。
例如,B50 组下还有另外 3 个 CAUSABAS 代码:B500、B508 和 B509。有时录入的就是 B50 本身,但并非总是如此。所有观测值都遵循这种编码结构。
我创建了一些向量来指定什么是什么:
# Code prefixes that define each cause-of-death group. Entries are either
# 3-character codes or 4-character codes.
doen_cardio <- c(
  "F01", "G45", "G46", "I10", "I11", "I13", "I15", "I21", "I22", "I23",
  "I24", "I25", "I26", "I27", "I28", "I37", "I49", "I50", "I51", "I52",
  "I63", "I64", "I67", "I68", "I69", "I71", "I72", "I73", "I80", "K55",
  "O10", "O16", "P29", "P60", "R931", "R943"
)

diabetes <- c("E10", "E11", "E12", "E13", "E14", "O24", "P70")

doencas_respiratorias <- c(
  "R91", "R942", "J06", "J16", "J20", "J21", "J22", "J30", "J31", "J34",
  "J40", "J41", "J42", "J45", "J46", "J60", "J61", "J62", "J63", "J64",
  "J66", "J67", "J68", "J69", "J70", "J80", "J81", "J82", "J84", "J96",
  "J98", "J99", "P22", "P26", "P27", "P28", "R04", "R06", "R09", "R84",
  "T17", "W77", "W83", "W84"
)

doen_renais <- c(
  "R934", "R944", "E27", "I12", "N07", "N17", "N18", "N19", "N25", "N27",
  "Q60"
)

intox_exogenas <- c(
  "T45", "T46", "T47", "T48", "T57", "T61", "T62", "T65", "T97", "X47",
  "X49", "Y13", "Y17", "Y19"
)

doen_infec_veic_hidrica <- c(
  "A01", "A02", "A03", "A04", "A05", "A06", "A07", "A08", "A09", "A27",
  "B15", "B58", "B65", "B77", "R10", "A00", "B68", "B69", "B76", "B80",
  "B82", "B89", "N220"
)

arbov <- c(
  "A92", "P354", "A90", "A91", "A95", "B50", "B51", "B52", "B53", "B54"
)

doen_pele <- c("R23", "L08", "L98", "L99")
之后我尝试使用mutate
plus case_when
:
# Label each row by exact membership of CAUSABAS in one of the group vectors.
# Branches are tried top to bottom; a row that matches none of them gets NA
# because no fallback branch is supplied.
sim <- sim %>%
  mutate(
    grupo_causa_basica = case_when(
      CAUSABAS %in% doen_cardio ~ "cardio",
      CAUSABAS %in% diabetes ~ "diabetes",
      CAUSABAS %in% doencas_respiratorias ~ "doen_resp",
      CAUSABAS %in% doen_renais ~ "doen_renais",
      CAUSABAS %in% intox_exogenas ~ "intox_exogenas",
      CAUSABAS %in% doen_infec_veic_hidrica ~ "doen_infec_veic_hidrica",
      CAUSABAS %in% arbov ~ "arbov",
      CAUSABAS %in% doen_pele ~ "doen_pele"
    )
  )
这段代码可以运行,但它漏掉了 B500、B508、B509 等情况。由于我在向量“arbov”中只指定了 B50,它只会对 B50 进行分类,而不会对 B500 或 B508 进行分类。
因此,我想知道应该如何处理这些向量,才能匹配所有以向量中指定代码开头的字符串。于是我想到可以使用 startsWith:
# Attempt to label rows by prefix using startsWith().
# NOTE(review): each call passes a whole group vector as `prefix`. Base R's
# startsWith() compares `x` and `prefix` element by element (recycling the
# shorter one), so each row is checked against one positionally chosen prefix
# rather than against every prefix in the group. That is presumably why the
# counts reported after this snippet are inconsistent. Kept verbatim as the
# "what I tried" example.
sim <- sim %>%
mutate(grupo_causa_basica = case_when(startsWith(CAUSABAS, doen_cardio) ~ "cardio",
startsWith(CAUSABAS, diabetes) ~ "diabetes",
startsWith(CAUSABAS, doencas_respiratorias) ~ "doen_resp",
startsWith(CAUSABAS, doen_renais) ~ "doen_renais",
startsWith(CAUSABAS, intox_exogenas) ~ "intox_exogenas",
startsWith(CAUSABAS, doen_infec_veic_hidrica) ~ "doen_infec_veic_hidrica",
startsWith(CAUSABAS, arbov) ~ "arbov",
startsWith(CAUSABAS, doen_pele) ~ "doen_pele"))
但是,这不是理想的解决方案。 当我第一次尝试时,它给了我类似的东西:
table(sim$grupo_causa_basica)
arbov cardio doen_infec_veic_hidrica doen_renais doen_resp
638 51087 2514 2614 2895
当我第二次尝试时,它给我带来了:
table(sim$grupo_causa_basica)
arbov cardio diabetes doen_infec_veic_hidrica doen_pele
103 5650 5445 173 254
doen_renais doen_resp intox_exogenas
1371 656 17
我该怎么做,才能把所有以向量中指定字符串开头的代码都考虑进去?
我尝试使用数据下载函数时 API 超时了,不过可以像下面这样处理:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(stringr)
# Code prefixes that define each cause-of-death group. Entries are either
# 3-character codes or 4-character codes.
doen_cardio <- c(
  "F01", "G45", "G46", "I10", "I11", "I13", "I15", "I21", "I22", "I23",
  "I24", "I25", "I26", "I27", "I28", "I37", "I49", "I50", "I51", "I52",
  "I63", "I64", "I67", "I68", "I69", "I71", "I72", "I73", "I80", "K55",
  "O10", "O16", "P29", "P60", "R931", "R943"
)
diabetes <- c("E10", "E11", "E12", "E13", "E14", "O24", "P70")
doencas_respiratorias <- c(
  "R91", "R942", "J06", "J16", "J20", "J21", "J22", "J30", "J31", "J34",
  "J40", "J41", "J42", "J45", "J46", "J60", "J61", "J62", "J63", "J64",
  "J66", "J67", "J68", "J69", "J70", "J80", "J81", "J82", "J84", "J96",
  "J98", "J99", "P22", "P26", "P27", "P28", "R04", "R06", "R09", "R84",
  "T17", "W77", "W83", "W84"
)
doen_renais <- c(
  "R934", "R944", "E27", "I12", "N07", "N17", "N18", "N19", "N25", "N27",
  "Q60"
)
intox_exogenas <- c(
  "T45", "T46", "T47", "T48", "T57", "T61", "T62", "T65", "T97", "X47",
  "X49", "Y13", "Y17", "Y19"
)
doen_infec_veic_hidrica <- c(
  "A01", "A02", "A03", "A04", "A05", "A06", "A07", "A08", "A09", "A27",
  "B15", "B58", "B65", "B77", "R10", "A00", "B68", "B69", "B76", "B80",
  "B82", "B89", "N220"
)
arbov <- c(
  "A92", "P354", "A90", "A91", "A95", "B50", "B51", "B52", "B53", "B54"
)
doen_pele <- c("R23", "L08", "L98", "L99")

# Collapse a vector of code prefixes into one regular expression that matches
# any string starting with one of them.
#
# prefixes: non-empty character vector of literal prefixes, without NA. The
#   values are inserted unescaped, so they must not contain regex
#   metacharacters (the codes above are purely alphanumeric).
# Returns: a length-one character string such as "^(B50|B51)".
#
# The leading "^" ties every alternative to the start of the string, so a
# prefix can never match in the middle of a value. No trailing ".*" is needed
# because a detection-style match only has to find the prefix itself.
build_prefix_pattern <- function(prefixes) {
  stopifnot(
    is.character(prefixes),
    length(prefixes) > 0L,
    !anyNA(prefixes)
  )
  paste0("^(", paste(prefixes, collapse = "|"), ")")
}

# Each group name is deliberately rebound from its prefix vector to the
# matching pattern string: the classification step further down passes these
# same names to str_detect() as patterns.
doen_cardio <- build_prefix_pattern(doen_cardio)
diabetes <- build_prefix_pattern(diabetes)
doencas_respiratorias <- build_prefix_pattern(doencas_respiratorias)
doen_renais <- build_prefix_pattern(doen_renais)
intox_exogenas <- build_prefix_pattern(intox_exogenas)
doen_infec_veic_hidrica <- build_prefix_pattern(doen_infec_veic_hidrica)
arbov <- build_prefix_pattern(arbov)
doen_pele <- build_prefix_pattern(doen_pele)
# Six-row sample of the data, built column by column. The three OBITO*
# columns are all-NA logical vectors; the remaining numeric columns are
# doubles.
tib <- tibble::tibble(
  CAUSABAS = c("I110", "C349", "C490", "I219", "I259", "I678"),
  OBITOGRAV = rep(NA, 6),
  OBITOPARTO = rep(NA, 6),
  OBITOPUERP = rep(NA, 6),
  ANO = rep(2013, 6),
  idade = c(95.58, 80.70, 97.90, 87.60, 62.57, 51.99),
  idade_dias = c(34909, 29474, 35757, 31995, 22853, 18988)
)
# Label each row with the first group whose pattern is detected in CAUSABAS.
# Branches are tried top to bottom; rows matching no pattern get NA because no
# fallback branch is supplied. The result is printed, not assigned.
tib %>%
  mutate(
    grupo_causa_basica = case_when(
      str_detect(CAUSABAS, doen_cardio) ~ "cardio",
      str_detect(CAUSABAS, diabetes) ~ "diabetes",
      str_detect(CAUSABAS, doencas_respiratorias) ~ "doen_resp",
      str_detect(CAUSABAS, doen_renais) ~ "doen_renais",
      str_detect(CAUSABAS, intox_exogenas) ~ "intox_exogenas",
      str_detect(CAUSABAS, doen_infec_veic_hidrica) ~ "doen_infec_veic_hidrica",
      str_detect(CAUSABAS, arbov) ~ "arbov",
      str_detect(CAUSABAS, doen_pele) ~ "doen_pele"
    )
  )
#> # A tibble: 6 × 8
#> CAUSABAS OBITOGRAV OBITOPARTO OBITOPUERP ANO idade idade_dias grupo_causa_…¹
#> <chr> <lgl> <lgl> <lgl> <dbl> <dbl> <dbl> <chr>
#> 1 I110 NA NA NA 2013 95.6 34909 cardio
#> 2 C349 NA NA NA 2013 80.7 29474 <NA>
#> 3 C490 NA NA NA 2013 97.9 35757 <NA>
#> 4 I219 NA NA NA 2013 87.6 31995 cardio
#> 5 I259 NA NA NA 2013 62.6 22853 cardio
#> 6 I678 NA NA NA 2013 52.0 18988 cardio
#> # … with abbreviated variable name ¹grupo_causa_basica
由reprex package (v2.0.1) 创建于 2023-02-01
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.