I have a large data.frame called " sim
" that has a character colunm named " CAUSABAS
", something like this:
CAUSABAS OBITOGRAV OBITOPARTO OBITOPUERP ANO idade idade_dias
1 I110 <NA> <NA> <NA> 2013 95.58 34909
2 C349 <NA> <NA> <NA> 2013 80.70 29474
3 C490 <NA> <NA> <NA> 2013 97.90 35757
4 I219 <NA> <NA> <NA> 2013 87.60 31995
5 I259 <NA> <NA> <NA> 2013 62.57 22853
6 I678 <NA> <NA> <NA> 2013 51.99 18988
This data.frame is created using the package microdatasus
, like that:
sim_mg <- fetch_datasus(year_start = 2013, year_end = 2018,
information_system = "SIM-DO",
uf = "MG")
sim_es <- fetch_datasus(year_start = 2013, year_end = 2018,
information_system = "SIM-DO",
uf = "ES")
sim <- bind_rows(sim_es, sim_mg) %>%
select(TIPOBITO, DTOBITO, DTNASC, CODMUNOCOR, CODMUNRES, TPMORTEOCO, TPOBITOCOR, LINHAA, LINHAB, LINHAC, LINHAD, LINHAII, CAUSABAS, OBITOGRAV, OBITOPARTO, OBITOPUERP) %>%
mutate(ANO = substr(DTOBITO, 5, 8))
sim$CAUSABAS <- unfactor(sim$CAUSABAS)
I want to create the following column: grupo_causa_basica
, that classifies these CAUSABAS
.
Some of the observation has 3 strings, like B50
, but another has 4, like B501
.
The group of B50
has, for example, 3 other CAUSABAS
, B500
, B508
and B509
. Sometimes the input is B50
, but not always. All the observations follow this kind of organization.
I've created some vectors to specify what is what:
doen_cardio <- c("F01","G45","G46","I10","I11","I13","I15","I21","I22","I23","I24","I25",
"I26","I27","I28","I37","I49","I50","I51","I52","I63","I64","I67","I68",
"I69","I71","I72","I73","I80","K55","O10","O16","P29","P60","R931","R943")
diabetes <- c("E10","E11","E12","E13","E14","O24","P70")
doencas_respiratorias <- c("R91","R942","J06","J16","J20","J21","J22","J30","J31","J34",
"J40","J41","J42","J45","J46","J60","J61","J62","J63","J64",
"J66","J67","J68","J69","J70","J80","J81","J82","J84","J96",
"J98","J99","P22","P26","P27","P28","R04","R06","R09","R84",
"T17","W77","W83","W84")
doen_renais <- c("R934","R944","E27","I12","N07","N17","N18","N19","N25","N27","Q60")
intox_exogenas <- c("T45","T46","T47","T48","T57","T61","T62","T65","T97","X47","X49",
"Y13","Y17","Y19")
doen_infec_veic_hidrica <- c("A01","A02","A03","A04","A05","A06","A07","A08","A09","A27",
"B15","B58","B65","B77","R10","A00","B68","B69","B76","B80",
"B82","B89","N220")
arbov <- c("A92","P354","A90","A91","A95","B50","B51","B52","B53","B54")
doen_pele <- c("R23","L08","L98","L99")
After that I tried to use mutate
plus case_when
:
sim <- sim %>%
mutate(grupo_causa_basica = case_when(CAUSABAS %in% doen_cardio ~ "cardio",
CAUSABAS %in% diabetes ~ "diabetes",
CAUSABAS %in% doencas_respiratorias ~ "doen_resp",
CAUSABAS %in% doen_renais ~ "doen_renais",
CAUSABAS %in% intox_exogenas ~ "intox_exogenas",
CAUSABAS %in% doen_infec_veic_hidrica ~ "doen_infec_veic_hidrica",
CAUSABAS %in% arbov ~ "arbov",
CAUSABAS %in% doen_pele ~ "doen_pele"))
These worked, however, my code forgot about the cases like B500
, B508
, B509
. As I specified in the vector " arbov
" only B50
, it only classified B50
, not B500
or B508
.
In that way, I was wondering that I had to do something with the vectors: take all strings in the vectors that starts with these specifications. So I realized that I could use startsWith
:
sim <- sim %>%
mutate(grupo_causa_basica = case_when(startsWith(CAUSABAS, doen_cardio) ~ "cardio",
startsWith(CAUSABAS, diabetes) ~ "diabetes",
startsWith(CAUSABAS, doencas_respiratorias) ~ "doen_resp",
startsWith(CAUSABAS, doen_renais) ~ "doen_renais",
startsWith(CAUSABAS, intox_exogenas) ~ "intox_exogenas",
startsWith(CAUSABAS, doen_infec_veic_hidrica) ~ "doen_infec_veic_hidrica",
startsWith(CAUSABAS, arbov) ~ "arbov",
startsWith(CAUSABAS, doen_pele) ~ "doen_pele"))
However, it was not the desirable solution. When I table the first try, it gives me something like:
table(sim$grupo_causa_basica)
arbov cardio doen_infec_veic_hidrica doen_renais doen_resp
638 51087 2514 2614 2895
and when I table the second try, it brings me that:
table(sim$grupo_causa_basica)
arbov cardio diabetes doen_infec_veic_hidrica doen_pele
103 5650 5445 173 254
doen_renais doen_resp intox_exogenas
1371 656 17
How can I do that considering all the strings that initiate with the specified in the vectors?
The API timed out when I tried to use the data downloading functions, but what about something like this:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(stringr)
doen_cardio <- c("F01","G45","G46","I10","I11","I13","I15","I21","I22","I23","I24","I25",
"I26","I27","I28","I37","I49","I50","I51","I52","I63","I64","I67","I68",
"I69","I71","I72","I73","I80","K55","O10","O16","P29","P60","R931","R943")
diabetes <- c("E10","E11","E12","E13","E14","O24","P70")
doencas_respiratorias <- c("R91","R942","J06","J16","J20","J21","J22","J30","J31","J34",
"J40","J41","J42","J45","J46","J60","J61","J62","J63","J64",
"J66","J67","J68","J69","J70","J80","J81","J82","J84","J96",
"J98","J99","P22","P26","P27","P28","R04","R06","R09","R84",
"T17","W77","W83","W84")
doen_renais <- c("R934","R944","E27","I12","N07","N17","N18","N19","N25","N27","Q60")
intox_exogenas <- c("T45","T46","T47","T48","T57","T61","T62","T65","T97","X47","X49",
"Y13","Y17","Y19")
doen_infec_veic_hidrica <- c("A01","A02","A03","A04","A05","A06","A07","A08","A09","A27",
"B15","B58","B65","B77","R10","A00","B68","B69","B76","B80",
"B82","B89","N220")
arbov <- c("A92","P354","A90","A91","A95","B50","B51","B52","B53","B54")
doen_pele <- c("R23","L08","L98","L99")
doen_cardio <- ifelse(nchar(doen_cardio) == 3, paste0(doen_cardio, ".*"), doen_cardio)
diabetes <- ifelse(nchar(diabetes) == 3, paste0(diabetes, ".*"), diabetes)
doencas_respiratorias <- ifelse(nchar(doencas_respiratorias) == 3, paste0(doencas_respiratorias, ".*"), doencas_respiratorias)
doen_renais <- ifelse(nchar(doen_renais) == 3, paste0(doen_renais, ".*"), doen_renais)
intox_exogenas <- ifelse(nchar(intox_exogenas) == 3, paste0(intox_exogenas, ".*"), intox_exogenas)
doen_infec_veic_hidrica <- ifelse(nchar(doen_infec_veic_hidrica) == 3, paste0(doen_infec_veic_hidrica, ".*"), doen_infec_veic_hidrica)
arbov <- ifelse(nchar(arbov) == 3, paste0(arbov, ".*"), arbov)
doen_pele <- ifelse(nchar(doen_pele) == 3, paste0(doen_pele, ".*"), doen_pele)
doen_cardio <- paste(doen_cardio, collapse="|")
diabetes <- paste(diabetes, collapse="|")
doencas_respiratorias <- paste(doencas_respiratorias, collapse="|")
doen_renais <- paste(doen_renais, collapse="|")
intox_exogenas <- paste(intox_exogenas, collapse="|")
doen_infec_veic_hidrica <- paste(doen_infec_veic_hidrica, collapse="|")
arbov <- paste(arbov, collapse="|")
doen_pele <- paste(doen_pele, collapse="|")
tib <- tibble::tribble(
~CAUSABAS, ~OBITOGRAV, ~OBITOPARTO, ~OBITOPUERP, ~ANO, ~idade, ~idade_dias,
"I110", NA, NA, NA, 2013, 95.58, 34909,
"C349", NA, NA, NA, 2013, 80.70, 29474,
"C490", NA, NA, NA, 2013, 97.90, 35757,
"I219", NA, NA, NA, 2013, 87.60, 31995,
"I259", NA, NA, NA, 2013, 62.57, 22853,
"I678", NA, NA, NA, 2013, 51.99, 18988)
tib %>%
mutate(grupo_causa_basica = case_when(str_detect(CAUSABAS, doen_cardio) ~ "cardio",
str_detect(CAUSABAS, diabetes) ~ "diabetes",
str_detect(CAUSABAS, doencas_respiratorias) ~ "doen_resp",
str_detect(CAUSABAS, doen_renais) ~ "doen_renais",
str_detect(CAUSABAS, intox_exogenas) ~ "intox_exogenas",
str_detect(CAUSABAS, doen_infec_veic_hidrica) ~ "doen_infec_veic_hidrica",
str_detect(CAUSABAS, arbov) ~ "arbov",
str_detect(CAUSABAS, doen_pele) ~ "doen_pele"))
#> # A tibble: 6 × 8
#> CAUSABAS OBITOGRAV OBITOPARTO OBITOPUERP ANO idade idade_dias grupo_causa_…¹
#> <chr> <lgl> <lgl> <lgl> <dbl> <dbl> <dbl> <chr>
#> 1 I110 NA NA NA 2013 95.6 34909 cardio
#> 2 C349 NA NA NA 2013 80.7 29474 <NA>
#> 3 C490 NA NA NA 2013 97.9 35757 <NA>
#> 4 I219 NA NA NA 2013 87.6 31995 cardio
#> 5 I259 NA NA NA 2013 62.6 22853 cardio
#> 6 I678 NA NA NA 2013 52.0 18988 cardio
#> # … with abbreviated variable name ¹grupo_causa_basica
Created on 2023-02-01 by the reprex package (v2.0.1)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.