[英]How to keep labels of a factor variable after transforming it to numeric in R?
我有以下三个主要变量的数据:
i) 教育(因子):代表三种不同的教育水平(1、2、3)
ii) 份额(数值):代表该国各教育水平的人口百分比
iii) 国家(因子):代表 30 个国家。
我的目标是绘制每个国家各教育水平的份额,并按最高教育水平 (3) 的份额把国家从最低到最高重新排序。问题是最后国家标签会丢失,因为我必须先把该变量转换为数值才能重新排序。绘制图表后,国家被标记为 (25, 6, 26, 17),而不是正确的标签 (PT, CZ, RO, IT)。我阅读了 Stack Overflow 上的多个帖子,但都没有解决这个问题。有没有办法在重新排序后保留国家的标签,这样我之后就不必手动输入它们了?
library(forcats)
library(ggplot2)
library(dplyr)
# Replace the country factor by its integer codes and normalise `educ` to a
# factor built from its integer codes.
x$country <- as.numeric(x$cntry2)
x$educ <- as.factor(as.integer(x$educ))

# Country codes sorted by ascending share of education level 3.
country_order <- levels(
  with(
    filter(x, educ == 3),
    fct_reorder(factor(country), share, .desc = FALSE)
  )
)

# Apply that ordering to the country factor of the full data set.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))

# Stacked columns per country; the x axis shows the numeric codes because the
# factor levels were built from `country`, not from the abbreviations.
ggplot(df2, aes(x = country, y = share)) +
  geom_col(aes(fill = educ), color = "black") +
  labs(x = "Country", fill = "Education") +
  theme_classic()
数据如下:
# Example data (dput() output), assigned to `x` so that the snippets which
# reference `x` can be run as-is.
# Columns: educ (factor, levels "1"-"3"), cntry2 (factor of country
# abbreviations), share (numeric), country (numeric country codes).
x <- structure(list(educ = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2",
"3"), class = "factor"), cntry2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L,
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L,
16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L,
21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L,
25L, 25L, 26L, 26L, 26L, 27L, 27L, 27L), .Label = c("AU", "BE",
"BG", "CH", "CZ", "DK", "EE", "ES", "FI", "FR", "GR", "HU", "IE",
"IS", "IT", "LT", "LU", "LV", "NL", "NO", "PO", "PT", "RO", "SE",
"SK", "SV", "UK"), class = "factor"), share = c(14.9585723390695,
64.8311026131294, 20.2103250478011, 20.3203525363306, 37.9050825638106,
41.7745648998589, 20.5482068669118, 58.6719831908696, 20.7798099422186,
11.0478359908884, 52.7334851936219, 36.2186788154898, 8.1806499751285,
77.2156358812801, 14.6037141435914, 18.43684842358, 44.6831364124597,
36.8800151639603, 13.0425889732285, 58.1996272896687, 28.7577837371029,
42.6625051189251, 24.1934234264148, 33.1440714546602, 16.4821228232769,
46.3050582898395, 37.2128188868836, 22.0117072122872, 47.7342785027657,
30.2540142849471, 31.6958715347475, 40.8370856615852, 27.4670428036673,
15.620426612099, 63.1486925776748, 21.2308808102263, 27.79203576455,
33.4878715125424, 38.7200927229075, 29.0666986564299, 41.950575815739,
28.9827255278311, 36.0270124068613, 47.1984225312789, 16.7745650618598,
8.20398339670027, 60.9892218075273, 30.8067947957724, 37.0050817095017,
37.4766935985084, 25.5182246919899, 15.7399902739504, 59.1482759419216,
25.111733784128, 19.2624176167015, 43.4944817814291, 37.2431006018693,
17.6501727404436, 44.6784798840967, 37.6713473754597, 10.0098831213475,
69.2849776555517, 20.7051392231007, 64.5019644095216, 21.5391726369309,
13.9588629535475, 21.8434913468774, 62.6661650363682, 15.4903436167545,
11.4840104928012, 55.3435190932938, 33.172470413905, 4.23006072183939,
74.1147574537763, 21.6551818243843, 15.6869892409901, 61.3851490387442,
22.9278617202657, 14.2357801080394, 49.3703276303246, 36.393892261636
), country = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 6, 6, 7,
7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 14,
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
19, 20, 20, 20, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25,
26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30)), row.names = c(NA,
-81L), class = c("tbl_df", "tbl", "data.frame"))
试试下面这种方法:
# forcats supplies fct_reorder() and fct_relevel(), which the data-processing
# step of this answer calls; without it the snippet fails in a fresh session.
library(forcats)
library(ggplot2)
library(dplyr)

# Data
# Keep the country abbreviation as text before the factor is reduced to its
# integer codes, so the labels can be restored on the plot axis later.
x$lab <- as.character(x$cntry2)
x$country <- as.numeric(x$cntry2)
# Rebuild `educ` as a factor from its integer codes.
x$educ <- as.factor(as.integer(x$educ))
现在,我们存储标签:
# Labels
# Lookup table: first row of each distinct country code, reduced to the code
# and its text abbreviation.
labs <- x[!duplicated(x$country), c("country", "lab")]
然后进一步处理数据:
# Data
# Country codes ordered by ascending share among the rows with educ == 3.
# fct_reorder() / fct_relevel() come from forcats, which must be attached.
country_order <- x %>%
  filter(educ == 3) %>%
  with(levels(fct_reorder(factor(country), share, .desc = FALSE)))

# Full data set with the country factor re-levelled to that ordering.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))
最后绘图:
# Plot
# The axis breaks are the reordered country codes; match() looks up each code
# in the `labs` table so the abbreviations are shown in the same order.
# (`labs` the data frame and ggplot2's labs() coexist: R skips non-function
# objects when resolving a function call.)
ggplot(df2, aes(x = country, y = share)) +
  geom_col(aes(fill = educ), color = "black") +
  scale_x_discrete(labels = labs$lab[match(country_order, labs$country)]) +
  labs(x = "Country", fill = "Education") +
  theme_classic()
Output:
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.