繁体   English   中英

如何在 R 中将因子变量转换为数字后保留其标签?

[英]How to keep labels of a factor variable after transforming it to numeric in R?

我有以下三个主要变量的数据:

i) 教育(因子):代表三种不同的教育水平(1、2、3)

ii) 份额(数字):代表该国每个教育水平的人口百分比

iii) 国家(因子):代表 30 个国家。

目的是绘制每个国家各教育水平所占的份额,并按最高教育水平 (3) 的份额对国家排序,从该份额最低的国家排到该份额最高的国家。 问题是最后国家标签会丢失,因为我必须先把该变量转换为数字才能重新排序。 绘制图表后,国家被标记为 (25, 6, 26, 17),而不是正确的标签 (PT, CZ, RO, IT)。 我在 Stack Overflow 上阅读了多个相关帖子,但都没有解决这个问题。 有没有办法在重新排序后保留国家标签,这样我以后就不必手动输入它们了?

library(forcats)
library(ggplot2)
library(dplyr)

 # Replace each country by the integer code of its factor level, and round-trip
 # educ through integer so that it ends up as a factor again.
 x$country <- as.numeric(x$cntry2)
 x$educ <- as.factor(as.integer(x$educ))

# Levels of the numeric country code, ordered by the share of education
# level 3 (fct_reorder with .desc = FALSE sorts from lowest to highest).
country_order <- with(
  filter(x, educ == 3),
  levels(fct_reorder(factor(country), share, .desc = FALSE))
)

# Apply that level order to the country column of the full data set.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))

# Bars per country, filled by education level. The x axis is built from the
# numeric codes, so the tick labels are numbers rather than country names.
ggplot(data = df2, mapping = aes(x = country, y = share)) +
  geom_col(mapping = aes(fill = educ), color = "black") +
  labs(x = "Country", fill = "Education") +
  theme_classic()

这是下面的数据:

# dput() representation of the example data: a tibble with 81 rows and four
# columns.
#   educ    factor with levels "1", "2", "3"
#   cntry2  factor with 27 two-letter country codes as levels
#   share   numeric
#   country numeric country code
# The expression is not assigned here; the surrounding snippets refer to the
# data as `x`, so assign the result to `x` before running them.
structure(list(educ = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", 
"3"), class = "factor"), cntry2 = structure(c(1L, 1L, 1L, 2L, 
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 
7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 
16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 
21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
25L, 25L, 26L, 26L, 26L, 27L, 27L, 27L), .Label = c("AU", "BE", 
"BG", "CH", "CZ", "DK", "EE", "ES", "FI", "FR", "GR", "HU", "IE", 
"IS", "IT", "LT", "LU", "LV", "NL", "NO", "PO", "PT", "RO", "SE", 
"SK", "SV", "UK"), class = "factor"), share = c(14.9585723390695, 
64.8311026131294, 20.2103250478011, 20.3203525363306, 37.9050825638106, 
41.7745648998589, 20.5482068669118, 58.6719831908696, 20.7798099422186, 
11.0478359908884, 52.7334851936219, 36.2186788154898, 8.1806499751285, 
77.2156358812801, 14.6037141435914, 18.43684842358, 44.6831364124597, 
36.8800151639603, 13.0425889732285, 58.1996272896687, 28.7577837371029, 
42.6625051189251, 24.1934234264148, 33.1440714546602, 16.4821228232769, 
46.3050582898395, 37.2128188868836, 22.0117072122872, 47.7342785027657, 
30.2540142849471, 31.6958715347475, 40.8370856615852, 27.4670428036673, 
15.620426612099, 63.1486925776748, 21.2308808102263, 27.79203576455, 
33.4878715125424, 38.7200927229075, 29.0666986564299, 41.950575815739, 
28.9827255278311, 36.0270124068613, 47.1984225312789, 16.7745650618598, 
8.20398339670027, 60.9892218075273, 30.8067947957724, 37.0050817095017, 
37.4766935985084, 25.5182246919899, 15.7399902739504, 59.1482759419216, 
25.111733784128, 19.2624176167015, 43.4944817814291, 37.2431006018693, 
17.6501727404436, 44.6784798840967, 37.6713473754597, 10.0098831213475, 
69.2849776555517, 20.7051392231007, 64.5019644095216, 21.5391726369309, 
13.9588629535475, 21.8434913468774, 62.6661650363682, 15.4903436167545, 
11.4840104928012, 55.3435190932938, 33.172470413905, 4.23006072183939, 
74.1147574537763, 21.6551818243843, 15.6869892409901, 61.3851490387442, 
22.9278617202657, 14.2357801080394, 49.3703276303246, 36.393892261636
), country = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 6, 6, 7, 
7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 14, 
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 
19, 20, 20, 20, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 
26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30)), row.names = c(NA, 
-81L), class = c("tbl_df", "tbl", "data.frame"))

试试这种方法(先把国家标签另存为一列):

library(ggplot2)
library(dplyr)
# Data
# Save the country code as text in `lab` before `country` is overwritten with
# the integer code of the factor level; `lab` is what keeps the names around.
x[["lab"]] <- as.character(x[["cntry2"]])
x[["country"]] <- as.numeric(x[["cntry2"]])
# Round-trip educ through integer so that it ends up as a factor again.
x[["educ"]] <- as.factor(as.integer(x[["educ"]]))

现在,我们存储标签:

# Labels
# Lookup table: the first row seen for each numeric country code, reduced to
# the code and its text label.
# Note: this data frame shares its name with ggplot2's labs(); a call such as
# labs(...) still resolves to the function, because R skips non-function
# objects when looking up a name in call position.
labs <- x[!duplicated(x$country), c("country", "lab")]

然后更多的数据处理:

# Data
# Levels of the numeric country code, ordered by the share of education
# level 3 (.desc = FALSE sorts from lowest to highest share).
# fct_reorder() and fct_relevel() come from forcats. They are called with an
# explicit namespace because this snippet only attaches ggplot2 and dplyr, so
# the bare names would fail with "could not find function" in a fresh session.
country_order <- x %>%
  filter(educ == 3) %>%
  mutate(
    country = forcats::fct_reorder(factor(country), share, .desc = FALSE)
  ) %>%
  pull(country) %>%
  levels()

# Apply that level order to the country column of the full data set.
df2 <- x %>%
  mutate(country = forcats::fct_relevel(factor(country), country_order))

最后绘图:

# Plot
# Bars per country, filled by education level. The x axis still uses the
# numeric country codes; scale_x_discrete() swaps the tick text for the stored
# labels by looking up each code of country_order in the `labs` table.
ggplot(data = df2, mapping = aes(x = country, y = share)) +
  geom_col(mapping = aes(fill = educ), color = "black") +
  scale_x_discrete(labels = labs$lab[match(country_order, labs$country)]) +
  labs(x = "Country", fill = "Education") +
  theme_classic()

Output:

在此处输入图像描述

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM