[英]How to add labels to the PCA plot from my dataframe
我有一個數據集,想繪製 PCA 圖。在此圖中,觀測值應根據 name
列( habillage = a$name
)分組,同組以相同顏色顯示。此外,我希望每個觀測值都能標示出它屬於哪個 Age
組。我發現設定 label = "none"
時不會顯示任何標籤,但即使改寫成 label = a$Age
,圖形也沒有任何變化。最後,如何避免圖例中出現與 habillage = a$name
重複的黑/白文字?
# Attach the packages used below. plyr is attached before dplyr so that
# dplyr's same-named verbs (summarise, arrange, desc, ...) take precedence.
library(plyr)
library(dplyr)
library(factoextra)
# Reproducible snapshot of the summarised data (dput-style literal):
# a 40-row grouped tibble (groups: effective_status, Age) with the factor
# columns effective_status, Age and name, followed by the numeric columns
# n_obs, Clicks, Impressions, Reach, Spend, Purchase, PurchaseValue and
# Date_minus_start_time.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54",
"55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L,
23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
21L, 22L), .Label = c("Automated Boost", "Competitors January",
"Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
"Marketing August", "Marketing December", "Marketing February",
"Marketing January", "Marketing July", "Marketing June",
"Marketing March", "Marketing May", "Upsell April", "Upsell August",
"Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
"Upsell June", "Upsell March", "Upsell May"), class = "factor"),
n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L,
0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L,
30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L,
25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16,
3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L,
163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22,
173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29,
5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
21, 5)), row.names = c(NA, -40L), groups = structure(list(
effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17",
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
# Build the summary table `a`:
#   1. draw 100 random rows from `helmes`,
#   2. drop the three campaigns that should not be analysed,
#   3. aggregate per effective_status / Age / name,
#   4. sort by total purchase value, largest first.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  subset(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    # Totals of the additive metrics, ignoring missing values.
    across(
      c(Clicks, Impressions, Reach, Spend, Purchase, PurchaseValue),
      ~ sum(.x, na.rm = TRUE)
    ),
    # The date offset is averaged rather than summed.
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE)
  ) %>%
  arrange(desc(PurchaseValue))
# PCA on every column of `a` from the 4th onwards, with the variables
# standardised first. `scale.` is spelled out in full: prcomp() has no
# argument called `scale`, so `scale = TRUE` only worked through partial
# argument matching.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)

# Plot of the individuals, coloured by campaign name.
# NOTE(review): per the factoextra documentation, `label` selects which
# element types get labelled ("all", "none", "ind", ...); it does not take a
# vector of label texts -- presumably why `label = a$Age` changes nothing.
fviz_pca_ind(res.pca,
  # col.ind = a$name, # color by groups
  label = "none",
  # geom = c("point", "text"),
  habillage = a$name, # color by groups
  # palette = c("#00AFBB", "#FC4E07", "#2CA25F"),
  addEllipses = TRUE, # Concentration ellipses
  ellipse.type = "confidence",
  legend.title = "Groups",
  repel = TRUE
)
您可以提取計算出的 PCA 分數,然後用 ggplot 自行繪圖:
library(tidyverse)
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at
# Reproducible example data (dput-style literal): a 40-row grouped tibble
# (groups: effective_status, Age) holding one row per
# effective_status / Age / name combination together with its metrics.
df <- structure(list(
effective_status = structure(c(
1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L
), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
Age = structure(c(
3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
), .Label = c(
"13-17", "18-24", "25-34", "35-44", "45-54",
"55-64", "65+", "Unknown"
), class = "factor"), name = structure(c(
19L,
23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
21L, 22L
), .Label = c(
"Automated Boost", "Competitors January",
"Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
"Marketing August", "Marketing December", "Marketing February",
"Marketing January", "Marketing July", "Marketing June",
"Marketing March", "Marketing May", "Upsell April", "Upsell August",
"Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
"Upsell June", "Upsell March", "Upsell May"
), class = "factor"),
n_obs = c(
1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L
), Clicks = c(
1364L,
0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
0L, 68L, 0L, 0L, 0L
), Impressions = c(
12409L, 0L, 58222L,
30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
0L, 948L, 0L, 2972L, 0L, 0L, 0L
), Reach = c(
12164L, 0L, 46142L,
25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
948L, 0L, 2782L, 0L, 0L, 0L
), Spend = c(
1153.11, 0, 9663.16,
3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05
), Purchase = c(
140L,
163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), PurchaseValue = c(
221595.22,
173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
),
Date_minus_start_time = c(
9, 13, 15, 26.3055555555556, 29,
5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
21, 5
)
), row.names = c(NA, -40L), groups = structure(list(
effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c(
"13-17",
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
), class = "factor"), .rows = structure(list(
c(8L, 11L, 12L), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
c(6L, 19L, 20L, 21L), c(
3L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L
), 31:37, c(2L, 10L, 38L, 39L, 40L)
), ptype = integer(0), class = c(
"vctrs_list_of",
"vctrs_vctr", "list"
))
), row.names = c(NA, -7L), class = c(
"tbl_df",
"tbl", "data.frame"
), .drop = TRUE), class = c(
"grouped_df",
"tbl_df", "tbl", "data.frame"
))
# Print the tibble to inspect its columns and first rows.
df
#> # A tibble: 40 x 11
#> # Groups: effective_status, Age [7]
#> effective_status Age name n_obs Clicks Impressions Reach Spend Purchase
#> <fct> <fct> <fct> <int> <int> <int> <int> <dbl> <int>
#> 1 ACTIVE 25-34 Upsell… 1 1364 12409 12164 1153. 140
#> 2 ACTIVE Unkno… Upsell… 1 0 0 0 0 163
#> 3 ACTIVE 55-64 Upsell… 1 4919 58222 46142 9663. 104
#> 4 ACTIVE 35-44 Upsell… 3 2597 30115 25282 3202. 33
#> 5 ACTIVE 35-44 Market… 2 2641 47119 35142 3393. 22
#> 6 ACTIVE 45-54 Market… 2 0 0 0 0 17
#> 7 ACTIVE 35-44 Market… 2 915 18817 14843 1739. 11
#> 8 ACTIVE 18-24 Market… 1 1104 17068 13533 1344. 13
#> 9 ACTIVE 35-44 Upsell… 1 63 4175 3624 502. 2
#> 10 ACTIVE Unkno… Market… 2 0 0 0 0 0
#> # … with 30 more rows, and 2 more variables: PurchaseValue <dbl>,
#> # Date_minus_start_time <dbl>
# PCA on the seven metric columns, standardised first. `scale.` is spelled
# out in full: prcomp() has no argument called `scale`, so `scale = TRUE`
# only worked through partial argument matching.
res.pca <- prcomp(
  df[, c(
    "Clicks", "Impressions", "Reach", "Spend",
    "Purchase", "PurchaseValue", "Date_minus_start_time"
  )],
  scale. = TRUE
)

# Take the data behind factoextra's individuals plot (its `x` / `y` columns
# are used as coordinates) and attach the original columns so the points can
# be drawn with custom labels.
# Both inputs have a `name` column, so bind_cols() renames them to name...1
# and name...9; name...9 is the campaign name that came from `df`.
fviz_pca_ind(res.pca)$data %>%
  bind_cols(df) %>%
  ggplot(aes(x, y, color = name...9)) +
  geom_label(aes(label = Age)) + # label each observation with its Age group
  labs(color = "Name")
#> New names:
#> * name -> name...1
#> * name -> name...9
由reprex 包(v2.0.1) 於 2021 年 9 月 17 日創建
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.