如何從我的數據框中向 PCA 圖添加標簽

Question

我有一個數據集，想繪製 PCA 圖。在此圖中，觀測值應根據 name 列（habillage = a$name）以相同顏色分組。此外，我希望每個觀測值都能標示出它屬於哪個 Age 組。我發現 label = "none" 不會顯示標籤，但改寫成 label = a$Age 也沒有任何變化。最後，如何避免圖例中出現與 habillage = a$name 重複的黑/白文字？

# Attach plyr before dplyr so that dplyr's verbs (e.g. summarise) mask the
# plyr functions of the same name rather than the other way round.
library(plyr)
library(dplyr)
library(factoextra)
# Example data as a structure() literal: a 40-row tibble of class grouped_df
# (grouping columns: effective_status, Age) with the columns
# effective_status, Age, name, n_obs, Clicks, Impressions, Reach, Spend,
# Purchase, PurchaseValue and Date_minus_start_time.
# NOTE(review): presumably the dput() of the summarised table `a` built
# further down — confirm.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), 
    Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L, 
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
    ), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54", 
    "55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L, 
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L, 
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L, 
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L, 
    21L, 22L), .Label = c("Automated Boost", "Competitors January", 
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April", 
    "Marketing August", "Marketing December", "Marketing February", 
    "Marketing January", "Marketing July", "Marketing June", 
    "Marketing March", "Marketing May", "Upsell April", "Upsell August", 
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July", 
    "Upsell June", "Upsell March", "Upsell May"), class = "factor"), 
    n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L, 
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L, 
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L, 
    0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L, 
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L, 
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L, 
    0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L, 
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L, 
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L, 
    948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16, 
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22, 
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12, 
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6, 
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L, 
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22, 
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66, 
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29, 
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19, 
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19, 
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18, 
    21, 5)), row.names = c(NA, -40L), groups = structure(list(
    effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
    ), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
    ), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L), 
        c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L, 
        27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))


# Build the summary table `a`: draw 100 random rows from the raw data, drop
# three campaigns by name, then aggregate to one row per
# (effective_status, Age, name) combination, sorted by PurchaseValue
# (largest first).
# NOTE(review): `helmes` is not defined anywhere in this snippet — presumably
# the asker's raw source table; it must exist before this runs.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  subset(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    Clicks = sum(Clicks, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Spend = sum(Spend, na.rm = TRUE),
    Purchase = sum(Purchase, na.rm = TRUE),
    PurchaseValue = sum(PurchaseValue, na.rm = TRUE),
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE)
  ) %>%
  # summarise() peels off only the last grouping variable, so the result would
  # otherwise stay grouped by effective_status and Age; drop that explicitly.
  ungroup() %>%
  arrange(desc(PurchaseValue))


# PCA on every column from the 4th onward (the first three columns are the
# effective_status / Age / name keys). The argument is spelled `scale.` in
# full: `scale = TRUE` only works through partial argument matching.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)

# Plot the PCA individuals, coloured by campaign name, with a confidence
# ellipse per group and no labels on the points.
# Alternatives left disabled by the author:
#   col.ind = a$name
#   geom = c("point", "text")
#   palette = c("#00AFBB", "#FC4E07", "#2CA25F")
fviz_pca_ind(
  res.pca,
  label = "none",
  habillage = a$name,           # grouping variable used for the colours
  addEllipses = TRUE,           # draw one ellipse per group
  ellipse.type = "confidence",
  legend.title = "Groups",
  repel = TRUE
)

Answer 1

您可以提取計算出的 PCA 分數，然後用 ggplot 自行繪圖：

library(tidyverse)
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at

# Example data as a structure() literal: a 40-row tibble of class grouped_df
# (grouping columns: effective_status, Age) holding the campaign keys
# (effective_status, Age, name), n_obs, and seven numeric metrics.
df <- structure(list(
  effective_status = structure(c(
    1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L
  ), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
  Age = structure(c(
    3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
  ), .Label = c(
    "13-17", "18-24", "25-34", "35-44", "45-54",
    "55-64", "65+", "Unknown"
  ), class = "factor"), name = structure(c(
    19L,
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
    21L, 22L
  ), .Label = c(
    "Automated Boost", "Competitors January",
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
    "Marketing August", "Marketing December", "Marketing February",
    "Marketing January", "Marketing July", "Marketing June",
    "Marketing March", "Marketing May", "Upsell April", "Upsell August",
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
    "Upsell June", "Upsell March", "Upsell May"
  ), class = "factor"),
  n_obs = c(
    1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L
  ), Clicks = c(
    1364L,
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
    0L, 68L, 0L, 0L, 0L
  ), Impressions = c(
    12409L, 0L, 58222L,
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
    0L, 948L, 0L, 2972L, 0L, 0L, 0L
  ), Reach = c(
    12164L, 0L, 46142L,
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
    948L, 0L, 2782L, 0L, 0L, 0L
  ), Spend = c(
    1153.11, 0, 9663.16,
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05
  ), Purchase = c(
    140L,
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
  ), PurchaseValue = c(
    221595.22,
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  ),
  Date_minus_start_time = c(
    9, 13, 15, 26.3055555555556, 29,
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
    21, 5
  )
), row.names = c(NA, -40L), groups = structure(list(
  effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c(
    "13-17",
    "18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
  ), class = "factor"), .rows = structure(list(
    c(8L, 11L, 12L), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
    c(6L, 19L, 20L, 21L), c(
      3L, 22L, 23L, 24L, 25L, 26L,
      27L, 28L, 29L, 30L
    ), 31:37, c(2L, 10L, 38L, 39L, 40L)
  ), ptype = integer(0), class = c(
    "vctrs_list_of",
    "vctrs_vctr", "list"
  ))
), row.names = c(NA, -7L), class = c(
  "tbl_df",
  "tbl", "data.frame"
), .drop = TRUE), class = c(
  "grouped_df",
  "tbl_df", "tbl", "data.frame"
))

# Print the tibble to inspect its columns, types and grouping
# (the `#>` lines below are the recorded console output).
df
#> # A tibble: 40 x 11
#> # Groups:   effective_status, Age [7]
#>    effective_status Age    name    n_obs Clicks Impressions Reach Spend Purchase
#>    <fct>            <fct>  <fct>   <int>  <int>       <int> <int> <dbl>    <int>
#>  1 ACTIVE           25-34  Upsell…     1   1364       12409 12164 1153.      140
#>  2 ACTIVE           Unkno… Upsell…     1      0           0     0    0       163
#>  3 ACTIVE           55-64  Upsell…     1   4919       58222 46142 9663.      104
#>  4 ACTIVE           35-44  Upsell…     3   2597       30115 25282 3202.       33
#>  5 ACTIVE           35-44  Market…     2   2641       47119 35142 3393.       22
#>  6 ACTIVE           45-54  Market…     2      0           0     0    0        17
#>  7 ACTIVE           35-44  Market…     2    915       18817 14843 1739.       11
#>  8 ACTIVE           18-24  Market…     1   1104       17068 13533 1344.       13
#>  9 ACTIVE           35-44  Upsell…     1     63        4175  3624  502.        2
#> 10 ACTIVE           Unkno… Market…     2      0           0     0    0         0
#> # … with 30 more rows, and 2 more variables: PurchaseValue <dbl>,
#> #   Date_minus_start_time <dbl>

# PCA on the seven numeric metric columns, selected by name (the factor
# columns and n_obs are left out). The argument is spelled `scale.` in full:
# `scale = TRUE` only works through partial argument matching.
res.pca <- prcomp(
  df[, c(
    "Clicks", "Impressions", "Reach", "Spend",
    "Purchase", "PurchaseValue", "Date_minus_start_time"
  )],
  scale. = TRUE
)

# Reuse the per-observation plotting data that fviz_pca_ind() prepares
# (columns x and y), attach the original columns of `df`, and draw the plot
# with ggplot2 so each observation is a label showing its Age, coloured by
# campaign name.
fviz_pca_ind(res.pca)$data %>%
  # The fviz data already has a column called `name`; rename it so it does
  # not collide with df$name. Otherwise bind_cols() repairs the duplicates to
  # position-dependent names (name...1 / name...9) that the plot would have
  # to hard-code.
  dplyr::rename(ind = name) %>%
  bind_cols(df) %>%
  ggplot(aes(x, y, color = name)) +
  geom_label(aes(label = Age)) +
  labs(color = "Name")

由 reprex 包 (v2.0.1) 於 2021 年 9 月 17 日創建

如何從我的數據框中向 PCA 圖添加標簽

問題描述

1 個解決方案

解決方案1
1 2021-09-17 12:29:16

如何從我的數據框中向 PCA 圖添加標簽

問題描述

1 個解決方案

解決方案1 1 2021-09-17 12:29:16

解決方案1
1 2021-09-17 12:29:16