简体   繁体   English

R根据参考表计算样本权重和加权汇总

[英]R Calculate sample weights and weighted aggregation based on reference table

After conducting a survey, I collected the results in the form of a dataframe. 进行调查后,我以数据框的形式收集了结果。 Here's a reproducible version of what the actual data frame looks like. 这是实际数据帧外观的可复制版本。

library(dplyr)
library(tidyr)
# Reproducible survey data: one row per respondent (56 rows in total).
# Columns: ID, Country, Gender, Age band, and the two question scores.
df <- data.frame(
  # Respondent IDs are the consecutive codes "1101" .. "1156", kept as text.
  ID = as.character(1101:1156),
  Country = c(
    "US", "UK", "Canada", "Mexico",
    "India", "US", "Peru", "China",
    "US", "UK", "Canada", "Mexico",
    "Portugal", "India", "Portugal", "Mexico",
    "Peru", "India", "Canada", "Mexico",
    "India", "UK", "India", "Canada",
    "US", "UK", "China", "India",
    "US", "Mexico", "Canada", "Mexico",
    "Canada", "China", "Canada", "Canada",
    "China", "China", "India", "Mexico",
    "Portugal", "Portugal", "Portugal", "Portugal",
    "UK", "UK", "UK", "Peru",
    "Peru", "Mexico", "US", "US",
    "Peru", "Mexico", "Peru", "Mexico"
  ),
  Gender = c(
    "Male", "Male", "Male", "Female",
    "Female", "Female", "Male", "Female",
    "Female", "Female", "Male", "Female",
    "Male", "Male", "Female", "Female",
    "Female", "Male", "Female", "Female",
    "Female", "Female", "Male", "Female",
    "Male", "Female", "Male", "Female",
    "Female", "Male", "Female", "Female",
    "Male", "Male", "Male", "Female",
    "Male", "Male", "Female", "Female",
    "Male", "Female", "Male", "Female",
    "Male", "Female", "Male", "Female",
    "Male", "Female", "Male", "Female",
    "Male", "Male", "Male", "Male"
  ),
  Age = c(
    "<25", "25-35", "25-35", "36-45",
    ">55", ">55", "25-35", ">55",
    "<25", "25-35", "25-35", "36-45",
    "25-35", "25-35", "25-35", "36-45",
    ">55", "36-45", "46-55", "36-45",
    ">55", "46-55", "25-35", "46-55",
    "<25", "46-55", "25-35", "46-55",
    "25-35", "25-35", "46-55", "36-45",
    "<25", "<25", ">55", "36-45",
    "36-45", "46-55", "<25", "<25",
    "<25", ">55", "36-45", "46-55",
    "<25", ">55", "36-45", "46-55",
    "36-45", ">55", "36-45", "46-55",
    "<25", "46-55", "<25", "46-55"
  ),
  Score_Q1 = c(
    4, 4, 3, 2,
    1, 1, 4, 2,
    1, 1, 1, 2,
    2, 1, 4, 3,
    4, 3, 1, 1,
    1, 2, 1, 1,
    1, 4, 1, 4,
    3, 4, 3, 3,
    1, 3, 3, 1,
    1, 1, 2, 1,
    1, 2, 1, 2,
    1, 1, 1, 1,
    2, 2, 2, 2,
    1, 2, 3, 4
  ),
  Score_Q2 = c(
    1, 4, 1, 1,
    1, 2, 1, 1,
    1, 4, 4, 4,
    2, 1, 1, 3,
    4, 3, 1, 1,
    1, 3, 3, 3,
    2, 4, 1, 2,
    4, 4, 4, 4,
    1, 1, 1, 1,
    1, 2, 3, 4,
    4, 4, 2, 1,
    1, 2, 3, 2,
    1, 2, 1, 2,
    4, 3, 2, 1
  )
)

The dataframe can be split into the following parts: 数据帧可以分为以下几个部分:

1) ID : A respondent ID 1) ID :受访者ID

2) Country : Respondent's country of origin 2) 国家 :受访者的原籍国

3) Gender : The gender of the respondent 3) 性别 :受访者性别

4) Age : Respondent age 4) 年龄 :受访者年龄

5) Score_Q1 : The satisfaction score for Q1, on a scale from 1 (Very satisfied) to 4 (Very dissatisfied). 5) Score_Q1 :Q1的满意度得分,从1 (非常满意)到4 (非常不满意)。

6) Score_Q2 : The satisfaction score for Q2, on a scale from 1 (Very satisfied) to 4 (Very dissatisfied). 6) Score_Q2 :Q2的满意度得分,范围从1 (非常满意)到4 (非常不满意)。

First some data cleaning - 首先进行一些数据清理-

# Turn the three categorical columns into factors in a single pass.
df[c("Country", "Gender", "Age")] <- lapply(
  df[c("Country", "Gender", "Age")],
  as.factor
)

Now I check the ratios for Age and Gender in my dataset - 现在,我检查数据集中的年龄和性别比率-

Gender by Country Country性别

# 1) Gender by Country
# Cross-tabulate respondents (rows = Gender, columns = Country). table()
# reports empty Gender x Country combinations as 0, so the column sums used
# for the proportions below can never turn into NA.
split_gender <- table(df$Gender, df$Country)

# Share of each gender within every country (each country column sums to 1).
# The Gender labels are read from the table's own row names rather than being
# assigned by row position, and the label column is never pushed through
# as.numeric(), which avoids the "NAs introduced by coercion" warning.
split_gender_sample <- data.frame(
  Gender = rownames(split_gender),
  as.data.frame.matrix(prop.table(split_gender, margin = 2)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

Age by Country Country Age

# 2) Age by Country
# Cross-tabulate respondents (rows = Age band, columns = Country). table()
# reports empty Age x Country combinations as 0, so no NA back-filling is
# needed before computing proportions.
split_age <- table(df$Age, df$Country)

# Share of each age band within every country (each country column sums to 1).
# The Age labels are read from the table's own row names rather than being
# assigned by row position: the sort order of levels such as "<25" and "25-35"
# depends on the locale's collation rules, so positional labels can end up
# attached to the wrong rows.
split_age_sample <- data.frame(
  Age = rownames(split_age),
  as.data.frame.matrix(prop.table(split_age, margin = 2)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Clean up unwanted intermediate objects
rm(list = c("split_age", "split_gender"))

The above two steps give me two data frames - split_age_sample & split_gender_sample . 上述两个步骤为我提供了两个数据帧:split_age_sample 和 split_gender_sample。 These dataframes contain the sample ratios for age and gender by country for my 56 respondents. 这些数据框包含了我的56位受访者的国家年龄和性别抽样比例。

My Objective: Calculating Sampling Weights Based on Population Statistics 我的目标:根据人口统计数据计算抽样权重

In order to make my data frame more representative of reality , I would like to attribute weights to my respondents based on the official population ratios for age and gender by country. 为了使我的数据框架更能代表现实 ,我想根据国家的年龄和性别的官方人口比率将权重分配给我的受访者

These are the official population ratios I found for the countries I surveyed. 这些是我在接受调查的国家中发现的官方人口比率。

# Official population shares of each gender, one column per country.
# Rows follow the order given in the Gender column.
split_gender_official <- data.frame(
  Gender   = c("Female", "Male"),
  Canada   = c(0.4, 0.6),
  China    = c(0.3, 0.7),
  India    = c(0.3, 0.7),
  Mexico   = c(0.5, 0.5),
  Peru     = c(0.6, 0.4),
  Portugal = c(0.5, 0.5),
  UK       = c(0.4, 0.6),
  US       = c(0.4, 0.6)
)

# Official population shares of each age band, one column per country.
# Rows follow the order given in the Age column.
split_age_official <- data.frame(
  Age      = c("<25", ">55", "25-35", "36-45", "46-55"),
  Canada   = c(0.1, 0.3, 0.3, 0.2, 0.1),
  China    = c(0.3, 0.05, 0.35, 0.1, 0.2),
  India    = c(0.5, 0.05, 0.35, 0.05, 0.05),
  Mexico   = c(0.2, 0.3, 0.2, 0.1, 0.2),
  Peru     = c(0.1, 0.3, 0.2, 0.2, 0.2),
  Portugal = c(0.2, 0.1, 0.05, 0.05, 0.6),
  UK       = c(0.2, 0.3, 0.1, 0.3, 0.1),
  US       = c(0.2, 0.3, 0.1, 0.3, 0.1)
)

Desired Output 期望的输出

Based on my sample ratios and the official population ratios for both age & gender, I'd like to attribute weights to my respondents, in a separate column called weights . 根据我的抽样比例和按年龄和性别划分的官方人口比例,我想在名为 weights 的单独列中为受访者分配权重。

Currently I am unable to figure out how to do this calculation. 目前,我无法弄清楚如何进行此计算。

Then, once the weights are calculated, I'd like to summarize the scores using the weights column. 然后,一旦计算了权重,我想使用weights列来汇总分数。 The aggregation would look something like this (except with the weights factored into the calculation) - 聚合看起来像这样(除了权重已计入计算)-

Example: Weighted aggregated scores for the UK 示例:英国的加权汇总分数

# Overall scores by Country & Gender, shown for the UK only.
# Each KPI is the share of respondents in the group who answered 1 or 2.
# The weights are not factored in yet at this point.
weighted_aggregated_scores_gender <- df %>%
  filter(Country == "UK") %>%
  select(-Age) %>%
  group_by(Country, Gender) %>%
  summarise(
    # %in% binds tighter than /, so every TRUE contributes 1 / n() to the sum.
    Q1_KPI = round(sum((Score_Q1 %in% c(1, 2)) / n()), 2),
    Q2_KPI = round(sum((Score_Q2 %in% c(1, 2)) / n()), 2)
  )

I'd really appreciate any help I can get on the weight calculation and its usage in the weighted aggregation step that follows. 在接下来的加权聚合步骤中,我可以对权重计算及其用法获得任何帮助,我将不胜感激。

Not sure if it's exactly what you are looking for, but here is what I figured out. 不知道它是否正是您要寻找的东西,但这是我想出的。 You need to merge the national weights with your dataframe, and then you can calculate the KPI. 您需要将国家权重与数据框合并,然后才能计算KPI。

> # Reshape national weights

> Nombres <- cbind.data.frame("Country" = colnames(split_gender_official)[colnames(split_gender_official) != "Gender"],
+                              "time" = 1:length(colnames(split_gender_official)[colnames(split_gender_official) != "Gender"]))
> Nombres$Country <- as.character(Nombres$Country)
> 
> split_gender_official_resh <- reshape(split_gender_official, direction = "long", varying = Nombres$Country, v.names = "Weights_gend")
> split_age_official_resh <- reshape(split_age_official, direction = "long", varying = Nombres$Country, v.names = "Weights_age")
> 
> split_gender_official_resh$id <- NULL
> split_age_official_resh$id <- NULL
> 
> split_gender_official_resh <- merge(split_gender_official_resh, Nombres, by = "time", all.x = TRUE)
> split_age_official_resh <- merge(split_age_official_resh, Nombres, by = "time", all.x = TRUE)
> 
> split_gender_official_resh$time <- NULL
> split_age_official_resh$time <- NULL

> # Merge weights with df

> df <- merge(df, split_gender_official_resh, by = c("Gender", "Country"), all.x = TRUE)
> df <- merge(df, split_age_official_resh, by = c("Age", "Country"), all.x = TRUE)
> 

> # Print tables
>
> # Without weights
>
> prop.table(table(df$Gender, df$Country), 2)

            Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.5000000 0.2000000 0.5714286 0.7000000 0.3333333 0.5000000 0.5714286 0.5714286
  Male   0.5000000 0.8000000 0.4285714 0.3000000 0.6666667 0.5000000 0.4285714 0.4285714
> prop.table(table(df$Age, df$Country), 2)

           Canada     China     India    Mexico      Peru  Portugal        UK        US
  <25   0.1250000 0.2000000 0.1428571 0.1000000 0.3333333 0.1666667 0.1428571 0.4285714
  >55   0.1250000 0.2000000 0.2857143 0.1000000 0.1666667 0.1666667 0.1428571 0.1428571
  25-35 0.2500000 0.2000000 0.2857143 0.1000000 0.1666667 0.3333333 0.2857143 0.1428571
  36-45 0.1250000 0.2000000 0.1428571 0.5000000 0.1666667 0.1666667 0.1428571 0.1428571
  46-55 0.3750000 0.2000000 0.1428571 0.2000000 0.1666667 0.1666667 0.2857143 0.1428571
> 
> # With weights
> prop.table(xtabs(Weights_gend ~ Gender + Country, df), 2)
        Country
Gender       Canada      China      India     Mexico       Peru   Portugal         UK         US
  Female 0.40000000 0.09677419 0.36363636 0.70000000 0.42857143 0.50000000 0.47058824 0.47058824
  Male   0.60000000 0.90322581 0.63636364 0.30000000 0.57142857 0.50000000 0.52941176 0.52941176
> prop.table(xtabs(Weights_age ~ Gender + Country, df), 2)
        Country
Gender      Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.3333333 0.0500000 0.4642857 0.6250000 0.4545455 0.7142857 0.5000000 0.5000000
  Male   0.6666667 0.9500000 0.5357143 0.3750000 0.5454545 0.2857143 0.5000000 0.5000000
> 
> #  Means with weights and scores
> tapply(df$Score_Q1 * df$Weights_gend, list(df$Gender, df$Country), mean)
       Canada China    India   Mexico Peru  Portugal  UK  US
Female    0.6  0.60 0.600000 1.000000  1.5 1.3333333 0.8 0.7
Male      1.2  1.05 1.166667 1.666667  1.0 0.6666667 1.2 1.4
> tapply(df$Score_Q1 * df$Weights_age, list(df$Age, df$Country), mean)
         Canada China India Mexico Peru Portugal   UK  US
<25   0.1000000  0.90  1.00   0.20  0.2     0.20 0.20 0.4
>55   0.9000000  0.10  0.05   0.60  1.2     0.20 0.30 0.3
25-35 0.6000000  0.35  0.35   0.80  0.8     0.15 0.25 0.3
36-45 0.2000000  0.10  0.15   0.22  0.4     0.05 0.30 0.6
46-55 0.1666667  0.20  0.20   0.60  0.2     1.20 0.30 0.2
> 

Hope it helps. 希望能帮助到你。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM