简体   繁体   English

如何在 R 中创建计算列

[英]How to create a calculated column in R

Below is the sample data set and the desired manipulations.下面是示例数据集和所需的操作。 As of yet, all works fine.到目前为止,一切正常。 Attempting to create a new calculated column.正在尝试创建新的计算列。 Some context, the smb stands for small business.在某些情况下,smb 代表小型企业。 1,2,3,4 represent differing thresholds of what would be considered small. 1,2,3,4 代表被认为是小的不同阈值。 The desired column would what percentage of total employment is taken up by smb = 1 for a given area, for example.例如,所需的列是 smb = 1 占给定区域的总就业百分比。 For area 001, this would be 46/1927 for example.例如,对于区域 001,这将是 46/1927。 I can figure out how to have it to be appear once but not as a complete column.我可以弄清楚如何让它出现一次,而不是作为一个完整的列。 How would I go about doing this?我将如何 go 这样做? The desired result is at the bottom.想要的结果在底部。

library(readxl)
library(dplyr)
library(data.table)
library(DBI)
library(stringr)
library(tidyverse)
library(gt)


 employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
 small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
 area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
 year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
 qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)

 smbtest <- data.frame(employment,small,area,year,qtr)

 smbtest$smb <-0

 smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))


 smbsummary2<-smbtest %>% 
 mutate(period = paste0(year,"q",qtr)) %>%
 group_by(area,period,smb) %>%
 summarise(employment = sum(employment), worksites = n(), 
        .groups = 'drop_last') %>% 
 mutate(employment = cumsum(employment),
     worksites = cumsum(worksites))

 smbsummary2<- smbsummary2%>%
 group_by(area,smb)%>%
 mutate(empprevyear=lag(employment),
     empprevyearpp=employment-empprevyear,
     empprevyearpct=((employment/empprevyear)-1), 
 empprevyearpct=scales::percent(empprevyearpct,accuracy = 0.01)
 )




smblonger2<-smbsummary2 %>%
dplyr::select(area,period,employment,worksites,smb) %>%
ungroup() %>%
pivot_longer(cols = employment:worksites, names_to = "measure", values_to = "value") %>%
group_by(area,measure) %>%
pivot_wider(names_from = period, values_from = value)%>%filter(smb %in% 
c("1","2","3","4"))%>%gt()%>%cols_label(
smb = md("**Category**"))


smblonger2

area    period   smb    employment    worksites    pcttotal
 1      2020q1    1         46           2          46/1927 (total employment)
 2      2020q2    2        301           4          301/1927
 3      2020q3    3        466           5          466/1927
 4      2020q4    4        726           6          726/1927

 schema
 smb      employment range
  1         0 to 100
  2         0 to 150
  3         0 to 250
  4         0 to 500

OK so here is my solution (someone will now come along with a 1 line function!)好的,这是我的解决方案(现在有人会提供 1 行功能!)

library(dplyr)
library(tidyr)

employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)

smbtest <- data.frame(employment,small,area,year,qtr)

smbtest$smb <-0  # I think this line is redundent

smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))

smbsummary2<-smbtest %>% 
    mutate(period = paste0(year,"q",qtr)) %>%
    group_by(area,period,smb) %>%
    summarise(employment = sum(employment), worksites = n(), 
              .groups = 'drop_last') %>% 
    mutate(employment = cumsum(employment),
           worksites = cumsum(worksites))



smbsummary2 %>%
    # Make the data wider (a column for each smb)
    pivot_wider(
        id_cols=c("area", "period"), 
        names_from = "smb", 
        values_from = c("employment", "worksites"),
        names_prefix = "SMB"
        ) %>%
    # calculate the %
    mutate(across(starts_with("employment_SMB"), 
                  ~(100*(.x/employment_SMBNA)),
                  .names = "pcttotal_{.col}")) %>%

    # Now make the data longer
    pivot_longer(
        cols = contains("SMB")
    ) %>%
    # rework the data names so the smb is a value
    separate(name, into=c("name", "smb"), sep="_SMB") %>%
    # Make the date wider again to match the shape requested
    pivot_wider(
        id_cols=c("area", "period", "smb"), 
        names_from = "name", 
        values_from = "value"
    ) -> smbsummary3

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM