簡體   English   中英

通過提取相似的列名跨列應用 function

[英]applying a function across columns by extracting similar column names

我的數據看起來像:

[[1]]
        date germany france germany_mean france_mean germany_sd france_sd
1 2016-01-01      17     25     21.29429    48.57103   30.03026  47.05169

我要做的是使用map對所有列表進行以下計算。

germany_calc = (germany - germany_mean) / germany_sd 
france_calc = (france - france_mean) / france_sd

但是列的數量可以改變 - 這里有兩個類別/國家,但在另一個列表中可能有 1 或 3 或 N。這些國家始終遵循相同的結構。 那是,

"country1", "country2", ... , "countryN", "country1_mean", "country2_mean", ... , "countryN_mean", "country1_sd", "country2_sd", ... , "countryN_sd".

預期 Output(對於第一個列表):

Germany: -0.1429988 =  (17 - 21.29429) / 30.03026 
France: -0.5009603 = (25 - 48.57103) / 47.05169

編輯:道歉 - 預計 output:

-0.1429988
-0.5009603

Function:

Scale_Me <- function(x){
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

數據:

    my_list <- list(structure(list(date = structure(16801, class = "Date"), 
    germany = 17, france = 25, germany_mean = 21.2942922374429, 
    france_mean = 48.5710301846855, germany_sd = 30.030258443028, 
    france_sd = 47.0516928425878), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16802, class = "Date"), 
    germany = 9, france = 29, germany_mean = 21.2993150684932, 
    france_mean = 48.5605316914534, germany_sd = 30.0286190461173, 
    france_sd = 47.0543871206842), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16803, class = "Date"), 
    germany = 8, france = 18, germany_mean = 21.2947488584475, 
    france_mean = 48.551889593794, germany_sd = 30.0297291333284, 
    france_sd = 47.0562416513092), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16804, class = "Date"), 
    germany = 3, france = 11, germany_mean = 21.2778538812785, 
    france_mean = 48.5382545766386, germany_sd = 30.0267943793948, 
    france_sd = 47.0607680244109), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16805, class = "Date"), 
    germany = 4, france = 13, germany_mean = 21.2614155251142, 
    france_mean = 48.5214531240057, germany_sd = 30.0269420596686, 
    france_sd = 47.0676011750263), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16806, class = "Date"), 
    germany = 4, france = 9, germany_mean = 21.253196347032, 
    france_mean = 48.5055948249362, germany_sd = 30.0292032528186, 
    france_sd = 47.0737183354519), class = "data.frame", row.names = c(NA, 
-1L)))

為什么不直接rbind呢?

with(do.call(rbind, my_list), 
     cbind(germany=(germany - germany_mean) / germany_sd,
           france=(france - france_mean) / france_sd))
#         germany     france
# [1,] -0.1429988 -0.5009603
# [2,] -0.4095864 -0.4157005
# [3,] -0.4427196 -0.6492633
# [4,] -0.6087181 -0.7976550
# [5,] -0.5748642 -0.7546901
# [6,] -0.5745473 -0.8392283

你必須使用map嗎? 在這里,我使用兩個for循環而不是使用map

Result_list = vector("list",length(my_list))
for(i in 1:length(my_list))
{
  df = my_list[[i]]
  # identifier number of countries
  countries = colnames(df)[grep('mean',colnames(df))]
  countries = gsub("_mean","",countries)

  df_result = NULL
  for(j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1,match(country,colnames(df))]
    mean_country = df[1,match(paste0(country,"_mean"),colnames(df))]
    sd_country = df[1,match(paste0(country,"_sd"),colnames(df))]

    result_country = (value_country - mean_country) / sd_country
    Sentence = paste0(country,": ",round(result_country,5)," = (",value_country," - ",round(mean_country,5),") / ",round(sd_country,5))
    df_result = c(df_result,Sentence)
  }
  Result_list[[i]] = df_result
}

output Result_list看起來像:

> Result_list
[[1]]
[1] "germany: -0.143 = (17 - 21.29429) / 30.03026" 
[2] "france: -0.50096 = (25 - 48.57103) / 47.05169"

[[2]]
[1] "germany: -0.40959 = (9 - 21.29932) / 30.02862"
[2] "france: -0.4157 = (29 - 48.56053) / 47.05439" 

[[3]]
[1] "germany: -0.44272 = (8 - 21.29475) / 30.02973"
[2] "france: -0.64926 = (18 - 48.55189) / 47.05624"

[[4]]
[1] "germany: -0.60872 = (3 - 21.27785) / 30.02679"
[2] "france: -0.79765 = (11 - 48.53825) / 47.06077"

[[5]]
[1] "germany: -0.57486 = (4 - 21.26142) / 30.02694"
[2] "france: -0.75469 = (13 - 48.52145) / 47.0676" 

[[6]]
[1] "germany: -0.57455 = (4 - 21.2532) / 30.0292" 
[2] "france: -0.83923 = (9 - 48.50559) / 47.07372"

是你要找的嗎?

編輯:僅提取結果

要僅提取結果值,您可以執行以下操作:

Df_result_value = NULL
for(i in 1:length(my_list))
{
  df = my_list[[i]]
  # identifier number of countries
  countries = colnames(df)[grep('mean',colnames(df))]
  countries = gsub("_mean","",countries)

  for(j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1,match(country,colnames(df))]
    mean_country = df[1,match(paste0(country,"_mean"),colnames(df))]
    sd_country = df[1,match(paste0(country,"_sd"),colnames(df))]

    result_country = (value_country - mean_country) / sd_country

    Df_result_value = rbind(Df_result_value,c(country,result_country))
  }
}
Df_result_value = data.frame(Df_result_value)
colnames(Df_result_value) = c("Country","Result")

並得到這個 output:

> Df_result_value
   Country             Result
1  germany -0.142998843835787
2   france -0.500960300483614
3  germany -0.409586436512588
4   france -0.415700488060442
5  germany -0.442719572974515
6   france -0.649263275639099
7  germany -0.608718121899195
8   france -0.797654950237258
9  germany -0.574864249939699
10  france -0.754690110335453
11 germany -0.574547256608035
12  france -0.839228262008441

關於 output 的確切形式的問題尚不清楚,因此我們假設需要的是一個數據框,其中包含日期列和每個國家/地區值標准化的列。 在這種情況下,這意味着我們需要 output 中的 3 列。

1) pivot_longer/_widermy_list列表組件綁定在一起,創建一個數據框,每個組件都有一行。 然后對於列 append _root 中的每個裸國家名稱,以便除date之外的每個列名稱都采用 country_suffix 的形式。 然后轉換為長格式,執行規范化並轉換回寬格式:

library(dplyr)
library(tidyr)
library(purrr)

my_list %>%
  bind_rows %>%
  set_names(names(.)[1], sub("^([^_]*)$", "\\1_root", names(.)[-1])) %>%
  pivot_longer(-date, names_to = c("country", ".value"), names_sep = "_") %>%
  mutate(root = (root - mean) / sd) %>%
  pivot_wider(id_cols = "date", names_from = "country", values_from = "root")

給予:

# A tibble: 6 x 3
  date       germany france
  <date>       <dbl>  <dbl>
1 2016-01-01  -0.143 -0.501
2 2016-01-02  -0.410 -0.416
3 2016-01-03  -0.443 -0.649
4 2016-01-04  -0.609 -0.798
5 2016-01-05  -0.575 -0.755
6 2016-01-06  -0.575 -0.839

2) 底座 R

在將列表組件綁定在一起給出d之后,我們挑選出國家名稱nms ,作為那些不包含下划線的名稱,除了第一個這樣的名稱(即date )。 然后執行規范化並將datecbind到該列。

d <- do.call("rbind", my_list)
nms <- grep("_", names(d), invert = TRUE, value = TRUE)[-1]
cbind(d[1], (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")])

給予:

        date    germany     france
1 2016-01-01 -0.1429988 -0.5009603
2 2016-01-02 -0.4095864 -0.4157005
3 2016-01-03 -0.4427196 -0.6492633
4 2016-01-04 -0.6087181 -0.7976550
5 2016-01-05 -0.5748642 -0.7546901
6 2016-01-06 -0.5745473 -0.8392283

我們也可以在base R中使用transform

transform(do.call(rbind, my_list), 
  germany = (germany - germany_mean)/germany_sd, 
   france = (france - france_mean)/france_sd)[c('date', 'germany', 'france')]
#     date    germany     france
#1 2016-01-01 -0.1429988 -0.5009603
#2 2016-01-02 -0.4095864 -0.4157005
#3 2016-01-03 -0.4427196 -0.6492633
#4 2016-01-04 -0.6087181 -0.7976550
#5 2016-01-05 -0.5748642 -0.7546901
#6 2016-01-06 -0.5745473 -0.8392283

或在dplyr中,無需任何整形,可以這樣做

library(dplyr)
bind_rows(my_list) %>% 
   transmute(date,
             germany = (germany - germany_mean)/germany_sd,
             france = (france - france_mean)/france_sd)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM