[英]multiply columns with similar names
我要解決的問題是將具有相似名稱的列逐行相乘。 考慮以下示例 df:
library(tidyverse)
library(lubridate)
# Example input: three timestamps and, for each name prefix, one column per
# suffix (<PREFIX>_Open, <PREFIX>_High, <PREFIX>_Low, <PREFIX>_Close).
sample_df <- data.frame(
  Date = c("1/03/2018 0:00", "1/03/2018 4:00", "1/03/2018 8:00"),
  EUR_USD_Open = c(0.892093896, 0.891937999, 0.891744285),
  EUR_USD_High = c(0.89245654, 0.892283601, 0.892616906),
  EUR_USD_Low = c(0.891803229, 0.89117644, 0.891483374),
  EUR_USD_Close = c(0.891942211, 0.891748495, 0.892405914),
  USD_JPY_Open = c(1.887128916, 1.887340614, 1.887501691),
  USD_JPY_High = c(1.887470444, 1.887677094, 1.887746746),
  USD_JPY_Low = c(1.886890576, 1.887246812, 1.887167418),
  USD_JPY_Close = c(1.887338209, 1.887504095, 1.887210726),
  USD_CHF_Open = c(0.997952231, 0.997969721, 0.997971242),
  USD_CHF_High = c(0.99799974, 0.997989483, 0.998035047),
  USD_CHF_Low = c(0.997949949, 0.997933211, 0.997968961),
  USD_CHF_Close = c(0.997970102, 0.997970862, 0.99799936),
  USD_SEK_Open = c(1.092929855, 1.092928195, 1.092853491),
  USD_SEK_High = c(1.092993997, 1.092943686, 1.093004716),
  USD_SEK_Low = c(1.09291292, 1.092803475, 1.092767679),
  USD_SEK_Close = c(1.09292825, 1.09285338, 1.092896312),
  USD_CAD_Open = c(1.022980632, 1.022990785, 1.022967577),
  USD_CAD_High = c(1.023079216, 1.023053854, 1.02313861),
  USD_CAD_Low = c(1.022959598, 1.022919695, 1.022958873),
  USD_CAD_Close = c(1.02299151, 1.022966852, 1.023073419),
  GBP_USD_Open = c(0.962767254, 0.962746434, 0.962811407),
  GBP_USD_High = c(0.96287142, 0.962841409, 0.962998227),
  GBP_USD_Low = c(0.962725618, 0.962629918, 0.962640732),
  GBP_USD_Close = c(0.962747267, 0.962806408, 0.96284391),
  stringsAsFactors = FALSE
)
# Turn the "day/month/year hour:minute" strings into date-times.
sample_df <- mutate(sample_df, Date = dmy_hm(Date))
對於每個日期，我想把所有名稱以 Open 結尾的列相乘，Close 等其餘各類列也同樣處理。
最終的 output 應如下所示:
# Expected result: one product column per suffix (Open, High, Low, Close)
# for each of the three timestamps.
output_df <- data.frame(
  Date = c("1/03/2018 0:00", "1/03/2018 4:00", "1/03/2018 8:00"),
  Open = c(1.808434992, 1.808329582, 1.808051308),
  High = c(1.810060115, 1.80970432, 1.811075804),
  Low = c(1.807469953, 1.806079386, 1.806720451),
  Close = c(1.808339444, 1.808050604, 1.809484003),
  stringsAsFactors = FALSE
)
# Turn the "day/month/year hour:minute" strings into date-times.
output_df <- mutate(output_df, Date = dmy_hm(Date))
任何想法如何有效地完成這項工作?
對 DT 或 Tidyverse 解決方案感到滿意。
你可以用 data.table 試試這個：
# Convert sample_df to a data.table in place.
setDT(sample_df)
# Step 1 (melt): stack every column matching a pattern into the value column
# named after that pattern; the melt index column is called "x".
# Step 2: within each Date, multiply the stacked values of every column
# except "x".
melt(
  sample_df,
  id.vars = "Date",
  variable.name = "x",
  measure.vars = patterns(
    Open = "Open$",
    Close = "Close$",
    High = "High$",
    Low = "Low$"
  )
)[, lapply(.SD, prod), by = Date, .SDcols = !"x"]
# Date Open Close High Low
# 1: 2018-03-01 00:00:00 1.808435 1.808339 1.810060 1.807470
# 2: 2018-03-01 04:00:00 1.808330 1.808051 1.809704 1.806079
# 3: 2018-03-01 08:00:00 1.808051 1.809484 1.811076 1.806720
melt 會把你的數據重塑為長格式；measure.vars 中的 patterns 會將與每個模式匹配的所有列「堆疊」到同一列中，該列以 patterns 中提供的名稱命名。
variable.name 對應的列在這裡只是礙事，所以我們將它重命名為 x，以便在下一步中更簡潔地排除它（默認情況下它被命名為 variable，那樣我們就必須寫 .SDcols = !'variable'）。
lapply(.SD, prod) 進行乘法——在每個 Date 中，我們希望將所有值相乘；這正是 prod 所做的。
如果不進行重塑，最好的選擇是循環加 Reduce 的方法，例如：
# Result skeleton: one row per distinct Date, keyed on Date.
out <- data.table(Date = unique(sample_df$Date), key = "Date")
cols <- c("Open", "Close", "High", "Low")
for (col in cols) {
  # Row-wise product of all columns whose name matches the current label.
  prod_dt <- sample_df[
    ,
    .(Date, v = Reduce(`*`, .SD)),
    .SDcols = patterns(col)
  ]
  # No `on` is given: the join uses the key of `out`; the product is
  # written into a column named after the label.
  out[prod_dt, (col) := i.v]
}
在基礎 R 中，我們可以使用 split.default 依名稱的相似性把列拆分成組：
# Group the non-Date columns by the part of their name after the last "_",
# then multiply the columns of each group element-wise.
# lapply() is used instead of sapply(): with a single-row input sapply()
# simplifies to a plain vector rather than a matrix, and cbind() would then
# recycle Date into one row per group.
suffix <- sub(".*_", "", names(sample_df)[-1])
products <- lapply(split.default(sample_df[-1], suffix), Reduce, f = `*`)
# check.names = FALSE keeps the group labels unchanged as column names.
cbind(sample_df[1], data.frame(products, check.names = FALSE))
# Date Close High Low Open
#1 2018-03-01 00:00:00 1.808 1.810 1.807 1.808
#2 2018-03-01 04:00:00 1.808 1.810 1.806 1.808
#3 2018-03-01 08:00:00 1.809 1.811 1.807 1.808
怎麼樣？先使用 tempdf <- sample_df[grepl('Open', names(sample_df))]
然後 for (ii in 1:nrow(tempdf)) { sample_df$Open[[ii]] <- prod(tempdf[ii,])}
這絕對不是最快或最乾淨的做法，但應該能完成工作。
轉換為長格式,將名稱分成僅保留第三個(最后一個)的部分,執行乘法並轉換回寬格式。
library(dplyr)
library(tidyr)
sample_df %>%
  # Long format: one row per Date and original column name.
  pivot_longer(cols = -Date) %>%
  # Split the column name into pieces and keep only the third one.
  separate(col = name, into = c(NA, NA, "name")) %>%
  # Multiply all values sharing the same Date and kept name piece.
  group_by(Date, name) %>%
  summarize(value = prod(value)) %>%
  ungroup() %>%
  # Back to wide format: one column per kept name piece.
  pivot_wider(names_from = name, values_from = value)
給予:
# A tibble: 3 x 5
Date Close High Low Open
<dttm> <dbl> <dbl> <dbl> <dbl>
1 2018-03-01 00:00:00 1.81 1.81 1.81 1.81
2 2018-03-01 04:00:00 1.81 1.81 1.81 1.81
3 2018-03-01 08:00:00 1.81 1.81 1.81 1.81
就代碼行而言不是最快的,但這也有效
library(tidyverse)
sample_df %>%
  # Convert to long format. names_to / values_to are passed by name: extra
  # positional arguments after `cols` are not mapped to them reliably across
  # tidyr versions (and never to values_to), so the original positional call
  # fails or sets unintended options.
  pivot_longer(-Date, names_to = "Type", values_to = "value") %>%
  # Classify each original column by the label its name contains.
  mutate(
    type_var = case_when(
      str_detect(Type, "Open") ~ "Open",
      str_detect(Type, "Close") ~ "Close",
      str_detect(Type, "High") ~ "High",
      str_detect(Type, "Low") ~ "Low",
      TRUE ~ "Other"
    )
  ) %>%
  # Multiply all values within each Date / label group.
  group_by(Date, type_var) %>%
  summarise(value = prod(value)) %>%
  # Drop the remaining Date grouping so the result is a plain tibble.
  ungroup() %>%
  # Convert the labels back into columns.
  pivot_wider(id_cols = "Date", names_from = type_var, values_from = value)
另一種 data.table 替代方案：
setDT(sample_df)
sample_df[, melt(.SD, id.vars = "Date")
][, prod(value), by = .(Date, substring(variable, 9, 13)) # Or tstrsplit(variable, "_")[[3]]
][, dcast(.SD, Date ~ substring, value.var = "V1")]
Date Close High Low Open
1: 2018-03-01 00:00:00 1.808339 1.810060 1.807470 1.808435
2: 2018-03-01 04:00:00 1.808051 1.809704 1.806079 1.808330
3: 2018-03-01 08:00:00 1.809484 1.811076 1.806720 1.808051
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.