繁体   English   中英

R:按最新日期取子集

[英]R: Subset by Latest Date

我有:

Keyword   Date   Pos   Bid
a       4/11/14   1   5.00
a       4/13/14   1   5.00
a       4/14/14   1   5.00
b        6/2/14   3   9.00
b        7/2/14   4   9.00  
b        8/2/14   4   9.00
c       8/29/14   2   3.00
c       8/30/14   2   3.00
c       8/31/14   2   3.00

我需要取子集,只保留每个 Keyword 中日期最新的那一行:

Keyword   Date   Pos   Bid
a       4/14/14   1   5.00
b        8/2/14   4   9.00
c       8/31/14   2   3.00

我试过了:

# Asker's first attempt: build one summary row per Keyword with ddply().
# NOTE(review): the which() tests below compare the raw `x$Date` column with a
# parsed Date value; the left-hand side is never converted with the same
# format, which is presumably why this does not return the intended rows.
Latest = ddply( df, 
                'Keyword', 
                function(x) c (
                    # Latest date in this group, parsed as month/day/2-digit year.
                    Date = max(as.Date(x$Date, '%m/%d/%y')), 
                    Pos = x$Pos[which(x$Date == max(as.Date(x$Date, '%m/%d/%y')))], 
                    Bid = x$Bid[which(x$Date == max(as.Date(x$Date, '%m/%d/%y')))]
                )
         )

# Asker's second attempt: base subset().
# NOTE(review): `Date = max(...)` is written as a named argument rather than a
# `==` comparison, so it does not look like it acts as the row filter. The call
# also reads `x` and column names that do not appear in the sample data above.
Latest = subset( x, 
                 Date = max(as.Date(Date, '%m/%d/%y')), 
                 select = c('Identity', 'Date', 'Round.Avg.Pos.', 'Search.Bid')
         )

但这些写法要么报错,要么得到的不是我想要的结果。我遗漏了什么?

谢谢。

你可以试试

 library(dplyr)
 library(tidyr)

  # Parse the dates, order the rows newest-first, then keep the first row
  # inside every Keyword group.
  df %>%
    mutate(Date = as.Date(Date, format = "%m/%d/%y")) %>%
    group_by(Keyword) %>%
    arrange(desc(Date)) %>%
    slice(1)

  #   Keyword       Date Pos Bid
  #1       a 2014-04-14   1   5
  #2       b 2014-08-02   4   9
  #3       c 2014-08-31   2   3

或者

   # Within each Keyword group, parse the dates and keep the row(s) whose date
   # equals the group's maximum.
   df %>%
     group_by(Keyword) %>%
     mutate(Date = as.Date(Date, format = "%m/%d/%y")) %>%
     filter(Date == max(Date))

或使用base R

  # indx holds, for every row, the latest parsed date of that row's Keyword.
  indx <- ave(as.Date(df$Date, format = "%m/%d/%y"), df$Keyword, FUN = max)
  # Keep the rows whose own parsed date equals their group's latest date.
  df[as.Date(df$Date, format = "%m/%d/%y") == indx, ]
  #  Keyword    Date Pos Bid
  #3       a 4/14/14   1   5
  #6       b  8/2/14   4   9
  #9       c 8/31/14   2   3

或者使用ddply

  # For each Keyword group, keep the row(s) whose parsed date is the maximum.
  # The Date column itself is left in its original text form.
  ddply(df, .(Keyword), function(grp) {
    parsed <- as.Date(grp$Date, "%m/%d/%y")
    grp[parsed == max(parsed), ]
  })

  #  Keyword    Date Pos Bid
  #1       a 4/14/14   1   5
  #2       b  8/2/14   4   9
  #3       c 8/31/14   2   3

数据

# Sample data: nine rows, three per Keyword, with Date kept as m/d/yy text.
df <- data.frame(
  Keyword = c("a", "a", "a", "b", "b", "b", "c", "c", "c"),
  Date = c(
    "4/11/14", "4/13/14", "4/14/14",
    "6/2/14", "7/2/14", "8/2/14",
    "8/29/14", "8/30/14", "8/31/14"
  ),
  Pos = c(1L, 1L, 1L, 3L, 4L, 4L, 2L, 2L, 2L),
  Bid = c(5, 5, 5, 9, 9, 9, 3, 3, 3),
  stringsAsFactors = FALSE
)

或者使用data.table

library(data.table)
# Convert df to a data.table by reference, then for each Keyword take the
# single row of .SD at the position of the latest parsed date.
setDT(df)
df[, .SD[which.max(as.Date(Date, format = "%m/%d/%y"))], by = Keyword]
#    Keyword    Date Pos Bid
# 1:       a 4/14/14   1   5
# 2:       b  8/2/14   4   9
# 3:       c 8/31/14   2   3

这是另一个使用“拆分-应用-组合”方法的基础 R 解决方案

# Split df by Keyword, pick the row with the latest parsed date from each
# piece, and stack the selected rows back into one data frame.
do.call(
  rbind,
  lapply(
    split(df, df$Keyword),
    function(grp) grp[which.max(as.Date(grp$Date, format = "%m/%d/%y")), ]
  )
)
#   Keyword    Date Pos Bid
# a       a 4/14/14   1   5
# b       b  8/2/14   4   9
# c       c 8/31/14   2   3

注意:您期望的输出中 Date 列保持原来的格式,因此我在这两个解决方案的每次迭代中都调用了 as.Date;不过最佳做法是只将该列转换为 Date 类一次,然后在已转换的列上进行聚合。

尝试:

# Keep the latest row per Keyword using ordering plus duplicated().
# Parse the text dates once; the format string is passed by name
# (the original wrapped it in a stray format() call).
ddf$Date <- as.Date(ddf$Date, format = "%m/%d/%y")
# Newest dates first, so the first row seen for each Keyword is its latest.
ddf <- ddf[rev(order(ddf$Date)), ]
# Drop every later (older-dated) occurrence of a Keyword.
ddf <- ddf[!duplicated(ddf$Keyword), ]
# Show the result sorted by Keyword.
ddf[order(ddf$Keyword), ]
  Keyword       Date Pos Bid
3       a 2014-04-14   1   5
6       b 2014-08-02   4   9
9       c 2014-08-31   2   3

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM