簡體   English   中英

r 中 dataframe 列條目的成對比較

[英]Pairwise comparison of dataframe column entries in r

數據:

df_dat = structure(list(code = c(1L, 10000L, 10001L), yr_1986 = c(NA, 10000L, 10001L), yr_1987 = c(NA, 10000L, 10001L), yr_1988 = c(NA, 10000L, 10001L), yr_1989 = c(NA, 10000L, NA), yr_1990 = c(NA, 10000L, NA), yr_1991 = c(1L, 10000L, 10001L), yr_1992 = c(NA, 10000L, 10001L), yr_1993 = c(NA, 10000L, 10001L), yr_1994 = c(NA, 10000L, NA), yr_1995 = c(NA, 10000L, NA), yr_1996 = c(NA, 10000L, NA), yr_1997 = c(NA, 10000L, 10001L), yr_1998 = c(NA, 10000L, 10001L), yr_1999 = c(NA, 10000L, 10001L), yr_2000 = c(NA, 10000L, 10001L), yr_2001 = c(NA, 10000L, NA), yr_2002 = c(NA, 10000L, NA), yr_2003 = c(NA, 10000L, NA), yr_2004 = c(NA, 10000L, NA), yr_2005 = c(NA, 10000L, NA), yr_2006 = c(NA, 10000L, NA), yr_2007 = c(NA, 10000L, NA), yr_2008 = c(NA, 10000L, NA), yr_2009 = c(NA, 10000L, 10001L), yr_2010 = c(NA, 10000L, 10001L), yr_2011 = c(NA, 10000L, 10001L), yr_2012 = c(NA, 10000L, 10001L), yr_2013 = c(NA, 10000L, 10001L), yr_2014 = c(NA, 10000L, NA), yr_2015 = c(NA, 10000L, NA), yr_2016 = c(NA, 10000L, NA), yr_2017 = c(NA, 10000L, NA), yr_2018 = c(NA, 10000L, NA)), .Names = c("code", "yr_1986", "yr_1987", "yr_1988", "yr_1989", "yr_1990", "yr_1991", "yr_1992", "yr_1993", "yr_1994", "yr_1995", "yr_1996", "yr_1997", "yr_1998", "yr_1999", "yr_2000", "yr_2001", "yr_2002", "yr_2003", "yr_2004", "yr_2005", "yr_2006", "yr_2007", "yr_2008", "yr_2009", "yr_2010", "yr_2011", "yr_2012", "yr_2013", "yr_2014", "yr_2015", "yr_2016", "yr_2017", "yr_2018"), class = "data.frame", row.names = c(NA, -3L))

問題:我正在嘗試對 dataframe 中的列執行條件成對比較,以檢查存儲在第一列code中的值是否再次出現,這些值是數字代碼。 在我的案例中,其余列實際上是 1986-2018 年的時間序列。 您在每年的列中看到的實際上是存儲在代碼列中的代碼隨時間的出現。

現在,到了問題的關鍵。 目標是創建一個新的 dataframe,其中將根據存儲在代碼列中的值隨時間的出現和消失,通過條件語句填充條目。 預期結果應如下所示:

結果:

df_out = structure(list(code = c(1L, 10000L), yr_1986 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1987 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1988 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1989 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1990 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1991 = structure(c(2L, 1L), .Label = c("EXIST", "NEW"), class = "factor"), yr_1992 = structure(1:2, .Label = c("CLOSED", "EXIST"), class = "factor"), yr_1993 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1994 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1995 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1996 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1997 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1998 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_1999 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2000 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2001 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2002 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2003 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2004 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2005 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2006 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2007 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2008 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2009 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2010 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2011 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2012 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2013 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2014 = structure(c(NA, 1L), 
.Label = "EXIST", class = "factor"), yr_2015 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2016 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2017 = structure(c(NA, 1L), .Label = "EXIST", class = "factor"), yr_2018 = structure(c(NA, 1L), .Label = "EXIST", class = "factor")), .Names = c("code", "yr_1986", "yr_1987", "yr_1988", "yr_1989", "yr_1990", "yr_1991", "yr_1992", "yr_1993", "yr_1994", "yr_1995", "yr_1996", "yr_1997", "yr_1998", "yr_1999", "yr_2000", "yr_2001", "yr_2002", "yr_2003", "yr_2004", "yr_2005", "yr_2006", "yr_2007", "yr_2008", "yr_2009", "yr_2010", "yr_2011", "yr_2012", "yr_2013", "yr_2014", "yr_2015", "yr_2016", "yr_2017", "yr_2018"), class = "data.frame", row.names = c(NA, -2L))

下面是對我打算實現的機制的簡要描述。 第一列代碼存儲感興趣的代碼。 每行一個代碼。 其余列實際上是年份列,它們將存儲在代碼列中的代碼隨時間的出現顯示為它們的條目。

現在,目的是通過時間(即年份列)檢查代碼列中每個代碼的出現,並將 output 中的條目重新編碼為:

  • NEW:代碼首次出現的那一年 (t);
  • CLOSED:如果代碼在第 (t) 年出現之后,在第 (t+1) 年不再出現;
  • EXIST:如果代碼多年來一直重復出現。

我希望我已經設法盡可能清楚地描述了這個問題。

編輯:我設法找到了解決問題的次優方法。 這是通過將數據分成兩種類型來實現的:1) 類型 1 將收集存儲在代碼中的代碼在幾年內出現的所有數據; 2)類型2是收集該期間每年重復出現的所有代碼。 以下是基於我提供的示例數據的代碼和 output。 但同樣,這不是最優的。

# Load packages. library() stops with an error when tidyverse is not
# installed, whereas require() only returns FALSE and lets the script fail
# later at the first tidyverse call.
library(tidyverse)

# Year columns only (everything except `code`)
df_dat_year <- df_dat %>%
  select(-code)

# The code column on its own, kept for re-attaching to the output later
df_dat_code <- df_dat %>%
  select(code)

# Year observations for code = 1 (first row of the input)
df_dat1 <- df_dat_year[1, ]

# Year observations for code = 10000 (second row of the input)
df_dat2 <- df_dat_year[2, ]

# Output data frames: all NA, with the same dimensions as their inputs
df_out1 <- as.data.frame(matrix(nrow = nrow(df_dat1), ncol = ncol(df_dat1)))
df_out2 <- as.data.frame(matrix(nrow = nrow(df_dat2), ncol = ncol(df_dat2)))

#Loop code for each output dataframe

## For output 1
# An occurrence that is not continued in the following year is flagged "new",
# and that following year is flagged "closed".
n_years <- ncol(df_dat1)
for (i in seq_len(nrow(df_dat1))) {
  for (j in seq_len(n_years)) {
    present_now <- !is.na(df_dat1[i, j])
    # Look at the next year column directly. Calling lead() on a single cell
    # always yields NA (a length-one vector has no following element), so that
    # check never inspected the next year.
    has_next <- j < n_years
    absent_next <- !has_next || is.na(df_dat1[i, j + 1])
    if (present_now && absent_next) {
      df_out1[i, j] <- "new"
      # Write "closed" only when a following year exists; assigning to column
      # n_years + 1 would silently append an extra column to df_out1.
      if (has_next) {
        df_out1[i, j + 1] <- "closed"
      }
    }
  }
}
print(df_out1)

## For output 2
# Every year in which the code is observed is labelled "exists".
# !is.na(df_dat2) is a logical matrix with the same dimensions as df_out2, so
# one masked assignment replaces the nested loops. It is also a no-op for an
# empty input, where 1:nrow() would have iterated over c(1, 0).
df_out2[!is.na(df_dat2)] <- "exists"
print(df_out2)

填寫 output 中的條目后,我只需使用rbind()加入數據幀。 隨后,我添加了帶有cbind()的代碼列。 最終 output 如下所示:

# Row-bind the two output data frames (code = 1 first, then code = 10000)
df_out <- rbind(df_out1, df_out2)

# Add the code column to the final output data frame.
# The codes live in df_dat_code; no object called `code` exists in the script.
# Only the leading rows of the input were processed into df_out1 / df_out2, so
# df_dat_code is trimmed to the same number of rows before binding.
df_out_fin <- cbind(head(df_dat_code, nrow(df_out)), df_out)

但同樣,這是解決問題的一種更加混亂和復雜的方式。 有沒有人有更好的解決方案,不需要我添加的大量步驟?

這是一個tidyverse方法:

library(tidyverse)

# Reshape to one row per (code, year), classify each year within its code,
# then spread the years back out into columns.
df_dat %>%
  pivot_longer(cols = -code, names_to = "name", values_to = "value") %>%
  group_by(code) %>%
  mutate(
    # present: the code is observed in this year
    # was_present: it was observed in the previous year (FALSE before the
    # first year, which has no predecessor)
    present = !is.na(value),
    was_present = lag(present, default = FALSE),
    value = case_when(
      all(present) ~ "exists",
      present & !was_present ~ "new",
      was_present & !present ~ "closed",
      TRUE ~ NA_character_
    )
  ) %>%
  ungroup() %>%
  # Drop the helper columns so only `code` identifies a row when widening
  select(code, name, value) %>%
  pivot_wider(names_from = name, values_from = value)

結果

# A tibble: 2 x 34
   code yr_1986 yr_1987 yr_1988 yr_1989 yr_1990 yr_1991 yr_1992 yr_1993 yr_1994 yr_1995 yr_1996 yr_1997 yr_1998 yr_1999 yr_2000 yr_2001 yr_2002 yr_2003 yr_2004 yr_2005 yr_2006 yr_2007 yr_2008 yr_2009 yr_2010 yr_2011 yr_2012 yr_2013 yr_2014 yr_2015 yr_2016 yr_2017 yr_2018
  <int> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
1     1 NA      NA      NA      NA      NA      new     closed  NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA     
2 10000 exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists  exists 

如果是按行,也可以使用pmap選項

library(dplyr)
library(purrr)

# Classify each row of year columns, then put the code column back in front.
bind_cols(
  df_dat[1],
  pmap_dfr(df_dat[-1], function(...) {
    # All year values of one row, and the same values shifted by one year
    current <- c(...)
    previous <- lag(current)
    status <- case_when(
      all(!is.na(current)) ~ 'EXISTS',
      !is.na(current) & is.na(previous) ~ "NEW",
      is.na(current) & !is.na(previous) ~ "CLOSED"
    )
    # Name the result after the year columns so the rows bind column-wise
    set_names(status, names(df_dat)[-1])
  })
)
#   code yr_1986 yr_1987 yr_1988 yr_1989 yr_1990 yr_1991 yr_1992 yr_1993 yr_1994 yr_1995 yr_1996 yr_1997 yr_1998 yr_1999
#1     1    <NA>    <NA>    <NA>    <NA>    <NA>     NEW  CLOSED    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
#2 10000  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS
#  yr_2000 yr_2001 yr_2002 yr_2003 yr_2004 yr_2005 yr_2006 yr_2007 yr_2008 yr_2009 yr_2010 yr_2011 yr_2012 yr_2013 yr_2014
#1    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
#2  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS  EXISTS
#  yr_2015 yr_2016 yr_2017 yr_2018
#1    <NA>    <NA>    <NA>    <NA>
#2  EXISTS  EXISTS  EXISTS  EXISTS

一個簡單的解決方案是創建存儲時間序列前后值的向量:

# TRUE where the first row of df_dat has a value in a given year column
x <- !is.na(df_dat[1, -1])
# The same flags shifted by one position: x.prec[k] is the flag of the year
# before k, x.foll[k] the flag of the year after k. The open end is NA.
x.prec <- c(NA, head(as.vector(x), -1L))
x.foll <- c(tail(as.vector(x), -1L), NA)

然后你可以找到,例如,所有新標志(值已設置,但前身未設置)

# NEW flag: set in this year but not in the preceding one. x.prec is NA for
# the first year, so the flag there is NA when x is TRUE and FALSE otherwise.
x.new <- !x.prec & x

與 CLOSED(最后設定值)等類似。

這是一個基本的 R 方法:

# TRUE where a year value is missing; one row per row of df_dat
mat <- is.na(df_dat[, -1L])
# Result labels, all NA to start with, same shape as mat
res <- matrix(NA_character_, nrow = nrow(mat), ncol = ncol(mat))

# code = 1 logic:
# a year is "new" when the value is present and the following year is missing;
# padding the shifted vector with FALSE means the last year can never match,
# so ind_new + 1L always stays inside the matrix
absent <- mat[1L, ]
absent_next <- c(absent[-1L], FALSE)
ind_new <- which(!absent & absent_next)
res[1L, ind_new] <- "new"
res[1L, ind_new + 1L] <- "closed"

# code = 10000 logic:
# every year with a value is labelled "exists"
absent <- mat[2L, ]
res[2L, !absent] <- "exists"

res
cbind(df_dat[1L], res)

基本上,我們使用is.na(df_dat[, -1L])來評估您的邏輯。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM