根据另一个数据帧R中的值填充数据帧中的缺失值

Question

Given two dataframes df1 and df2 with 4 columns each, I would like to do the following:给定两个数据df1和df2 ，每个数据框有 4 列，我想执行以下操作：

Expand df1 to have a date range between 1948-2016 .展开df1使其日期范围介于1948-2016之间。 For example:例如：

seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"), 
                                     as.Date("2016-12-31"), 
                                     by="day"))

The expanded rows should have the columns filled with NA展开的行应该NA填充的列

Replace all NAs in df1 with corresponding cell-specific values from df2 .将df1中的所有NAs替换为df2中相应的特定于单元格的值。

I tried the following but it did not work:我尝试了以下但没有用：

seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"), 
                                     as.Date("2016-12-31"), 
                                     by="day"))


data1=full_join(df1, seq_df) %>% arrange(Date)

Sample Data样本数据

df1=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 
3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 
17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 
10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data. Frame")

df2=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 
3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 
15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 
15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226, 
12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 
13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 
10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 
0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, 
-0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 
2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, 
-9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 
2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995, 
-2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, 
-2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data. Frame")

Answer 1

I think you want to use all of df1 's data where present, fill in df2 where df1 is missing, then fill out the Date for the rest of the timespan.我认为您想使用df1存在的所有数据，在缺少df1的地方填写df2 ，然后填写其余时间跨度的Date 。

The first step is to fill in df1 with data from df2 , coalesce (fill NA ) where needed, and remove the duplicate columns.第一步是用df2中的数据填充df1 ，在需要的地方合并（填充NA ），然后删除重复的列。

full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
  select(-ends_with(".y"))
#          Date       Pr    Tmax    Tmin
# 1  1980-06-01 0.000000 16.7579  2.3898
# 2  1980-06-02 0.000000 16.4250  3.7317
# 3  1980-06-03 0.000000 16.8751  2.3744
# 4  1980-06-04 0.000000 15.9877  0.7444
# 5  1980-06-05 0.000000 15.3752  1.3443
# 6  1980-06-06 0.000000 15.4138  0.9142
# 7  1980-06-07 0.000000 15.6038 -0.1077
# 8  1980-06-08 0.000000 14.1940 -0.1110
# 9  1980-06-09 0.000000 15.9599  1.2925
# 10 1980-06-10 0.000000 17.0566  2.4449
# 11 1980-06-11 0.000000 13.6396 -0.0419
# 12 1980-06-12 0.000000 15.8548  2.4895
# 13 1980-06-13 0.000000 15.9682  2.7414
# 14 1980-06-14 0.000000 17.4073  2.8424
# 15 1980-06-15 0.000000 17.9238  0.5135
# 16 1980-06-16 0.000000 10.1966 -5.2158
# 17 1980-06-17 0.000000 10.7029 -9.0718
# 18 1980-06-18 0.000000 13.5000  0.4438
# 19 1980-06-19 0.000000 13.0000 10.5000
# 20 1980-06-20 0.000000 12.0000 11.0000
# 21 1980-06-21 0.000000 11.2000 12.0000
# 22 1980-06-22 0.000000  8.5000 12.5000
# 23 1980-06-23 0.000000 12.0000  7.0000
# 24 1980-06-24 0.000000 13.0000 10.0000
# 25 1980-06-25 0.000000 17.5000 11.5000
# 26 1980-06-26 0.000000 17.1000  7.0000
# 27 1980-06-27 0.000000 17.1000  5.0000
# 28 1980-06-28 1.985656  9.5000  4.5000
# 29 1980-06-29 0.000000  6.5000  1.0000
# 30 1980-06-30 0.000000 10.3664  2.0000

From here, filling out the timespan is just another join:从这里开始，填写时间跨度只是另一个连接：

result <- full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
  select(-ends_with(".y")) %>%
  full_join(seq_df, by = "Date")

(optionally add %>% arrange(Date) ). （可选择添加%>% arrange(Date) ）。

-- --

Data (with the correct class= ):数据（具有正确的class= ）：

df1 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,  NA_real_, NA_real_, NA_real_), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data.frame")
df2 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,  12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, -0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, -9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 2.36690000000004, 0.0375000000000227,  -3.572, -2.93179999999995, -2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, -2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data.frame")
# data_frame is deprecated, use tibble
seq_df <- tibble(Date = seq.Date(as.Date("1948-01-01"), as.Date("2016-12-31"), by="day"))

根据另一个数据帧R中的值填充数据帧中的缺失值

问题描述

1 个解决方案

解决方案1
0 已采纳 2022-12-13 13:24:32

根据另一个数据帧R中的值填充数据帧中的缺失值

问题描述

1 个解决方案

解决方案1 0 已采纳 2022-12-13 13:24:32

解决方案1
0 已采纳 2022-12-13 13:24:32