[英]Fill missing values in a dataframe based on values from another dataframe R
給定兩個數據框df1
和df2
,每個數據框有 4 列,我想執行以下操作:
擴展df1
,使其日期范圍介於1948-2016
之間。 例如:seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"),
as.Date("2016-12-31"),
by="day"))
擴展出的行,其余各列應以NA
填充。
將df1
中的所有NA
替換為df2
中對應單元格的值。我嘗試了以下代碼,但沒有成功:
# Attempted approach: build a calendar with one row per day from 1948-01-01
# through 2016-12-31, then full-join df1 onto it so every day gets a row.
# This only pads the date range (days missing from df1 come back with NA in
# the non-key columns); it does not pull any values from df2.
# `tibble()` is used because `data_frame()` is deprecated.
seq_df <- tibble(
  Date = seq.Date(as.Date("1948-01-01"), as.Date("2016-12-31"), by = "day")
)
# Join explicitly on the shared key, then sort chronologically.
data1 <- full_join(df1, seq_df, by = "Date") %>%
  arrange(Date)
樣本數據
# Sample observations: 30 daily rows (1980-06-01 to 1980-06-30) with columns
# Date, Pr, Tmax, Tmin. Pr is entirely NA; Tmax and Tmin are partly NA.
# The class must be exactly "data.frame" (the original "data. Frame" is not a
# class R knows, so the object would not behave as a data frame).
df1 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1,
17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7,
10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data.frame")
# Gap-filling source: 30 daily rows for the same dates as df1 (1980-06-01 to
# 1980-06-30) with complete Pr, Tmax and Tmin values.
# The class must be exactly "data.frame" (the original "data. Frame" is not a
# class R knows, so the object would not behave as a data frame).
df2 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877,
15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396,
15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,
12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263,
13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001,
10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004,
0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966,
-0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983,
2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158,
-9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999,
2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995,
-2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002,
-2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data.frame")
我認為您想在df1
有數據的地方一律使用df1
的數據,在其缺失的地方用df2
填補,然后把Date
擴展到其余的時間跨度。
第一步是用df2
中的數據填充df1
:按Date連接后,用coalesce()在需要的地方合並兩側的值(即填補NA
),然后刪除連接產生的重復列。
# Patch df1 with df2, cell by cell:
#   1. full-join on Date; df2's copies of the value columns get a ".y" suffix
#      while df1's keep their original names;
#   2. for each of Pr/Tmax/Tmin keep df1's value unless it is NA, in which
#      case take the value from the matching ".y" column;
#   3. drop the ".y" helper columns.
df1 %>%
  full_join(df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(
    across(
      c(Pr, Tmax, Tmin),
      function(x) coalesce(x, cur_data()[[paste0(cur_column(), ".y")]])
    )
  ) %>%
  select(-ends_with(".y"))
# Date Pr Tmax Tmin
# 1 1980-06-01 0.000000 16.7579 2.3898
# 2 1980-06-02 0.000000 16.4250 3.7317
# 3 1980-06-03 0.000000 16.8751 2.3744
# 4 1980-06-04 0.000000 15.9877 0.7444
# 5 1980-06-05 0.000000 15.3752 1.3443
# 6 1980-06-06 0.000000 15.4138 0.9142
# 7 1980-06-07 0.000000 15.6038 -0.1077
# 8 1980-06-08 0.000000 14.1940 -0.1110
# 9 1980-06-09 0.000000 15.9599 1.2925
# 10 1980-06-10 0.000000 17.0566 2.4449
# 11 1980-06-11 0.000000 13.6396 -0.0419
# 12 1980-06-12 0.000000 15.8548 2.4895
# 13 1980-06-13 0.000000 15.9682 2.7414
# 14 1980-06-14 0.000000 17.4073 2.8424
# 15 1980-06-15 0.000000 17.9238 0.5135
# 16 1980-06-16 0.000000 10.1966 -5.2158
# 17 1980-06-17 0.000000 10.7029 -9.0718
# 18 1980-06-18 0.000000 13.5000 0.4438
# 19 1980-06-19 0.000000 13.0000 10.5000
# 20 1980-06-20 0.000000 12.0000 11.0000
# 21 1980-06-21 0.000000 11.2000 12.0000
# 22 1980-06-22 0.000000 8.5000 12.5000
# 23 1980-06-23 0.000000 12.0000 7.0000
# 24 1980-06-24 0.000000 13.0000 10.0000
# 25 1980-06-25 0.000000 17.5000 11.5000
# 26 1980-06-26 0.000000 17.1000 7.0000
# 27 1980-06-27 0.000000 17.1000 5.0000
# 28 1980-06-28 1.985656 9.5000 4.5000
# 29 1980-06-29 0.000000 6.5000 1.0000
# 30 1980-06-30 0.000000 10.3664 2.0000
從這里開始,填寫時間跨度只是另一個連接:
# Patch df1's NA cells from df2 (join on Date, coalesce each value column with
# its ".y" counterpart, drop the ".y" columns), then full-join the daily
# calendar so every date in seq_df has a row; dates that exist only in seq_df
# carry NA in Pr/Tmax/Tmin.
result <- df1 %>%
  full_join(df2, by = "Date", suffix = c("", ".y")) %>%
  mutate(
    across(
      c(Pr, Tmax, Tmin),
      function(x) coalesce(x, cur_data()[[paste0(cur_column(), ".y")]])
    )
  ) %>%
  select(-ends_with(".y")) %>%
  full_join(seq_df, by = "Date")
(可選擇添加%>% arrange(Date)
)。
--
數據(具有正確的class=
):
# Sample observations: 30 daily rows, 1980-06-01 (day 3804 since the epoch)
# through 1980-06-30 (day 3833). Pr is entirely NA, Tmax is NA for rows 1-17
# and row 30, Tmin is NA for rows 1-18.
df1 <- structure(
  list(
    Date = structure(as.numeric(3804:3833), class = "Date"),
    Pr = rep(NA_real_, 30),
    Tmax = c(
      rep(NA, 17),
      13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 17.1, 9.5, 6.5,
      NA
    ),
    Tmin = c(
      rep(NA, 18),
      10.5, 11, 12, 12.5, 7, 10, 11.5, 7, 5, 4.5, 1, 2
    )
  ),
  row.names = c(NA, 30L),
  class = "data.frame"
)
# Gap-filling source: 30 daily rows for 1980-06-01 through 1980-06-30 with
# complete Pr, Tmax and Tmin values. Pr is 0 everywhere except row 28.
# Row names 11841:11870 are kept as in the original data.
df2 <- structure(
  list(
    Date = structure(as.numeric(3804:3833), class = "Date"),
    Pr = c(rep(0, 27), 1.985656032, 0, 0),
    Tmax = c(
      16.7579, 16.425, 16.8751, 15.9877, 15.3752, 15.4138,
      15.6038, 14.194, 15.9599, 17.0566, 13.6396, 15.8548,
      15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,
      12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263,
      13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 10.3664
    ),
    Tmin = c(
      2.38980000000004, 3.73170000000005, 2.37440000000004,
      0.744400000000041, 1.34430000000003, 0.914200000000051,
      -0.107699999999966, -0.11099999999999, 1.29250000000002,
      2.44490000000002, -0.0418999999999983, 2.48950000000002,
      2.7414, 2.8424, 0.513500000000022,
      -5.2158, -9.0718, 0.44380000000001,
      -1.51529999999997, -2.18459999999999, 2.36690000000004,
      0.0375000000000227, -3.572, -2.93179999999995,
      -2.54819999999995, 0.20700000000005, 5.50260000000003,
      1.28820000000002, -2.12149999999997, -7.39839999999998
    )
  ),
  row.names = 11841:11870,
  class = "data.frame"
)
# `data_frame()` is deprecated, so the calendar is built with `tibble()`.
# One row per calendar day from 1948-01-01 through 2016-12-31.
seq_df <- tibble(
  Date = seq.Date(
    from = as.Date("1948-01-01"),
    to = as.Date("2016-12-31"),
    by = "day"
  )
)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.