Given two dataframes df1
and df2
with 4 columns each, I would like to do the following:
df1
to have a date range between 1948-2016
. For example:seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"),
as.Date("2016-12-31"),
by="day"))
The expanded rows should have the columns filled with NA
NAs
in df1
with corresponding cell-specific values from df2
.I tried the following but it did not work:
seq_df <- data_frame(Date = seq.Date(as.Date("1948-01-01"),
as.Date("2016-12-31"),
by="day"))
data1=full_join(df1, seq_df) %>% arrange(Date)
Sample Data
df1=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1,
17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7,
10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data. Frame")
df2=structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808,
3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819,
3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830,
3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877,
15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396,
15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226,
12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263,
13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001,
10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004,
0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966,
-0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983,
2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158,
-9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999,
2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995,
-2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002,
-2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data. Frame")
I think you want to use all of df1
's data where present, fill in df2
where df1
is missing, then fill out the Date
for the rest of the timespan.
The first step is to fill in df1
with data from df2
, coalesce (fill NA
) where needed, and remove the duplicate columns.
full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
select(-ends_with(".y"))
# Date Pr Tmax Tmin
# 1 1980-06-01 0.000000 16.7579 2.3898
# 2 1980-06-02 0.000000 16.4250 3.7317
# 3 1980-06-03 0.000000 16.8751 2.3744
# 4 1980-06-04 0.000000 15.9877 0.7444
# 5 1980-06-05 0.000000 15.3752 1.3443
# 6 1980-06-06 0.000000 15.4138 0.9142
# 7 1980-06-07 0.000000 15.6038 -0.1077
# 8 1980-06-08 0.000000 14.1940 -0.1110
# 9 1980-06-09 0.000000 15.9599 1.2925
# 10 1980-06-10 0.000000 17.0566 2.4449
# 11 1980-06-11 0.000000 13.6396 -0.0419
# 12 1980-06-12 0.000000 15.8548 2.4895
# 13 1980-06-13 0.000000 15.9682 2.7414
# 14 1980-06-14 0.000000 17.4073 2.8424
# 15 1980-06-15 0.000000 17.9238 0.5135
# 16 1980-06-16 0.000000 10.1966 -5.2158
# 17 1980-06-17 0.000000 10.7029 -9.0718
# 18 1980-06-18 0.000000 13.5000 0.4438
# 19 1980-06-19 0.000000 13.0000 10.5000
# 20 1980-06-20 0.000000 12.0000 11.0000
# 21 1980-06-21 0.000000 11.2000 12.0000
# 22 1980-06-22 0.000000 8.5000 12.5000
# 23 1980-06-23 0.000000 12.0000 7.0000
# 24 1980-06-24 0.000000 13.0000 10.0000
# 25 1980-06-25 0.000000 17.5000 11.5000
# 26 1980-06-26 0.000000 17.1000 7.0000
# 27 1980-06-27 0.000000 17.1000 5.0000
# 28 1980-06-28 1.985656 9.5000 4.5000
# 29 1980-06-29 0.000000 6.5000 1.0000
# 30 1980-06-30 0.000000 10.3664 2.0000
From here, filling out the timespan is just another join:
result <- full_join(df1, df2, by = "Date", suffix = c("", ".y")) %>%
mutate(across(c(Pr, Tmax, Tmin), ~ coalesce(., cur_data()[[ paste0(cur_column(), ".y") ]]))) %>%
select(-ends_with(".y")) %>%
full_join(seq_df, by = "Date")
(optionally add %>% arrange(Date)
).
--
Data (with the correct class=
):
df1 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Tmax = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.5, 13, 12, 11.2, 8.5, 12, 13, 17.5, 17.1, 17.1, 9.5, 6.5, NA), Tmin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10.5, 11, 12, 12.5, 7, 10, 11.5, 7, 5, 4.5, 1, 2)), row.names = c(NA, 30L), class = "data.frame")
df2 <- structure(list(Date = structure(c(3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833), class = "Date"), Pr = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.985656032, 0, 0), Tmax = c(16.7579, 16.425, 16.8751, 15.9877, 15.3752, 15.4138, 15.6038, 14.194, 15.9599, 17.0566, 13.6396, 15.8548, 15.9682, 17.4073, 17.9238, 10.1966, 10.7029, 13.6226, 12.7758, 15.0315000000001, 14.5553, 10.0413, 12.5804, 11.7263, 13.9642, 20.8888, 19.1779, 6.32770000000005, 8.32380000000001, 10.3664), Tmin = c(2.38980000000004, 3.73170000000005, 2.37440000000004, 0.744400000000041, 1.34430000000003, 0.914200000000051, -0.107699999999966, -0.11099999999999, 1.29250000000002, 2.44490000000002, -0.0418999999999983, 2.48950000000002, 2.7414, 2.8424, 0.513500000000022, -5.2158, -9.0718, 0.44380000000001, -1.51529999999997, -2.18459999999999, 2.36690000000004, 0.0375000000000227, -3.572, -2.93179999999995, -2.54819999999995, 0.20700000000005, 5.50260000000003, 1.28820000000002, -2.12149999999997, -7.39839999999998)), row.names = 11841:11870, class = "data.frame")
# data_frame is deprecated, use tibble
seq_df <- tibble(Date = seq.Date(as.Date("1948-01-01"), as.Date("2016-12-31"), by="day"))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.