简体   繁体   中英

How can I fill in missing rows for monthly time series data?

Here's the dput of my data:

structure(list(date = structure(c(8596, 8631, 8659, 8687, 8733, 
8743, 8796, 8806, 8853, 8880, 8908, 8932, 8971, 8999, 9027, 9069, 
9097, 9111, 9160, 9188, 9212, 9230, 9279, 9309, 9328, 9363, 9391, 
9434, 9449, 9482, 9519, 9541, 9580, 9610, 9643, 9672, 9708, 9736, 
9764, 9799, 9827, 9850, 9890, 9920, 9947, 9975, 10007, 10038, 
10072, 10100, 10122, 10163, 10191, 10213, 10254, 10282, 10310, 
10345, 10354, 10385, 10418, 10469, 10497, 10528, 10556, 10570, 
10612, 10641, 10668, 10710, 10742, 10759, 10802, 10830, 10858, 
10893, 10914, 10947, 10984, 11010, 11038, 11066, 11096, 11135, 
11164, 11193, 11229, 11257, 11285, 11313, 11346, 11374, 11411, 
11435, 11467, 11502, 11514, 11565, 11592, 11621, 11649, 11677, 
11718, 11746, 11776, 11797, 11838, 11867, 11894, 11923, 11951, 
11979, 12021, 12035, 12077, 12105, 12133, 12160, 12189, 12231, 
12259, 12273, 12315, 12356, 12385, 12399, 12441, 12472, 12497, 
12538, 12553, 12591, 12630, 12658, 12686, 12714, 12742, 12770, 
12804, 12832, 12860, 12903, 12917, 12938, 12986, 13015, 13056, 
13085, 13116, 13139, 13169, 13204, 13232, 13260, 13288, 13301, 
13357, 13385, 13414, 13442, 13470, 13498, 13533, 13561, 13603, 
13631, 13658, 13694, 13722, 13750, 13778, 13805, 13846, 13862, 
13896, 13925, 13967, 13995, 14009, 14050, 14078, 14121, 14149, 
14177, 14205, 14233, 14268, 14296, 14323, 14352, 14380, 14449, 
14474, 14506, 14548, 14575, 14590, 14618, 14661, 14688, 14729, 
14758, 14761, 14821, 14849, 14877, 14905, 14933, 14961, 14995, 
15024, 15038, 15093, 15121, 15135, 15185, 15212, 15241, 15269, 
15297, 15325, 15360, 15387, 15430, 15458, 15485, 15513, 15542, 
15583, 15611, 15639, 15667, 15696, 15731, 15745, 15786, 15815, 
15842, 15917, 15945, 15966, 16001, 16030, 16076, 16129, 16143, 
16184, 16276, 16303, 16343, 16374, 16400, 16417, 16455, 16482, 
16525, 16553, 16585, 16612, 16646, 16678, 16706, 16729, 16752, 
16777, 16819, 16860, 16891, 16916, 16925, 16976, 17002, 17042, 
17072, 17100, 17120, 17141, 17178, 17224, 17245, 17261, 17304, 
17330, 17373, 17401, 17459, 17488, 17512, 17548, 17581, 17598, 
17631), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(16.5027083333333, 
17.325, 17.1888888888889, 15.8277777777778, 16.6583333333333, 
17.3333333333333, 16.64375, 17.1133333333333, 17.895119047619, 
18.5694444444444, 18.8222222222222, 17.4305555555556, 17.6555555555556, 
17.025, 17.3222222222222, 17.2770833333333, 17.4805555555556, 
16.9708333333333, 17.9666666666667, 17.1222222222222, 18.0166666666667, 
17.25, 18.1875, 17.6577777777778, 16.6541666666667, 17.1083333333333, 
16.4666666666667, 17.5972756410256, 17.2, 17.4444444444444, 16.95, 
17.7, 17.9222222222222, 18.4875, 17.8229166666667, 16.9166666666667, 
16.7083333333333, 17.1666666666667, 17.3111111111111, 18.2333333333333, 
16.6277777777778, 17.5875, 17.3833333333333, 17.4638888888889, 
17.725, 18.1388888888889, 17.7001111111111, 17.7222222222222, 
17.2041666666667, 17.8255952380952, 17.1833333333333, 17.8103070175439, 
17.8194444444444, 17.952, 18.158412414966, 18.4910714285714, 
18.3488562091503, 19.1341830065359, 18.45, 18.9107142857143, 
17.2275, 19.0828761904762, 18.1599701591512, 18.965739220457, 
18.6720606060606, 18.8786057692308, 18.602656449553, 18.6327347883598, 
19.2925198412698, 20.1952463624339, 18.8900384227765, 18.0934444444444, 
18.0554871794872, 17.8405270655271, 17.5540598290598, 17.454122110648, 
17.5764155982906, 16.9989942528736, 16.4252032967033, 16.5388571428571, 
17.0108695652174, 17.7725308641975, 18.4252564102564, 17.2278899240856, 
17.3102091315453, 17.3627204585538, 17.280641025641, 17.3746616809117, 
17.3014601139601, 17.2238271604938, 16.379012345679, 16.6044444444444, 
17.624415954416, 18.4023148148148, 18.0341435185185, 17.3016666666667, 
17.8204861111111, 17.827264957265, 17.2772467320261, 17.8786954365079, 
17.84375, 17.1732638888889, 16.9219907407407, 17.3826388888889, 
17.7413333333333, 18.4948412698413, 18.2363425925926, 17.3282057823129, 
17.5083333333333, 17.414898989899, 16.9453125, 17.4988095238095, 
17.6704012345679, 18.1333333333333, 18.11875, 17.4805555555556, 
17.4271367521368, 17.9006944444444, 17.9818181818182, 17.3125, 
16.73625, 17.2666666666667, 17.4279340277778, 17.8584444444444, 
17.2966666666667, 17.1, 18.3420833333333, 18.5814285714286, 17.6430555555556, 
18.2307122507123, 18.0830687830688, 16.7563492063492, 16.9055555555556, 
17.0090277777778, 17.3863095238095, 16.9139880952381, 16.7479166666667, 
17.0888888888889, 17.7648148148148, 18.2277777777778, 19.3694444444444, 
17.7064021164021, 18.7371527777778, 17.94375, 17.9416666666667, 
17.8736111111111, 18.5354166666667, 18.1919444444444, 18.2555555555556, 
17.7704365079365, 17.3509259259259, 17.3931216931217, 18.3355923202614, 
17.9180555555556, 18.2104166666667, 18.0171121593291, 17.6840277777778, 
17.5509259259259, 16.9631313131313, 17.4478070175439, 17.6916666666667, 
17.6143376068376, 18.7415656565657, 19.0048611111111, 18.285462962963, 
18.3816964285714, 18.2041310541311, 17.2343518518519, 17.2149382716049, 
17.3684027777778, 17.5229861111111, 16.8517857142857, 19.0929141414141, 
19.300404040404, 18.735, 17.9280277777778, 18.4470274170274, 
19.0686597406425, 18.325, 18.5, 18.4388888888889, 18.7291666666667, 
18.3708333333333, 18.0234918630752, 19.4925980392157, 19.2101488095238, 
19.3890625, 18.5150793650794, 19.1944444444444, 19.0815277777778, 
19.5192658730159, 17.2212418300654, 17.8081168831169, 18.2517361111111, 
17.7775555555556, 18.012962962963, 17.0347222222222, 16.5888888888889, 
18.8123101604278, 18.9187091503268, 19.0161111111111, 19.2625, 
20.875, 18.8092592592593, 18.6526515151515, 18.9083333333333, 
18.9835227272727, 18.1829292929293, 17.9060606060606, 17.7835227272727, 
17.8237719298246, 19.7386363636364, 18.4961051693405, 18.5332727272727, 
18.3787878787879, 18.5134199134199, 17.8098930481283, 18.4179292929293, 
17.230303030303, 18.9035064935065, 17.8935897435897, 17.6211966604824, 
17.9238095238095, 18.8382886904762, 19.42625, 18.6395833333333, 
18.0652777777778, 19.3354166666667, 18.75359375, 17.951123043623, 
17.6063068181818, 17.828022875817, 17.5528846153846, 18.5647727272727, 
19.0318181818182, 19.1659090909091, 18.8997564935065, 19.1301136363636, 
18.1705882352941, 17.1361570247934, 18.6090909090909, 18.1429951690821, 
17.8829545454545, 18.3387983091787, 18.41875, 19.7, 20.2508333333333, 
17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812, 
17.246412037037, 17.0846153846154, 17.7060185185185, 18.325, 
18.5408333333333, 19.4251587301587, 18.3706018518519, 17.917, 
17.91, 18.6451388888889, 18.29375, 17.2316666666667, 18.7189393939394, 
18.1669193548387, 18.367297979798, 17.7043055555556, 18.1879520697168, 
19.12, 20.425, 18.6663888888889, 17.5108796296296, 18.1883333333333, 
18.3060049019608, 18.32625, 18.2861111111111, 18.0375, 17.3445175438596, 
18.6451058201058, 18.97875, 19.4583333333333, 18.2597222222222, 
19.9197222222222, 18.2342307692308, 18.7666666666667, 19.8277777777778, 
17.6464285714286, 18.690873015873, 18.4520833333333, 19.8696428571429, 
19.9833333333333, 18.2416666666667)), class = "data.frame", row.names = c(NA, 
-292L))

My dates are in YYYY-MM-DD format and the data is monthly. Right now, data is missing for a few months (e.g. 2017-09, 2014-05, 2014-06, 2013-12), and those months simply have no row in the data frame. How do I create a new row for every missing month across my entire dataset? Since my dataset has two columns, the other column besides the date column should have an NA value in each newly added row. I'm looking for a tidyverse, lubridate, or data.table solution.

You can use tidyr::complete for this, but you have the additional wrinkle that your dates fall on different days in each month. So first you need to make a column that identifies the month each date belongs to, which we can do by setting every date's day to 1 with the `day(x) <- value` setter from lubridate.

Here's an example using the provided data truncated to 2014 for conciseness. Note that you should use seq.Date to specify the full range of dates that you want to be included in the month column, and that you will also have NAs in the date column for the added months. (You can replace them with the first of the month if you want.)

library(tidyverse)
library(lubridate)

# Ten observations from 2014 (May and June are absent), cut down from the
# question's dput() so the example output stays short.
tbl <- structure(
  list(
    # Dates are stored as day counts since 1970-01-01; the attributes are
    # kept exactly as dput() emitted them.
    date = structure(
      c(16076, 16129, 16143, 16184, 16276, 16303, 16343, 16374, 16400, 16417),
      tzone = "UTC",
      tclass = "Date",
      class = "Date"
    ),
    AverageTemp = c(
      18.3387983091787, 18.41875, 19.7, 20.2508333333333, 17.6387426900585,
      18.1770897832817, 17.5400297619048, 17.7547246376812, 17.246412037037,
      17.0846153846154
    )
  ),
  row.names = c(NA, -10L),
  class = "data.frame"
)

# Step 1: derive a `month` key by setting the day of every date to 1 with
#         lubridate's `day<-` setter (called here in its function form).
# Step 2: have complete() add a row for each month start between the first
#         and last observed month; the other columns are NA in added rows.
tbl %>%
  mutate(month = `day<-`(date, 1)) %>%
  complete(
    month = seq.Date(from = min(month), to = max(month), by = "month")
  )
#> # A tibble: 12 x 3
#>    month      date       AverageTemp
#>    <date>     <date>           <dbl>
#>  1 2014-01-01 2014-01-06        18.3
#>  2 2014-02-01 2014-02-28        18.4
#>  3 2014-03-01 2014-03-14        19.7
#>  4 2014-04-01 2014-04-24        20.3
#>  5 2014-05-01 NA                NA  
#>  6 2014-06-01 NA                NA  
#>  7 2014-07-01 2014-07-25        17.6
#>  8 2014-08-01 2014-08-21        18.2
#>  9 2014-09-01 2014-09-30        17.5
#> 10 2014-10-01 2014-10-31        17.8
#> 11 2014-11-01 2014-11-26        17.2
#> 12 2014-12-01 2014-12-13        17.1

As an alternative, you can instead just get the year and month components and use complete on the combination of the two:

# Split each date into its calendar year and month number, then let
# complete() add every year x month combination that is not yet present.
# Note: months 1-12 are crossed with every year in the observed range, so a
# partial first or last year is padded out to a full calendar year.
tbl %>%
  mutate(
    year = year(date),
    month = month(date)
  ) %>%
  complete(
    year = seq(min(year), max(year)),
    month = seq_len(12)
  )
#> # A tibble: 12 x 4
#>     year month date       AverageTemp
#>    <dbl> <dbl> <date>           <dbl>
#>  1  2014     1 2014-01-06        18.3
#>  2  2014     2 2014-02-28        18.4
#>  3  2014     3 2014-03-14        19.7
#>  4  2014     4 2014-04-24        20.3
#>  5  2014     5 NA                NA  
#>  6  2014     6 NA                NA  
#>  7  2014     7 2014-07-25        17.6
#>  8  2014     8 2014-08-21        18.2
#>  9  2014     9 2014-09-30        17.5
#> 10  2014    10 2014-10-31        17.8
#> 11  2014    11 2014-11-26        17.2
#> 12  2014    12 2014-12-13        17.1

Created on 2019-03-20 by the reprex package (v0.2.1)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. Any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM