简体   繁体   中英

Non linear interpolation to find the missing values using R

I have a data.table test_dt , in which some of the values are missing for each group . Can someone suggest the fastest way to interpolate and fill the missing value?

In the graph form x-axis is column a and y-axis is column b , hench the interval distance of column a should be considered while predicting b .

test_dt = structure(list(group = c("B1", "B1", "B1", "B1", "B1", "B1", "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"), a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10), b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255, 0.04, NA, NA, NA)), row.names = c(NA, -16L), class = c("data.table", "data.frame"), index = integer(0))
> test_dt 
    group     a      b
 1:    B1 165.0  1.925
 2:    B1 170.0  0.575
 3:    B1 185.0  0.300
 4:    B1 195.0     NA
 5:    B1 200.0     NA
 6:    B1 210.0  2.825
 7:    B1 220.0  9.050
 8:    B1 240.0 27.900
 9:    C1   1.0  3.775
10:    C1   1.5  3.225
11:    C1   2.0  2.750
12:    C1   4.5  0.255
13:    C1   5.0  0.040
14:    C1   5.5     NA
15:    C1   7.0     NA
16:    C1  10.0     NA 

Following are some conditions -

  1. Interpolated values should not be zero or negative value
  2. Interpolated values should not be a copy of the last non-NA value

I tried to solve it using na.spline but the result is not correct, especially for group C1 , where value of column a is 7 and 10

test_dt[, predicted := zoo::na.spline(zoo(.SD), x = a)[, 2], by = c("group")]
> test_dt
    group     a      b   predicted
 1:    B1 165.0  1.925  1.92500000
 2:    B1 170.0  0.575  0.57500000
 3:    B1 185.0  0.300  0.30000000
 4:    B1 195.0     NA  0.18981191
 5:    B1 200.0     NA  0.40167712
 6:    B1 210.0  2.825  2.82500000
 7:    B1 220.0  9.050  9.05000000
 8:    B1 240.0 27.900 27.90000000
 9:    C1   1.0  3.775  3.77500000
10:    C1   1.5  3.225  3.22500000
11:    C1   2.0  2.750  2.75000000
12:    C1   4.5  0.255  0.25500000
13:    C1   5.0  0.040  0.04000000
14:    C1   5.5     NA  0.03038963
15:    C1   7.0     NA  1.67389631
16:    C1  10.0     NA 16.44642968

Update - 1 Following is the second dataset. Using the solutions below, the predicted values are very high.

temp_dt = structure(list(group = c("K1", "K1", "K1", "K1", "K1", "K1", 
"K1", "K1", "K1", "K1", "K1", "K1", "K1", "F1", "F1", "F1", "F1", 
"F1", "F1"), a = c(185, 190, 195, 200, 202.5, 205, 210, 212.5, 
215, 217.5, 220, 222.5, 225, 22, 23, 24, 25, 26, 35), b = c(NA, 
NA, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, NA, 7.05, 8.95, 10.35, 
0.5, 0.7, 0.925, 1.2, 1.75, NA), predicted = c(463741.812236, 
14.7721571, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, 5.6346081, 
7.05, 8.95, 10.35, 0.5, 0.7, 0.925, 1.2, 1.75, 65226323253857.9
)), row.names = c(NA, -19L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x563cccc504c0>, index = integer(0))

test_dt[, predicted := round(exp(na.spline(log(b), x = a, na.rm = FALSE)), 7), by = group]


    group     a      b    predicted
 1:    K1 185.0     NA 4.637418e+05
 2:    K1 190.0     NA 1.477216e+01
 3:    K1 195.0  0.450 4.500000e-01
 4:    K1 200.0  0.600 6.000000e-01
 5:    K1 202.5  1.050 1.050000e+00
 6:    K1 205.0  1.450 1.450000e+00
 7:    K1 210.0  2.700 2.700000e+00
 8:    K1 212.5  3.125 3.125000e+00
 9:    K1 215.0  4.300 4.300000e+00
10:    K1 217.5     NA 5.634608e+00
11:    K1 220.0  7.050 7.050000e+00
12:    K1 222.5  8.950 8.950000e+00
13:    K1 225.0 10.350 1.035000e+01
14:    F1  22.0  0.500 5.000000e-01
15:    F1  23.0  0.700 7.000000e-01
16:    F1  24.0  0.925 9.250000e-01
17:    F1  25.0  1.200 1.200000e+00
18:    F1  26.0  1.750 1.750000e+00
19:    F1  35.0     NA 6.522632e+13

There are infinite ways to do this, each way having a certain set of assumptions. Here is one way, transforming the data into the log scale, performing linear interpolation, and then exponentiating the result. This should approach 0 but not be exactly 0.

library(zoo)
round(exp(na.spline(log(test_dt$b),na.rm=F,method="natural")),5)
 [1]  1.92500  0.57500  0.30000  0.39398  0.96937  2.82500  9.05000 27.90000  3.77500
[10]  3.22500  2.75000  0.25500  0.04000  0.00835  0.00175  0.00036

Also you should perform this separately for each group, so that there is no interpolation between groups.

Ultimately, you have to decide the interpolation procedure based on scientific background. However, in order to avoid producing negative values, the log-transformation is useful. In the following, I combine that with a spline interpolation.

library(data.table)
test_dt = data.table(group = c("B1", "B1", "B1", "B1", "B1", "B1", 
                               "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"), 
                     a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10), 
                     b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255, 
                           0.04, NA, NA, NA))

library(zoo)
test_dt[, c := exp(na.spline(log(b), x = a, na.rm = FALSE)), by = group]

library(ggplot2)
ggplot(test_dt, aes(x = a, color = group)) +
  geom_line(aes(y = c)) +
  geom_point(aes(y = c, color = "interpolated")) +
  geom_point(aes(y = b))

结果图显示原始值和插值

Try different methods of spline.
Also here I catch values lower than 0.0001 with pmax .

met <- c("fmm", "natural", "monoH.FC")
cbind(test_dt, lapply(setNames(met, met), function(met) {
  ave(seq_len(nrow(test_dt)), test_dt$group, FUN=function(i) {
    pmax(0.0001, splinefun(test_dt$a[i], test_dt$b[i], met)(test_dt$a[i]))
  }) }) )
#   group     a      b         fmm    natural   monoH.FC
#1     B1 165.0  1.925  1.92500000  1.9250000  1.9250000
#2     B1 170.0  0.575  0.57500000  0.5750000  0.5750000
#3     B1 185.0  0.300  0.30000000  0.3000000  0.3000000
#4     B1 195.0     NA  0.18981191  0.2372716  0.3751691
#5     B1 200.0     NA  0.40167712  0.4448177  0.6702794
#6     B1 210.0  2.825  2.82500000  2.8250000  2.8250000
#7     B1 220.0  9.050  9.05000000  9.0500000  9.0500000
#8     B1 240.0 27.900 27.90000000 27.9000000 27.9000000
#9     C1   1.0  3.775  3.77500000  3.7750000  3.7750000
#10    C1   1.5  3.225  3.22500000  3.2250000  3.2250000
#11    C1   2.0  2.750  2.75000000  2.7500000  2.7500000
#12    C1   4.5  0.255  0.25500000  0.2550000  0.2550000
#13    C1   5.0  0.040  0.04000000  0.0400000  0.0400000
#14    C1   5.5     NA  0.03038963  0.0001000  0.0001000
#15    C1   7.0     NA  1.67389631  0.0001000  0.0001000
#16    C1  10.0     NA 16.44642968  0.0001000  0.0001000

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM