簡體   English   中英

如何通過R中的插值替換NA

[英]How to replace NA by interpolation in R

我有以下數據集,所有列中都有很多 NA 值(Date.Local 除外,所有值都可用。)我想通過插值/外插替換所有 GA。

在此處輸入圖片說明

到目前為止,我嘗試了以下代碼:

library("zoo")
nev <- na.approx(név)

Error in approx(x[!na], y[!na], xout, ...) : 
  need at least two non-NA values to interpolate
In addition: Warning messages:
1: In na.approx.default(object, x = x, xout = xout, na.rm = FALSE,  :
  NAs introduced by coercion
2: In na.approx.default(object, x = x, xout = xout, na.rm = FALSE,  :
  NAs introduced by coercion
3: In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
}

這對刪除 NA 沒有任何作用。 同樣對於最后 4 列,最后 ~100 行的數據為 NA。 在此處輸入圖片說明

在此處輸入圖片說明 將 nev 輸出為 .csv 文件。

"","Date.Local","cumulative_cases","cumulative_deaths","new_cases","new_deaths","pm2.5","temp","wind","press"
"456373",2020-01-22,0,0,NA,NA,10.8892157058824,50.3291667,102.76500005,957.083333
"456374",2020-01-23,0,0,0,0,12.1433946153846,53.4416666,106.43416665,957.375
"456375",2020-01-24,0,0,0,0,9.71062371428572,53.4444445555556,104.74458335,953.541667
"456376",2020-01-25,0,0,0,0,13.090885375,54.0509258888889,106.2454166,955.083333
"456377",2020-01-26,0,0,0,0,10.4494047857143,54.1666665,107.4506249,954.125
"456378",2020-01-27,0,0,0,0,4.04497283333333,57.2541667,141.6875,958
"456379",2020-01-28,0,0,0,0,6.8008605625,52.9083334,101.40020845,956.958333
"456380",2020-01-29,0,0,0,0,4.70659716666667,53.2583333,108.96375,954.416667
"456381",2020-01-30,0,0,0,0,3.48005957142857,52.2416666,98.6919021,957.958333
"456382",2020-01-31,0,0,0,0,8.5216145625,52.3666666,102.20354165,963.208333
"456383",2020-02-01,0,0,0,0,13.1172619285714,54.911458375,108.7499999,961.75
"456384",2020-02-02,0,0,0,0,12.60625,57.2791668,105.81625,946.352941
"456385",2020-02-03,0,0,0,0,4.34877458823529,41.475,158.19916665,948.666667
"456386",2020-02-04,0,0,0,0,2.57147446153846,39.7583334,124.341666631579,956.958333
"456387",2020-02-05,0,0,0,0,4.95521478571429,40.7041667,97.09187505,956.75
"456388",2020-02-06,0,0,0,0,9.55396126666667,49.3425926666667,91.9258334,953.5
"456389",2020-02-07,0,0,0,0,11.0976967142857,54.3541667,104.406798210526,953.541667
"456390",2020-02-08,0,0,0,0,12.3601190714286,54.3009258888889,99.99000005,948.083333
"456391",2020-02-09,0,0,0,0,4.69017857142857,49.7685183333333,130.606875,948.083333
"456392",2020-02-10,0,0,0,0,2.33012815384615,52.7124999,109.421929842105,951.458333
"456393",2020-02-11,0,0,0,0,2.97980769230769,49.3958334,102.6325001,955.083333
"456394",2020-02-12,0,0,0,0,7.11474353846154,49.7349637,102.7526359,953.125
"456395",2020-02-13,0,0,0,0,8.01398807142857,50.4958333,97.35916665,954.625
"456396",2020-02-14,0,0,0,0,10.4294871538462,51.7625,100.0085417,954.041667
"456397",2020-02-15,0,0,0,0,11.1705556,53.7416667,103.3418751,955.416667
"456398",2020-02-16,0,0,0,0,13.39629625,56.3148147777778,98.75041665,952.791667
"456399",2020-02-17,0,0,0,0,9.0129166,58.9958334,104.1306249,950.833333
"456400",2020-02-18,0,0,0,0,6.68954376923077,52.3875,85.109375,954.333333
"456401",2020-02-19,0,0,0,0,8.20976971428571,52.8208334,82.0013157368421,955.166667
"456402",2020-02-20,0,0,0,0,5.87976185714286,52.6125001,99.1868421578947,959.708333
"456403",2020-02-21,0,0,0,0,9.2473958125,56.2083334,97.72770845,956.166667
"456404",2020-02-22,0,0,0,0,8.31025638461539,53.0046297777778,84.2597917,950.458333
"456405",2020-02-23,0,0,0,0,5.02708333333333,55.6342593333333,110.4885418,955.333333
"456406",2020-02-24,0,0,0,0,6.88227053333333,57.9750001,95.65458335,957.458333
"456407",2020-02-25,0,0,0,0,4.43333342857143,52.4958333,67.62458335,964.083333
"456408",2020-02-26,0,0,0,0,6.26319441666667,50.8518516666667,90.12708335,965.5
"456409",2020-02-27,0,0,0,0,9.36715682352941,55.7083333,97.30375,961.916667
"456410",2020-02-28,0,0,0,0,7.56869828571429,58.1708335,98.71,958.416667
"456411",2020-02-29,0,0,0,0,10.5327381428571,57.1666667,92.84645835,949.916667
"456412",2020-03-01,0,0,0,0,5.1469444,56.4875001,115.70333325,944.375
"456413",2020-03-02,0,0,0,0,2.32756407692308,52.3958334,126.09875015,950.041667
"456414",2020-03-03,0,0,0,0,2.07640746153846,60.0374999,109.85041665,951.791667
"456415",2020-03-04,0,0,0,0,4.61678741666667,60.7749998,97.27749995,955.166667
"456416",2020-03-05,1,0,1,0,7.44285714285714,61.6666666666667,102.046875,957.291667
"456417",2020-03-06,1,0,0,0,7.32589285714286,64.3666666,87.91520845,951.916667
"456418",2020-03-07,1,0,0,0,6.71785707142857,63.0416666666667,103.56520825,945.875
"456419",2020-03-08,2,0,1,0,3.08910253846154,59.5333333,101.919375,948.25
"456420",2020-03-09,2,0,0,0,3.83044876923077,59.4907406666667,102.21375005,953.916667
"456421",2020-03-10,2,0,0,0,4.011197875,55.5833332,94.80375005,956.708333
"456422",2020-03-11,5,0,3,0,3.90032053846154,59.2685184444444,81.66229165,953.041667
"456423",2020-03-12,8,0,3,0,4.21160714285714,58.9833333,86.2745833,946.625
"456424",2020-03-13,16,0,8,0,1.71215275,55.9583333,102.529824526316,945
"456425",2020-03-14,16,0,0,0,2.611218,58.9666666,95.77375005,950.791667
"456426",2020-03-15,16,0,0,0,3.92202385714286,58.3703703333333,93.4629167,951.583333
"456427",2020-03-16,35,1,19,1,4.23482135714286,57.5375,87.1118751,948.875
"456428",2020-03-17,42,1,7,0,2.43214285714286,53.9083333,104.4222916,946.916667
"456429",2020-03-18,42,1,0,0,2.90993584615385,50.8249999,76.85854175,944.041667
"456430",2020-03-19,69,1,27,0,3.41448138461538,49.9916666,93.75895835,950.791667
"456431",2020-03-20,126,2,57,1,2.95059521428571,50.5208334,58.7370833,957.791667
"456432",2020-03-21,126,2,0,0,3.87692315384615,56.4791665,102.38687495,955.041667
"456433",2020-03-22,126,2,0,0,4.86472226666667,57.7222221111111,85.915625,953.958333
"456434",2020-03-23,126,4,0,2,2.06212127272727,58.4083333,113.3679166,951.083333
"456435",2020-03-24,212,4,86,0,2.57980769230769,59.4458334,97.6064583,951.208333
"456436",2020-03-25,249,6,37,2,3.79613092857143,58.3101853333333,108.9895834,946.5
"456437",2020-03-26,350,6,101,0,2.65693976923077,51.5916668,124.288125,946.125
"456438",2020-03-27,443,10,93,4,2.3113095,51.45,138.6695833,950
"456439",2020-03-28,443,10,0,0,4.54093135294118,55.4041665,87.8764583,954.083333
"456440",2020-03-29,528,14,85,4,4.34166669230769,57.2824073333333,112.5289582,951.083333
"456441",2020-03-30,753,14,225,0,4.18589746153846,59.3083333,102.16439395,956.416667
"456442",2020-03-31,869,23,116,9,5.35386907142857,63.7625,94.4827084,951.708333
"456443",2020-04-01,961,28,92,5,NA,NA,NA,NA
"456444",2020-04-02,1125,34,164,6,NA,NA,NA,NA
"456445",2020-04-03,1279,39,154,5,NA,NA,NA,NA
"456446",2020-04-04,1418,41,139,2,NA,NA,NA,NA
"456447",2020-04-05,1519,41,101,0,NA,NA,NA,NA
"456448",2020-04-06,1608,41,89,0,NA,NA,NA,NA
"456449",2020-04-07,1734,54,126,13,NA,NA,NA,NA

使用您提供的示例數據, zoo:na.approx以及imputeTS::na_interpolation工作正常,但是您需要小心,因為當您需要填充至少兩個現有值之間的缺失值時,插值可以工作,並且事實上您的樣本數據一直到數據末尾都有缺失值,這將導致某些函數(此處為imputeTS::na_interpolation )用最后看到的值填充缺失值,而其他函數則完全刪除這些 NA(如zoo:na.approx )。 這是一個例子。

讀取數據

options(stringsAsFactors = FALSE)
nev = read.csv(text='"","Date.Local","cumulative_cases","cumulative_deaths","new_cases","new_deaths","pm2.5","temp","wind","press"
"456373",2020-01-22,0,0,NA,NA,10.8892157058824,50.3291667,102.76500005,957.083333
"456374",2020-01-23,0,0,0,0,12.1433946153846,53.4416666,106.43416665,957.375
"456375",2020-01-24,0,0,0,0,9.71062371428572,53.4444445555556,104.74458335,953.541667
"456376",2020-01-25,0,0,0,0,13.090885375,54.0509258888889,106.2454166,955.083333
"456377",2020-01-26,0,0,0,0,10.4494047857143,54.1666665,107.4506249,954.125
"456378",2020-01-27,0,0,0,0,4.04497283333333,57.2541667,141.6875,958
"456379",2020-01-28,0,0,0,0,6.8008605625,52.9083334,101.40020845,956.958333
"456380",2020-01-29,0,0,0,0,4.70659716666667,53.2583333,108.96375,954.416667
"456381",2020-01-30,0,0,0,0,3.48005957142857,52.2416666,98.6919021,957.958333
"456382",2020-01-31,0,0,0,0,8.5216145625,52.3666666,102.20354165,963.208333
"456383",2020-02-01,0,0,0,0,13.1172619285714,54.911458375,108.7499999,961.75
"456384",2020-02-02,0,0,0,0,12.60625,57.2791668,105.81625,946.352941
"456385",2020-02-03,0,0,0,0,4.34877458823529,41.475,158.19916665,948.666667
"456386",2020-02-04,0,0,0,0,2.57147446153846,39.7583334,124.341666631579,956.958333
"456387",2020-02-05,0,0,0,0,4.95521478571429,40.7041667,97.09187505,956.75
"456388",2020-02-06,0,0,0,0,9.55396126666667,49.3425926666667,91.9258334,953.5
"456389",2020-02-07,0,0,0,0,11.0976967142857,54.3541667,104.406798210526,953.541667
"456390",2020-02-08,0,0,0,0,12.3601190714286,54.3009258888889,99.99000005,948.083333
"456391",2020-02-09,0,0,0,0,4.69017857142857,49.7685183333333,130.606875,948.083333
"456392",2020-02-10,0,0,0,0,2.33012815384615,52.7124999,109.421929842105,951.458333
"456393",2020-02-11,0,0,0,0,2.97980769230769,49.3958334,102.6325001,955.083333
"456394",2020-02-12,0,0,0,0,7.11474353846154,49.7349637,102.7526359,953.125
"456395",2020-02-13,0,0,0,0,8.01398807142857,50.4958333,97.35916665,954.625
"456396",2020-02-14,0,0,0,0,10.4294871538462,51.7625,100.0085417,954.041667
"456397",2020-02-15,0,0,0,0,11.1705556,53.7416667,103.3418751,955.416667
"456398",2020-02-16,0,0,0,0,13.39629625,56.3148147777778,98.75041665,952.791667
"456399",2020-02-17,0,0,0,0,9.0129166,58.9958334,104.1306249,950.833333
"456400",2020-02-18,0,0,0,0,6.68954376923077,52.3875,85.109375,954.333333
"456401",2020-02-19,0,0,0,0,8.20976971428571,52.8208334,82.0013157368421,955.166667
"456402",2020-02-20,0,0,0,0,5.87976185714286,52.6125001,99.1868421578947,959.708333
"456403",2020-02-21,0,0,0,0,9.2473958125,56.2083334,97.72770845,956.166667
"456404",2020-02-22,0,0,0,0,8.31025638461539,53.0046297777778,84.2597917,950.458333
"456405",2020-02-23,0,0,0,0,5.02708333333333,55.6342593333333,110.4885418,955.333333
"456406",2020-02-24,0,0,0,0,6.88227053333333,57.9750001,95.65458335,957.458333
"456407",2020-02-25,0,0,0,0,4.43333342857143,52.4958333,67.62458335,964.083333
"456408",2020-02-26,0,0,0,0,6.26319441666667,50.8518516666667,90.12708335,965.5
"456409",2020-02-27,0,0,0,0,9.36715682352941,55.7083333,97.30375,961.916667
"456410",2020-02-28,0,0,0,0,7.56869828571429,58.1708335,98.71,958.416667
"456411",2020-02-29,0,0,0,0,10.5327381428571,57.1666667,92.84645835,949.916667
"456412",2020-03-01,0,0,0,0,5.1469444,56.4875001,115.70333325,944.375
"456413",2020-03-02,0,0,0,0,2.32756407692308,52.3958334,126.09875015,950.041667
"456414",2020-03-03,0,0,0,0,2.07640746153846,60.0374999,109.85041665,951.791667
"456415",2020-03-04,0,0,0,0,4.61678741666667,60.7749998,97.27749995,955.166667
"456416",2020-03-05,1,0,1,0,7.44285714285714,61.6666666666667,102.046875,957.291667
"456417",2020-03-06,1,0,0,0,7.32589285714286,64.3666666,87.91520845,951.916667
"456418",2020-03-07,1,0,0,0,6.71785707142857,63.0416666666667,103.56520825,945.875
"456419",2020-03-08,2,0,1,0,3.08910253846154,59.5333333,101.919375,948.25
"456420",2020-03-09,2,0,0,0,3.83044876923077,59.4907406666667,102.21375005,953.916667
"456421",2020-03-10,2,0,0,0,4.011197875,55.5833332,94.80375005,956.708333
"456422",2020-03-11,5,0,3,0,3.90032053846154,59.2685184444444,81.66229165,953.041667
"456423",2020-03-12,8,0,3,0,4.21160714285714,58.9833333,86.2745833,946.625
"456424",2020-03-13,16,0,8,0,1.71215275,55.9583333,102.529824526316,945
"456425",2020-03-14,16,0,0,0,2.611218,58.9666666,95.77375005,950.791667
"456426",2020-03-15,16,0,0,0,3.92202385714286,58.3703703333333,93.4629167,951.583333
"456427",2020-03-16,35,1,19,1,4.23482135714286,57.5375,87.1118751,948.875
"456428",2020-03-17,42,1,7,0,2.43214285714286,53.9083333,104.4222916,946.916667
"456429",2020-03-18,42,1,0,0,2.90993584615385,50.8249999,76.85854175,944.041667
"456430",2020-03-19,69,1,27,0,3.41448138461538,49.9916666,93.75895835,950.791667
"456431",2020-03-20,126,2,57,1,2.95059521428571,50.5208334,58.7370833,957.791667
"456432",2020-03-21,126,2,0,0,3.87692315384615,56.4791665,102.38687495,955.041667
"456433",2020-03-22,126,2,0,0,4.86472226666667,57.7222221111111,85.915625,953.958333
"456434",2020-03-23,126,4,0,2,2.06212127272727,58.4083333,113.3679166,951.083333
"456435",2020-03-24,212,4,86,0,2.57980769230769,59.4458334,97.6064583,951.208333
"456436",2020-03-25,249,6,37,2,3.79613092857143,58.3101853333333,108.9895834,946.5
"456437",2020-03-26,350,6,101,0,2.65693976923077,51.5916668,124.288125,946.125
"456438",2020-03-27,443,10,93,4,2.3113095,51.45,138.6695833,950
"456439",2020-03-28,443,10,0,0,4.54093135294118,55.4041665,87.8764583,954.083333
"456440",2020-03-29,528,14,85,4,4.34166669230769,57.2824073333333,112.5289582,951.083333
"456441",2020-03-30,753,14,225,0,4.18589746153846,59.3083333,102.16439395,956.416667
"456442",2020-03-31,869,23,116,9,5.35386907142857,63.7625,94.4827084,951.708333
"456443",2020-04-01,961,28,92,5,NA,NA,NA,NA
"456444",2020-04-02,1125,34,164,6,NA,NA,NA,NA
"456445",2020-04-03,1279,39,154,5,NA,NA,NA,NA
"456446",2020-04-04,1418,41,139,2,NA,NA,NA,NA
"456447",2020-04-05,1519,41,101,0,NA,NA,NA,NA
"456448",2020-04-06,1608,41,89,0,NA,NA,NA,NA
"456449",2020-04-07,1734,54,126,13,NA,NA,NA,NA')

嘗試:

imputeTS::na_interpolation(nev$temp)
# or
# zoo::na.approx(nev$temp)

現在在值的末尾添加一個值,然后再試一次,看看會發生什么:

imputeTS::na_interpolation(c(nev$temp, 7.293))
# or
# zoo::na.approx(c(nev$temp, 7.293))

您可以使用 mutate_all 在所有列上應用上述內容

library(dplyr)

nev_clean <- nev %>% mutate_all(imputeTS::na_interpolation) 

nev %>% is.na() %>% colSums()
# X        Date.Local  cumulative_cases cumulative_deaths         new_cases        new_deaths 
# 0                 0                 0                 0                 1                 1 
# pm2.5              temp              wind             press 
# 7                 7                 7                 7 

nev_clean %>% is.na() %>% colSums()
# X        Date.Local  cumulative_cases cumulative_deaths         new_cases        new_deaths 
# 0                 0                 0                 0                 0                 0 
# pm2.5              temp              wind             press 
# 0                 0                 0                 0 

有幾個問題:

  • na.approx插值和插值意味着對於數字向量,它使用任何 NA(或 NA 段)兩側的值來填充 NA; 然而,這里的 NA 位於末端,而不是被非 NA 包圍。

  • 插值僅適用於數字列,但列不都是數字。 對於類似的更熟悉的示例, log(nev)也會給出錯誤,因為log需要數字輸入但並非所有輸入都是數字。

假設nev是一個數據框,其中Date列是第一列,則以下任何一項都可用於填充末端的值(或在最后一個中省略它們):

# gives no errors but there is nothing to interpolate so nev unchanged
# This excludes the non-numeric column from the interpolation.
nev[-1] <- na.approx(nev[-1])

# interpolate intermediate values (but thre are none) and extend values at the ends
nev[-1] <- na.approx(nev[-1], rule = 2)

# similar but does not overwrite
replace(nev, -1, na.approx(nev[-1], rule = 2))

# last value carried forward and perform the same in reverse order
na.locf(na.locf(nev), fromLast = TRUE)

# convert to zoo object first
nevz <- read.zoo(nev)
na.approx(nevz, rule = 2)

# use na.fill extending values at the ends
nev[-1] <- na.fill(nev[-1], fill = "extend")

# similar but does not overwrite
replace(nev, -1, na.fill(nev[-1], fill = "extend"))

# use na.fill but fill with zeros
nev[-1] <- na.fill(nev[-1], fill = 0)

# similar but does not overwrite
replace(nev, -1, na.fill(nev[-1], fill = 0))

# same
replace(nev, is.na(nev), 0)

# drop rows with NAs
na.omit(nev)

有關更多信息,請參閱 ?na.approx、?na.fill、?na.locf、?approx。 zoo中還有其他na.*例程: grep("^na\\\\.", ls("package:zoo"), value = TRUE)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM