首页 > 解决方案 > R中的位置缺失值插补

问题描述

我正在 R 中编写一个函数来估算缺失值。我有一个带有时间序列位置(x 和 y)观察的初始数据df框()- 时间戳也可用于缺失的观察。

我创建了另一个数据框,其中包含缺少观察的时段的开始和结束时间戳。(missings

现在我想将丢失的 x 和 y 坐标估算为直线上的等距点——我为此编写了函数func

func <- function(df, missings){  
  for (i in 1:nrow(missings)) {
    Xstart <- df %>%
      filter(TimeStamp == missings[[i,"StartMissing"]]) %>%
      pull(X)
    Xend <- df %>%
      filter(TimeStamp == missings[[i,"EndMissing"]]) %>%
      pull(X)
    n <- df %>%
      filter(TimeStamp > missings[[i,"StartMissing"]] &
             TimeStamp < missings[[i,"EndMissing"]] ) %>%
      nrow()
    
    df <- df %>%
      mutate(X = case_when(
        TimeStamp >  missings[[i,"StartMissing"]] &
        # I think the problem is with the lag() here
        TimeStamp <  missings[[i,"EndMissing"]] ~ lag(X,1) + ((Xend - Xstart) / (n + 1)),
        TRUE ~ X))
  }
  return(df)
}

除了缺失值插补外,这一切都很好。它不使用推算缺失值的滞后值累积到下一个观察值。是否有任何其他功能可以使用(而不是滞后功能)或任何更好的方法来处理这个?

数据 :

missings <- data.frame(
  StartMissing = c(1130.132,1206.398,1217.598,1771.732,1784.531),
  EndMissing = c(1205.865,1207.999,1218.665,1784.531,1785.598)
)

dput(df)
structure(list(TimeStamp = c(1130.665, 1131.199, 1131.732, 1132.265, 
1132.799, 1133.332, 1133.865, 1134.399, 1134.932, 1135.465, 1135.999, 
1136.532, 1137.065, 1137.599, 1138.132, 1138.665, 1139.199, 1139.732, 
1140.265, 1140.799, 1141.332, 1141.865, 1142.399, 1142.932, 1143.465, 
1143.999, 1144.532, 1145.065, 1145.599, 1146.132, 1146.665, 1147.199, 
1147.732, 1148.265, 1148.799, 1149.332, 1149.865, 1150.399, 1150.932, 
1151.465, 1151.999, 1152.532, 1153.065, 1153.599, 1154.132, 1154.665, 
1155.199, 1155.732, 1156.265, 1156.799, 1157.332, 1157.865, 1158.399, 
1158.932, 1159.465, 1159.999, 1160.532, 1161.065, 1161.599, 1162.132, 
1162.665, 1163.199, 1163.732, 1164.265, 1164.799, 1165.332, 1165.865, 
1166.399, 1166.932, 1167.465, 1167.999, 1168.532, 1169.065, 1169.599, 
1170.132, 1170.665, 1171.199, 1171.732, 1172.265, 1172.799, 1173.332, 
1173.865, 1174.399, 1174.932, 1175.465, 1175.999, 1176.532, 1177.065, 
1177.599, 1178.132, 1178.665, 1179.199, 1179.732, 1180.265, 1180.799, 
1181.332, 1181.865, 1182.399, 1182.932, 1183.465, 1183.999, 1184.532, 
1185.065, 1185.599, 1186.132, 1186.665, 1187.199, 1187.732, 1188.265, 
1188.799, 1189.332, 1189.865, 1190.399, 1190.932, 1191.465, 1191.999, 
1192.532, 1193.065, 1193.599, 1194.132, 1194.665, 1195.199, 1195.732, 
1196.265, 1196.799, 1197.332, 1197.865, 1198.399, 1198.932, 1199.465, 
1199.999, 1200.532, 1201.065, 1201.599, 1202.132, 1202.665, 1203.199, 
1203.732, 1204.265, 1204.799, 1205.332, 1205.865, 1206.399, 1206.932, 
1207.465, 1207.999, 1208.532, 1209.065, 1209.599, 1210.132, 1210.665, 
1211.199, 1211.732, 1212.265, 1212.799, 1213.332, 1213.865, 1214.399, 
1214.932, 1215.465, 1215.999, 1216.532, 1217.065, 1217.599, 1218.132
), X = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
517.249, 517.185, NA, NA, 519.711, 516.299, 519.852, 516.178, 
515.869, 518.132, 515.684, 515.559, 515.629, 515.223, 516.892, 
516.89, 517.375, 517.518, 517.909, 516.641, 517.523, 517.887, 
517.636, NA)), row.names = c(NA, -165L), class = c("tbl_df", 
"tbl", "data.frame"))

标签: r

解决方案


我定义了新的+并制作了额外的列。

`%+%` <- function(x, y)  mapply(sum, x, y, MoreArgs = list(na.rm = TRUE))

    func <- function(df, missings){  
  for (i in 1:nrow(missings)) {
    Xstart <- df %>%
      filter(TimeStamp == missings[[i,"StartMissing"]]) %>%
      pull(X)
    Xend <- df %>%
      filter(TimeStamp == missings[[i,"EndMissing"]]) %>%
      pull(X)
    n <- df %>%
      filter(TimeStamp > missings[[i,"StartMissing"]] &
               TimeStamp < missings[[i,"EndMissing"]] ) %>%
      nrow()
    diff_X <- (Xend - Xstart) / (n + 1)
    if (is_empty(diff_X) | n == 0) {
    } else {
      nvec <- seq(diff_X, n*diff_X, by = diff_X)
      
      df <- df %>%
        mutate(Y = case_when(
          TimeStamp >  missings[[i,"StartMissing"]] &
            TimeStamp <  missings[[i,"EndMissing"]] ~ diff_X,
          TRUE ~ X))
      df <- df %>%
        mutate(X = case_when(
          TimeStamp >  missings[[i,"StartMissing"]] &
            TimeStamp <  missings[[i,"EndMissing"]] ~ lag(X,1) %+% cumsum(Y),
          TRUE ~ X)) %>% select(-Y)
        }

  }
  return(df)
}

如果存在Xstart且不Xend存在,它不会填充任何内容,但如果它们存在,它将起作用


推荐阅读