Interpolate NA values

后端 未结 3 1920
不思量自难忘°
不思量自难忘° 2020-11-27 19:37

I have two set of samples that are time independent. I would like to merge them and calculate the missing values for the times where I do not have values of both. Simplified

相关标签:
3条回答
  • 2020-11-27 20:13

    Using the zoo package:

    library(zoo)
    Cz <- zoo(C)
    index(Cz) <- Cz[,1]
    Cz_approx <- na.approx(Cz)
    
    0 讨论(0)
  • 2020-11-27 20:16

    An ugly and probably inefficient Base R solution:

    # Data provided:
    A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
               Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
    B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
    C <- merge(A,B, all=TRUE)
    
    # Scalar valued at the minimum time difference: -> min_time_diff
    
    min_time_diff <- min(diff(C$time))
    
    # Adjust frequency of the series to hold all steps in range: -> df
    
    df <- merge(C, 
                data.frame(time = seq(min_time_diff, 
                                     max(C$time), 
                                     by = min_time_diff)),
               by = "time",
               all = TRUE)
    
    
    
    # Linear interpolation function handling ties,
    # returns interpolated vector the same length 
    # a the input vector: -> vector
    
    l_interp_vec <- function(na_vec){
    
      approx(x = na_vec,
    
             method = "linear",
    
             ties = "constant",
    
             n = length(na_vec))$y
    
    }
    
    # Applied to a dataframe, replacing NA values
    # in each of the numeric vectors, 
    # with interpolated values. 
    # input is dataframe: -> dataframe()
    
    interped_df <- data.frame(lapply(df, function(x){
    
          if(is.numeric(x)){
    
            # Store a scalar of min row where x isn't NA: -> min_non_na
    
            min_non_na <- min(which(!(is.na(x))))
    
            # Store a scalar of max row where x isn't NA: -> max_non_na
    
            max_non_na <- max(which(!(is.na(x))))
    
            # Store scalar of the number of rows needed to impute prior 
            # to first NA value: -> ru_lower
    
            ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
    
            # Store scalar of the number of rows needed to impute after
            # the last non-NA value: -> ru_lower
    
            ru_upper <- ifelse(max_non_na == length(x), 
    
                               length(x) - 1, 
    
                               (length(x) - (max_non_na + 1)))
    
            # Store a vector of the ramp to function: -> l_ramp_up: 
    
            ramp_up <- as.numeric(
              cumsum(rep(x[min_non_na]/(min_non_na), ru_lower))
              )
    
            # Apply the interpolation function on vector "x": -> y
    
            y <- as.numeric(l_interp_vec(as.numeric(x[min_non_na:max_non_na])))
    
            # Create a vector that combines the ramp_up vector 
            # and y if the first NA is at row 1: -> z
    
            if(length(ramp_up) > 1 & max_non_na != length(x)){
    
              # Create a vector interpolations if there are 
              # multiple NA values after the last value: -> lower_l_int
    
              lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
                                                   ru_upper+1)) +
                                      as.numeric(x[max_non_na]))
    
              # Store the linear interpolations in  a vector: -> z
    
              z <- as.numeric(c(ramp_up, y, lower_l_int))
    
            }else if(length(ramp_up) > 1 & max_non_na == length(x)){
    
              # Store the linear interpolations in  a vector: -> z
    
              z <- as.numeric(c(ramp_up, y))
    
            }else if(min_non_na == 1 & max_non_na != length(x)){
    
              # Create a vector interpolations if there are 
              # multiple NA values after the last value: -> lower_l_int
    
              lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
                                                   ru_upper+1)) +
                                      as.numeric(x[max_non_na]))
    
    
              # Store the linear interpolations in  a vector: -> z
    
              z <- as.numeric(c(y, lower_l_int))
    
            }else{
    
              # Store the linear interpolations in  a vector: -> z
    
              z <- as.numeric(y)
    
            }
    
            # Interpolate between points in x, return new x:
    
            return(as.numeric(ifelse(is.na(x), z, x)))
    
          }else{
    
            x
    
          }
    
        }
    
      )
    
    )
    
    # Subset interped df to only contain 
    # the time values in C, store a data frame: -> int_df_subset
    
    int_df_subset <- interped_df[interped_df$time %in% C$time,]
    
    0 讨论(0)
  • 2020-11-27 20:22

    The proper way to do this statistically and still get valid confidence intervals is to use Multiple Imputation. See Rubin's classic book, and there's an excellent R package for this (mi).

    0 讨论(0)
提交回复
热议问题