r - copy missing values from other variables

问题

Simple question, but I can't figure out how to do the following. This is my data:

ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA

Now, I want to replace missing values (NA) with the data that is available in other variables. Importantly, I need r to take the value that is 'closest' to the missing data point. E.g., for ID 5, Time1 and Time2 should be "15" (not "22").

Like this:

ID    Time1    Time2    Time3    Time4
01    23       23       23       23  
02    21       21       21       21
03    22       22       25       25
04    29       29       20       20
05    15       15       15       22
06    11       11       11       11

I've tried ifelse statements, but this did not work out.

Thanks!

回答1:

This is much more difficult that it looks. I built a solution that works on one column at a time, taking the pmin() of the absolute distance between all time column indexes and the current column index, stripping NAs with the na.rm=T argument. The result can then be used to index the original time columns using an index matrix, which can then be assigned to the current column index in the target data.frame.

An advantage of this design is that it's fully vectorized over the rows. In other words, it doesn't iterate over one row at a time. This could be an advantage for extremely row-heavy inputs. On the other hand, the solution does involve building matrices that parallel all time columns (timemat, nacols, and off), which could be expensive for large inputs. It's basically trading away memory to save CPU.

I added a couple of rows to test additional cases not covered by the OP's sample data.frame; specifically (1) an all-NA row, and (2) a row with candidate non-NA values on either side of NA values.

Input:

df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    NA    NA
## 2 02    21    21    21    NA
## 3 03    22    22    25    NA
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    NA
## 6 06    NA    NA    15    22
## 7 07    NA    NA    11    NA
## 8 08     1    NA    NA     2

Solution:

ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
##   Time1 Time2 Time3 Time4
## 1     1     2    NA    NA
## 2     1     2     3    NA
## 3     1     2     3    NA
## 4    NA    NA    NA    NA
## 5     1     2     3    NA
## 6    NA    NA     3     4
## 7    NA    NA     3    NA
## 8     1    NA    NA     4
for (ci in seq_len(ncol(timemat))) {
    off <- abs(nacols-ci);
    best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
    df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

Rcpp solution:

library(Rcpp);
cppFunction('
    IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
        IntegerMatrix res(df.nrows(),cis.size());
        if (df.nrows()==0 || cis.size()==0) return res;
        IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
        for (int cisi = 0; cisi < cis0.size(); ++cisi) {
            IntegerVector colCur = df[cis0[cisi]];
            for (int ri = 0; ri < colCur.size(); ++ri) {
                if (!IntegerVector::is_na(colCur[ri])) {
                    res(ri,cisi) = colCur[ri];
                    continue;
                }
                int leftOk;
                int rightOk;
                IntegerVector colLeft;
                IntegerVector colRight;
                bool set = false; // assumption
                for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
                    if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
                        res(ri,cisi) = colLeft[ri];
                        set = true;
                        break;
                    } else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
                        res(ri,cisi) = colRight[ri];
                        set = true;
                        break;
                    }
                }
                if (!set) res(ri,cisi) = NA_INTEGER;
            }
        }
        return res;
    }
');

df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

回答2:

With data.table's rolling joins and set:

library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col

changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]

changes[, {
  set(df, i = row, j = col, value = df[[ col_src ]][row])
  NULL
}, by=.(col,col_src)]

# based on input from bgoldst's answer
   ID  1  2  3  4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08  1  1  2  2

We find all entries to switch and then modify by reference with set. I'm not sure how roll="nearest" handles ties, but I'm sure that can be tweaked.

回答3:

Yet another attempt. Breaking down as much as possible: (1) loop once from left to right carrying the last non-NA value forward and, also, recording where was the non-NA the replaced each NA, (2) loop again from right to left (a) replacing NAs carrying non-NAs backwards and (b) comparing the distance of the non-NA tha replaced each NA to the current non-NA and either keep or replace. Despite the two explicit loops, the computations involve vectors of length == nrow(x).

ff = function(x)
{    
    taken_from = lapply(seq_along(x), rep_len, nrow(x))
    nas = lapply(x, is.na)

    #loop left -> right 
    # carry forward non-NAs and record which non-NA replaced NA
    last_nona = !nas[[1L]]
    for(j in 2:length(x)) {
        i = which(nas[[j]] & last_nona)
        x[[j]][i] = x[[j - 1L]][i]
        taken_from[[j]][i] = taken_from[[j - 1L]][i]
        last_nona = !is.na(x[[j]])
    }

    #loop right -> left
    #if NA and not replace carry the previous non-NA backward
    #else compare which non-NA is nearer and replace appropriately
    last_nona = !nas[[length(x)]]
    for(j in (length(x) - 1L):1L) {
        i1 = which(nas[[j]] & last_nona)
        i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
        ii = i1[j == taken_from[[j]][i1]]
        taken_from[[j]][i] = taken_from[[j + 1L]][i]
        taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
        x[[j]][i] = x[[j + 1L]][i]
        x[[j]][ii] = x[[j + 1L]][ii]
        last_nona = !is.na(x[[j]])
    }

    return(x)
}

Using bgoldst's data:

ff(df[-1L])
#  Time1 Time2 Time3 Time4
#1    23    23    23    23
#2    21    21    21    21
#3    22    22    25    25
#4    NA    NA    NA    NA
#5    29    29    20    20
#6    15    15    15    22
#7    11    11    11    11
#8     1     1     2     2

And the neccessary benchmarking:

set.seed(911)            
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
#   user  system elapsed 
#   0.82    0.38    1.75 
system.time({ ansbgoldst1 = bgoldst1(DAT) })
#   user  system elapsed 
#  20.96    7.53   42.04 
system.time({ ansbgoldst2 = bgoldst2(DAT) })
#   user  system elapsed 
#   0.97    0.25    1.64 
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
#   user  system elapsed 
#   5.84    1.46    8.59 
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE

the functions:

bgoldst1 = function(x)
{
    ris = seq_len(nrow(x))
    xm = as.matrix(x)
    nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
    for(ci in seq_along(x)) {
        off = abs(nacols - ci)
        best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
        x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
    }
    x
}

bgoldst2 = function(x) 
{
    ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
    names(ans) = names(x)
    ans
}

frank = function(x)
{
    x = copy(x)
    good = as.data.table(which(!is.na(x), arr.ind = TRUE))
    all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
    good$col = good$col
    good$col_src = good$col

    changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]

    changes[, {
            set(x, i = row, j = col, value = x[[col_src]][row])
            NULL
            }, by = .(col, col_src)]
    x
}

回答4:

Here is a simple solution:

x <-read.table(text="ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA", header=TRUE)

x <- as.matrix(x[,-1])

dofill <- function(r){
  PREV <- c(NA, suppressWarnings(head(r, -1)))
  NEXT <- c(tail(r,-1), NA)
  r[is.na(r)] <- PREV[is.na(r)]
  r[is.na(r)] <- NEXT[is.na(r)]
  r
}

rlefill <- function(r){
  r[is.na(r)] <- "NA"
  rle1 <- rle(r)
  rle1$values <- dofill(as.numeric(rle1$values))
  inverse.rle(rle1)
}

t(apply(x, 1, rlefill))

dofill simply replaces all NA's with the previous value, and all remaining NA's with next values.

rlefill is needed to transform a sequence of NA's into "one big NA".

Of course, if you have a larger number of time points, you may need something like ...

cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);

... i.e. a more universal solution of extracting the relevant part from the data frame.

(Now I realize that this is not exactly what you asked - my solution always uses the preceding value if it is available, even if the following value is closer in time. It doesn't make a difference in your example data set but it may make difference in real data.)

来源：https://stackoverflow.com/questions/36724309/r-copy-missing-values-from-other-variables

标签

missing-data