问题
Simple question, but I can't figure out how to do the following. This is my data:
ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA
Now, I want to replace missing values (NA) with the data that is available in other variables. Importantly, I need r to take the value that is 'closest' to the missing data point. E.g., for ID 5, Time1 and Time2 should be "15" (not "22").
Like this:
ID Time1 Time2 Time3 Time4
01 23 23 23 23
02 21 21 21 21
03 22 22 25 25
04 29 29 20 20
05 15 15 15 22
06 11 11 11 11
I've tried ifelse statements, but this did not work out.
Thanks!
回答1:
This is much more difficult that it looks. I built a solution that works on one column at a time, taking the pmin() of the absolute distance between all time column indexes and the current column index, stripping NAs with the na.rm=T
argument. The result can then be used to index the original time columns using an index matrix, which can then be assigned to the current column index in the target data.frame.
An advantage of this design is that it's fully vectorized over the rows. In other words, it doesn't iterate over one row at a time. This could be an advantage for extremely row-heavy inputs. On the other hand, the solution does involve building matrices that parallel all time columns (timemat
, nacols
, and off
), which could be expensive for large inputs. It's basically trading away memory to save CPU.
I added a couple of rows to test additional cases not covered by the OP's sample data.frame; specifically (1) an all-NA row, and (2) a row with candidate non-NA values on either side of NA values.
Input:
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 NA NA
## 2 02 21 21 21 NA
## 3 03 22 22 25 NA
## 4 04 NA NA NA NA
## 5 05 29 29 20 NA
## 6 06 NA NA 15 22
## 7 07 NA NA 11 NA
## 8 08 1 NA NA 2
Solution:
ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
## Time1 Time2 Time3 Time4
## 1 1 2 NA NA
## 2 1 2 3 NA
## 3 1 2 3 NA
## 4 NA NA NA NA
## 5 1 2 3 NA
## 6 NA NA 3 4
## 7 NA NA 3 NA
## 8 1 NA NA 4
for (ci in seq_len(ncol(timemat))) {
off <- abs(nacols-ci);
best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
Rcpp solution:
library(Rcpp);
cppFunction('
IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
IntegerMatrix res(df.nrows(),cis.size());
if (df.nrows()==0 || cis.size()==0) return res;
IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
for (int cisi = 0; cisi < cis0.size(); ++cisi) {
IntegerVector colCur = df[cis0[cisi]];
for (int ri = 0; ri < colCur.size(); ++ri) {
if (!IntegerVector::is_na(colCur[ri])) {
res(ri,cisi) = colCur[ri];
continue;
}
int leftOk;
int rightOk;
IntegerVector colLeft;
IntegerVector colRight;
bool set = false; // assumption
for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
res(ri,cisi) = colLeft[ri];
set = true;
break;
} else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
res(ri,cisi) = colRight[ri];
set = true;
break;
}
}
if (!set) res(ri,cisi) = NA_INTEGER;
}
}
return res;
}
');
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
回答2:
With data.table's rolling joins and set
:
library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]
changes[, {
set(df, i = row, j = col, value = df[[ col_src ]][row])
NULL
}, by=.(col,col_src)]
# based on input from bgoldst's answer
ID 1 2 3 4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08 1 1 2 2
We find all entries to switch and then modify by reference with set
. I'm not sure how roll="nearest"
handles ties, but I'm sure that can be tweaked.
回答3:
Yet another attempt. Breaking down as much as possible: (1) loop once from left to right carrying the last non-NA value forward and, also, recording where was the non-NA the replaced each NA, (2) loop again from right to left (a) replacing NAs carrying non-NAs backwards and (b) comparing the distance of the non-NA tha replaced each NA to the current non-NA and either keep or replace. Despite the two explicit loops, the computations involve vectors of length == nrow(x)
.
ff = function(x)
{
taken_from = lapply(seq_along(x), rep_len, nrow(x))
nas = lapply(x, is.na)
#loop left -> right
# carry forward non-NAs and record which non-NA replaced NA
last_nona = !nas[[1L]]
for(j in 2:length(x)) {
i = which(nas[[j]] & last_nona)
x[[j]][i] = x[[j - 1L]][i]
taken_from[[j]][i] = taken_from[[j - 1L]][i]
last_nona = !is.na(x[[j]])
}
#loop right -> left
#if NA and not replace carry the previous non-NA backward
#else compare which non-NA is nearer and replace appropriately
last_nona = !nas[[length(x)]]
for(j in (length(x) - 1L):1L) {
i1 = which(nas[[j]] & last_nona)
i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
ii = i1[j == taken_from[[j]][i1]]
taken_from[[j]][i] = taken_from[[j + 1L]][i]
taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
x[[j]][i] = x[[j + 1L]][i]
x[[j]][ii] = x[[j + 1L]][ii]
last_nona = !is.na(x[[j]])
}
return(x)
}
Using bgoldst's data:
ff(df[-1L])
# Time1 Time2 Time3 Time4
#1 23 23 23 23
#2 21 21 21 21
#3 22 22 25 25
#4 NA NA NA NA
#5 29 29 20 20
#6 15 15 15 22
#7 11 11 11 11
#8 1 1 2 2
And the neccessary benchmarking:
set.seed(911)
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
# user system elapsed
# 0.82 0.38 1.75
system.time({ ansbgoldst1 = bgoldst1(DAT) })
# user system elapsed
# 20.96 7.53 42.04
system.time({ ansbgoldst2 = bgoldst2(DAT) })
# user system elapsed
# 0.97 0.25 1.64
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
# user system elapsed
# 5.84 1.46 8.59
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE
the functions:
bgoldst1 = function(x)
{
ris = seq_len(nrow(x))
xm = as.matrix(x)
nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
for(ci in seq_along(x)) {
off = abs(nacols - ci)
best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
}
x
}
bgoldst2 = function(x)
{
ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
names(ans) = names(x)
ans
}
frank = function(x)
{
x = copy(x)
good = as.data.table(which(!is.na(x), arr.ind = TRUE))
all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
good$col = good$col
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]
changes[, {
set(x, i = row, j = col, value = x[[col_src]][row])
NULL
}, by = .(col, col_src)]
x
}
回答4:
Here is a simple solution:
x <-read.table(text="ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA", header=TRUE)
x <- as.matrix(x[,-1])
dofill <- function(r){
PREV <- c(NA, suppressWarnings(head(r, -1)))
NEXT <- c(tail(r,-1), NA)
r[is.na(r)] <- PREV[is.na(r)]
r[is.na(r)] <- NEXT[is.na(r)]
r
}
rlefill <- function(r){
r[is.na(r)] <- "NA"
rle1 <- rle(r)
rle1$values <- dofill(as.numeric(rle1$values))
inverse.rle(rle1)
}
t(apply(x, 1, rlefill))
dofill
simply replaces all NA's with the previous value, and all remaining NA's with next values.
rlefill
is needed to transform a sequence of NA's into "one big NA".
Of course, if you have a larger number of time points, you may need something like ...
cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);
... i.e. a more universal solution of extracting the relevant part from the data frame.
(Now I realize that this is not exactly what you asked - my solution always uses the preceding value if it is available, even if the following value is closer in time. It doesn't make a difference in your example data set but it may make difference in real data.)
来源:https://stackoverflow.com/questions/36724309/r-copy-missing-values-from-other-variables