I have a data.table of factor columns, and I want to pull out the label of the last non-missing value in each row. It's kind of a typical `max.col`
situation, bu
Another idea, similar to Frank's, that tries (1) to avoid subsetting 'data.table' rows (which I assume must have some cost) and (2) to avoid checking a vector of length `nrow(dat)` for `NA`s in every iteration.
# For each row, return (as character) the last non-missing value across the
# columns of `x`, scanning from the last column back to the first.
#
# Arguments:
#   x    a list of equal-length columns (e.g. `as.list()` of a data.table);
#        each column is converted with `as.character()`, so factor columns
#        yield their labels.
#   ans  result accumulator, one element per row; defaults to all `NA`.
#   wh   indices of the rows that are still unresolved; defaults to all rows.
#
# Returns a character vector with one element per row. Rows that are missing
# in every column stay `NA` (previously this case raised an error once the
# columns were exhausted).
alex = function(x, ans = rep_len(NA_character_, length(x[[1L]])), wh = seq_len(length(x[[1L]])))
{
    # No columns at all: nothing to scan, and the defaults cannot be evaluated.
    if(!length(x)) return(if(missing(ans)) character(0L) else ans)

    for(j in rev(seq_along(x))) {
        # Stop as soon as every row has been resolved.
        if(!length(wh)) break
        # Only touch the rows that are still unresolved ...
        ans[wh] = as.character(x[[j]])[wh]
        # ... and keep only those that are still missing for the next column.
        wh = wh[is.na(ans[wh])]
    }
    ans
}
# Example call on `dat` (defined outside this excerpt). The columns are passed
# as a plain list because the author had some trouble with 'data.table' subsetting.
alex(as.list(dat))
# [1] "u" "q" "w" "h" "r" "t" "e" "t"
And to compare with Frank's:
# Last non-missing value per row of the data.table `x`, as a character vector.
#
# Side effect: a `res` column is added to `x` by reference via `:=` and is
# left in place; callers that do not want it must remove it themselves.
frank = function(x)
{
    # Start with every row unresolved.
    x[, res := NA_character_]

    # All columns except the final one, visited from right to left.
    candidates = rev(head(names(x), -1L))
    for(v in candidates) {
        # Fill only the rows that are still missing from column `v`.
        x[is.na(res), res := get(v)]
    }

    x[["res"]]
}
# Benchmark data: 100 columns of 3e5 values each. Every column starts with a
# run of NAs (run lengths spread from 0 to 1e4 across the columns) followed by
# random lowercase letters.
DAT1 = as.data.table(
    lapply(
        ceiling(seq(0, 1e4, length.out = 1e2)),
        function(n) {
            c(rep(NA, n), sample(letters, size = 3e5 - n, replace = TRUE))
        }
    )
)

# frank() adds a column by reference, so give it its own copy and drop the
# added `res` column after every timed run.
DAT2 = copy(DAT1)

microbenchmark::microbenchmark(
    alex(as.list(DAT1)),
    { frank(DAT2); DAT2[, res := NULL] },
    times = 30
)
#Unit: milliseconds
# expr min lq median uq max neval
# alex(as.list(DAT1)) 102.9767 108.5134 117.6595 133.1849 166.9594 30
# { frank(DAT2) DAT2[, `:=`(res, NULL)] } 1413.3296 1455.1553 1497.3517 1540.8705 1685.0589 30
# Sanity check: both implementations return the same character vector.
# NOTE: frank() adds a `res` column to DAT2 by reference, and it is not
# removed after this call.
identical(alex(as.list(DAT1)), frank(DAT2))
#[1] TRUE