I am using R and RStudio for the first time to work with a very large dataset (15 million cases) with many columns of data. To facilitate analysis, I need to search a range of diagnosis columns (Dx1, Dx2, Dx3, ...) for each unique code and create an indicator column flagging whether that code appears anywhere in the row.
We can use mtabulate from qdapTools:
library(qdapTools)
# Transpose so each original row becomes a column, tabulate the code counts
# per row, then turn the counts into logical indicators
res <- cbind(df1, mtabulate(as.data.frame(t(df1))) != 0)
row.names(res) <- NULL
names(res)[-(1:3)] <- paste0("Var", names(res)[-(1:3)])
res
#  Dx1 Dx2 Dx3 Var001 Var234 Var456 Var231 Var444 Var245 Var777
#1 001 234 456   TRUE   TRUE   TRUE  FALSE  FALSE  FALSE  FALSE
#2 231 001 444   TRUE  FALSE  FALSE   TRUE   TRUE  FALSE  FALSE
#3 245 777 001   TRUE  FALSE  FALSE  FALSE  FALSE   TRUE   TRUE
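To unpack the one-liner, these are the intermediate steps as I understand mtabulate's behaviour (a sketch, worth running on your own data):
step1 <- as.data.frame(t(df1))  # one column (V1, V2, V3) per original row of df1
step2 <- mtabulate(step1)       # counts of each code within each original row
step2 != 0                      # logical: does the code appear in that row at all?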
The example data:
df1 <- structure(list(Dx1 = c("001", "231", "245"), Dx2 = c("234", "001",
"777"), Dx3 = c("456", "444", "001")), .Names = c("Dx1", "Dx2",
"Dx3"), row.names = c(NA, -3L), class = "data.frame")
In base R, we can implement a reusable function in only a few lines of code, though it requires some knowledge of which functions to use and how. I'll call the function bag, as in bag-of-words.
bag <- function(..., prefix=".", levels=NULL, `NA`=NULL) {
  # Go from multiple columns to a list of per-row vectors
  bags <- mapply(c, ..., SIMPLIFY = FALSE, USE.NAMES = FALSE)
  # Find unique levels
  if (is.null(levels)) {
    levels <- sort(Reduce(union, bags))
    # names persist through outer
    names(levels) <- paste0(prefix, levels)
  }
  # Calculate out[level, bag] = level %in% bag
  out <- outer(levels, bags, Vectorize(`%in%`))
  # Output a data structure
  structure(+t(out), class='bag', levels=levels)
}
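The heavy lifting happens in the outer() call: Vectorize(`%in%`) turns %in% into a function that outer() can apply to every (level, row) pair, and the names on levels carry through as row names. A tiny hand-run illustration of just that step:
lv <- c(Var001 = "001", Var234 = "234")
bg <- list(c("001", "234", "456"), c("231", "001", "444"))
outer(lv, bg, Vectorize(`%in%`))
#         [,1]  [,2]
# Var001  TRUE  TRUE
# Var234  TRUE FALSE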
This function lets us do:
with(df1, bag(Dx1, Dx2, Dx3, prefix="Var"))
#> Var001 Var231 Var234 Var245 Var444 Var456 Var777
#> [1,] 1 0 1 0 0 1 0
#> [2,] 1 1 0 0 1 0 0
#> [3,] 1 0 0 1 0 0 1
#> attr(,"class")
#> [1] "bag"
#> attr(,"levels")
#> Var001 Var231 Var234 Var245 Var444 Var456 Var777
#> "001" "231" "234" "245" "444" "456" "777"
This is probably not very performant, but it works. I've changed the output format from logical to numeric and included some metadata to make it easier to use in a model. We can add a function to enable modeling with bag directly:
#' @export
makepredictcall.bag <- function(var, call) {
  # Stolen from the splines package
  if (as.character(call)[1L] != "bag")
    return(call)
  args <- c("prefix", "levels")
  at <- attributes(var)[args]
  xxx <- call
  xxx[args] <- NULL
  xxx[names(at)] <- at
  xxx
}
Now, you can use it directly in a model formula. The model-frame machinery calls makepredictcall() on each variable and stores the rewritten call in the terms' predvars attribute, so the dummy coding is incorporated into the model and you won't need to preprocess when predicting on new data sets. Example:
df2 <- as.data.frame(lapply(df1, sample, 20, TRUE), stringsAsFactors = FALSE)
df3 <- as.data.frame(lapply(df1, sample, 20, TRUE), stringsAsFactors = FALSE)
Y <- 1:nrow(df2)
m <- lm(Y~bag(Dx1, Dx2, Dx3), df2)
summary(m)
#>
#> Call:
#> lm(formula = Y ~ bag(Dx1, Dx2, Dx3), data = df2)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -8.1110 -3.6765 0.1948 3.1899 8.7961
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 16.6709 10.3948 1.604 0.135
#> bag(Dx1, Dx2, Dx3).001 -3.7385 5.6141 -0.666 0.518
#> bag(Dx1, Dx2, Dx3).231 -3.7286 4.1728 -0.894 0.389
#> bag(Dx1, Dx2, Dx3).234 3.1786 4.6528 0.683 0.507
#> bag(Dx1, Dx2, Dx3).245 -7.2493 4.4900 -1.615 0.132
#> bag(Dx1, Dx2, Dx3).444 -2.2936 4.3033 -0.533 0.604
#> bag(Dx1, Dx2, Dx3).456 2.9979 4.3826 0.684 0.507
#> bag(Dx1, Dx2, Dx3).777 -0.8608 4.5353 -0.190 0.853
#>
#> Residual standard error: 5.971 on 12 degrees of freedom
#> Multiple R-squared: 0.3566, Adjusted R-squared: -0.01874
#> F-statistic: 0.9501 on 7 and 12 DF, p-value: 0.5056
predict(m, df3)
#> 1 2 3 4 5 6 7
#> 8.681003 16.111016 4.822329 15.079445 19.108899 10.306611 13.817465
#> 8 9 10 11 12 13 14
#> 16.111016 9.788011 12.382454 9.778103 3.389569 12.382454 9.203882
#> 15 16 17 18 19 20
#> 13.817465 9.788011 12.071654 6.267249 13.827373 15.069537
Created on 2019-08-06 by the reprex package (v0.3.0)
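To confirm that the training levels really were baked in, you can inspect the predvars attribute of the model's terms, which holds the rewritten calls that predict() replays on new data (a quick check; the commented output is what I'd expect rather than verified):
attr(terms(m), "predvars")
# should show something like bag(Dx1, Dx2, Dx3, levels = c(...)) with the
# training codes filled in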
EDIT: And some benchmarks for comparison:
microbenchmark::microbenchmark(
  mtab   = mtabulate(as.data.frame(t(df1))) != 0,
  lapply = lapply(as.character(unique(reshape2::melt(df1, id.vars = NULL)$value)),
                  function(x) rowSums(df1 == x) > 0),
  bag    = do.call(bag, df1)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> mtab 439.320 452.107 519.9429 462.9035 511.8710 1960.582 100
#> lapply 276.914 295.976 337.6020 300.7870 315.0135 2268.210 100
#> bag 121.996 130.305 146.6677 139.6990 145.3275 294.711 100
Another idea using base R with lapply (plus melt() from reshape2 to collect the unique codes):
library(reshape2)
# Collect every distinct code across the Dx columns
uniq_dxs <- as.character(unique(melt(df1, id.vars = NULL)$value))
# For each code, flag the rows where it appears in any Dx column
df1[, paste0("var", uniq_dxs)] <- lapply(uniq_dxs, function(x) rowSums(df1 == x) > 0)
df1
#  Dx1 Dx2 Dx3 var001 var231 var245 var234 var777 var456 var444
#1 001 234 456   TRUE  FALSE  FALSE   TRUE  FALSE   TRUE  FALSE
#2 231 001 444   TRUE   TRUE  FALSE  FALSE  FALSE  FALSE   TRUE
#3 245 777 001   TRUE  FALSE   TRUE  FALSE   TRUE  FALSE  FALSE
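The core of this approach is that df1 == x compares every cell against one code at a time and rowSums() then counts the matches per row. Spelled out for a single code, on the original three Dx columns (a sketch):
x <- "001"
df1[c("Dx1", "Dx2", "Dx3")] == x               # logical matrix: which cells equal the code
rowSums(df1[c("Dx1", "Dx2", "Dx3")] == x) > 0  # TRUE if the code appears anywhere in the row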
A benchmark on my machine, since I was curious. I just wanted to compare the mtabulate approach to the lapply one, not including the assignment (<-):
microbenchmark::microbenchmark(
  mtab   = mtabulate(as.data.frame(t(df1))) != 0,
  lapply = lapply(uniq_dxs, function(x) rowSums(df1 == x) > 0)
)
Unit: microseconds
  expr      min        lq      mean   median       uq      max neval
  mtab 1039.317 1088.9120 1182.3375 1109.334 1145.255 5931.031   100
lapply  742.838  795.7155  823.7991  813.220  843.488 1034.211   100