I have a table which has unequal number of element in string format
File1 A B C
File2 A B D
File3 E F
I want to convert into a format
One possibility:
library(reshape2)
df2 <- melt(df, id.var = "V1")
with(df2, table(V1, value))
# value
# V1 A B C D E F
# File1 1 1 1 0 0 0
# File2 1 1 0 1 0 0
# File3 0 0 0 0 1 1
A reasonably efficient approach is to use the (presently) non-exported charMat
function from my "splitstackshape" package. Since it's not exported, you will have to use :::
to access it.
library(splitstackshape)
cbind(mydata[1], splitstackshape:::charMat(
split.default(mydata[-1], sequence(ncol(mydata)-1)), fill=0))
# V1 V1 A B C D E F
# 1 File1 0 1 1 1 0 0 0
# 2 File2 0 1 1 0 1 0 0
# 3 File3 1 0 0 0 0 1 1
Under the hood, charMat
makes use of matrix indexing to process everything pretty efficiently. Step-by-step, this is what charMat
does.
X <- split.default(mydata[-1], sequence(ncol(mydata)-1))
len <- length(X)
vec <- unlist(X, use.names=FALSE)
lvl <- sort(unique(vec))
out <- matrix(0L, nrow = len, ncol = length(lvl), dimnames = list(NULL, lvl))
i.idx <- rep(seq.int(len), vapply(X, length, integer(1L)))
j.idx <- match(vec, lvl)
out[cbind(i.idx, j.idx)] <- 1
out
# A B C D E F
# [1,] 0 1 1 1 0 0 0
# [2,] 0 1 1 0 1 0 0
# [3,] 1 0 0 0 0 1 1
That looks like a mouthful, but it is actually quite a fast operation, made faster by using the charMat
function :-)
The following benchmarks test Henrik's answer with my charMat
answer, and also adapts Henrik's answer to use "data.table" instead, for better efficiency.
Two tests were run. The first is on a similar dataset with 90K rows, and the second on one with 900K rows.
Here's the sample data:
biggerdata <- do.call(rbind, replicate(30000, mydata, simplify = FALSE))
biggerdata$V1 <- make.unique(biggerdata$V1)
dim(biggerdata)
# [1] 90000 4
evenBigger <- do.call(rbind, replicate(10, biggerdata, simplify = FALSE))
evenBigger$V1 <- make.unique(evenBigger$V1)
dim(evenBigger)
# [1] 900000 4
Here are the functions to benchmark:
fun1 <- function(indf) {
cbind(indf[1], splitstackshape:::charMat(
split.default(indf[-1], sequence(ncol(indf)-1)), fill=0))
}
library(reshape2)
fun2 <- function(indf) {
df2 <- melt(indf, id.var = "V1")
with(df2, table(V1, value))
}
library(data.table)
library(reshape2)
DT <- data.table(biggerdata)
DT2 <- data.table(evenBigger)
fun3 <- function(inDT) {
DTL <- melt(inDT, id.vars="V1")
dcast.data.table(DTL, V1 ~ value, fun.aggregate=length)
}
And the results of the benchmarking.
library(microbenchmark)
microbenchmark(fun1(biggerdata), fun2(biggerdata), fun3(DT), times = 20)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1(biggerdata) 185.3652 199.8725 289.0206 308.5826 327.4185 20
# fun2(biggerdata) 1453.8791 1605.6053 1639.8567 1758.3984 1797.2229 20
# suppressMessages(fun3(DT)) 469.8979 570.4664 586.4715 598.6229 675.2961 20
microbenchmark(fun1(evenBigger), fun2(evenBigger), fun3(DT2), times = 5)
# Unit: seconds
# expr min lq median uq max neval
# fun1(evenBigger) 1.871611 1.896351 2.071355 2.140580 2.464569 5
# fun2(evenBigger) 26.911523 27.212910 27.363442 27.469812 27.938178 5
# fun3(DT2) 7.103615 7.131603 7.141908 7.205006 7.218321 5