Convert a dataframe to presence absence matrix

前端 未结 2 1025
别跟我提以往
别跟我提以往 2020-11-28 16:46

I have a table which has unequal number of element in string format

File1 A  B  C
File2 A  B  D
File3 E  F

I want to convert into a format

相关标签:
2条回答
  • 2020-11-28 17:16

    One possibility:

    library(reshape2)
    df2 <- melt(df, id.var = "V1")
    with(df2, table(V1, value))
    
    #         value
    # V1      A B C D E F
    #   File1 1 1 1 0 0 0
    #   File2 1 1 0 1 0 0
    #   File3 0 0 0 0 1 1
    
    0 讨论(0)
  • 2020-11-28 17:27

    A reasonably efficient approach is to use the (presently) non-exported charMat function from my "splitstackshape" package. Since it's not exported, you will have to use ::: to access it.

    library(splitstackshape)
    cbind(mydata[1], splitstackshape:::charMat(
      split.default(mydata[-1], sequence(ncol(mydata)-1)), fill=0))
    #      V1 V1 A B C D E F
    # 1 File1  0 1 1 1 0 0 0
    # 2 File2  0 1 1 0 1 0 0
    # 3 File3  1 0 0 0 0 1 1
    

    Under the hood, charMat makes use of matrix indexing to process everything pretty efficiently. Step-by-step, this is what charMat does.

    X <- split.default(mydata[-1], sequence(ncol(mydata)-1))
    len <- length(X)
    vec <- unlist(X, use.names=FALSE)
    lvl <- sort(unique(vec))
    out <- matrix(0L, nrow = len, ncol = length(lvl), dimnames = list(NULL, lvl))
    i.idx <- rep(seq.int(len), vapply(X, length, integer(1L)))
    j.idx <- match(vec, lvl)
    out[cbind(i.idx, j.idx)] <- 1
    out
    #        A B C D E F
    # [1,] 0 1 1 1 0 0 0
    # [2,] 0 1 1 0 1 0 0
    # [3,] 1 0 0 0 0 1 1
    

    That looks like a mouthful, but it is actually quite a fast operation, made faster by using the charMat function :-)


    Update: Benchmarks

    The following benchmarks test Henrik's answer with my charMat answer, and also adapts Henrik's answer to use "data.table" instead, for better efficiency.

    Two tests were run. The first is on a similar dataset with 90K rows, and the second on one with 900K rows.

    Here's the sample data:

    biggerdata <- do.call(rbind, replicate(30000, mydata, simplify = FALSE))
    biggerdata$V1 <- make.unique(biggerdata$V1)
    dim(biggerdata)
    # [1] 90000     4
    
    evenBigger <- do.call(rbind, replicate(10, biggerdata, simplify = FALSE))
    evenBigger$V1 <- make.unique(evenBigger$V1)
    dim(evenBigger)
    # [1] 900000      4
    

    Here are the functions to benchmark:

    fun1 <- function(indf) {
      cbind(indf[1], splitstackshape:::charMat(
        split.default(indf[-1], sequence(ncol(indf)-1)), fill=0))
    }
    
    library(reshape2)
    fun2 <- function(indf) {
      df2 <- melt(indf, id.var = "V1")
      with(df2, table(V1, value))
    }
    
    library(data.table)
    library(reshape2)
    DT <- data.table(biggerdata)
    DT2 <- data.table(evenBigger)
    
    fun3 <- function(inDT) {
      DTL <- melt(inDT, id.vars="V1")
      dcast.data.table(DTL, V1 ~ value, fun.aggregate=length)
    }
    

    And the results of the benchmarking.

    library(microbenchmark)
    microbenchmark(fun1(biggerdata), fun2(biggerdata), fun3(DT), times = 20)
    # Unit: milliseconds
    #                        expr       min        lq    median        uq       max neval
    #            fun1(biggerdata)  185.3652  199.8725  289.0206  308.5826  327.4185    20
    #            fun2(biggerdata) 1453.8791 1605.6053 1639.8567 1758.3984 1797.2229    20
    #  suppressMessages(fun3(DT))  469.8979  570.4664  586.4715  598.6229  675.2961    20
    
    microbenchmark(fun1(evenBigger), fun2(evenBigger), fun3(DT2), times = 5)
    # Unit: seconds
    #              expr       min        lq    median        uq       max neval
    #  fun1(evenBigger)  1.871611  1.896351  2.071355  2.140580  2.464569     5
    #  fun2(evenBigger) 26.911523 27.212910 27.363442 27.469812 27.938178     5
    #         fun3(DT2)  7.103615  7.131603  7.141908  7.205006  7.218321     5
    
    0 讨论(0)
提交回复
热议问题