Speedy/elegant way to unite many pairs of columns

后端 未结 3 1500
臣服心动
臣服心动 2021-01-25 05:00

Is there an elegant/fastR way to combine all pairs of columns in a data.frame?

For example, using mapply() and paste() we can turn this data.fr

相关标签:
3条回答
  • 2021-01-25 05:14

    I'm not sure this is the best approach. See if the below code gives any speed improvement

    require(dplyr)
    transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2)) 
    

    Answer updated based on comment :-)

    0 讨论(0)
  • 2021-01-25 05:17

    It's amusing to note that the OP's solution appears to be the fastest one:

    f1 <- function(mydf) {
        mapply(function(x, y) {
            paste(x, y, sep = ".")},
            mydf[ ,seq(1, ncol(mydf), by = 2)],
            mydf[ ,seq(2, ncol(mydf), by = 2)])
    }
    
    f.thelatemail <- function(mydf) {
        mapply(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep=".")
    }
    
    require(dplyr)
    
    f.on_the_shores_of_linux_sea <- function(mydf) {
        transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2)) 
    }
    
    f.jazurro <- function(mydf) {
        odd <- seq(1, ncol(mydf), 2);
        lapply(odd, function(x) paste(mydf[,x], mydf[,x+1], sep = ".")) %>% 
            do.call(cbind,.)
    }
    
    library(data.table) 
    f.akrun <- function(mydf) {
        res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
        indx <- seq(1, ncol(mydf), 2)
        setDT(mydf)
        for(j in seq_along(indx)){
            set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]], 
                                               mydf[[indx[j]+1]], sep='.'))
        }
        res
    }
    
    mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
    mydf <- mydf[rep(1:nrow(mydf),5000),]
    
    
    library(rbenchmark)
    benchmark(f1(mydf),f.thelatemail(mydf),f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf))
    

    Results:

    #                                 test replications elapsed relative user.self sys.self user.child sys.child
    # 5                      f.akrun(mydf)          100  14.000   75.269    13.673    0.296          0         0
    # 4                    f.jazurro(mydf)          100   0.388    2.086     0.314    0.071          0         0
    # 3 f.on_the_shores_of_linux_sea(mydf)          100  15.585   83.790    15.293    0.280          0         0
    # 2                f.thelatemail(mydf)          100  26.416  142.022    25.736    0.639          0         0
    # 1                           f1(mydf)          100   0.186    1.000     0.169    0.017          0         0
    

    [Updated Benchmark]

    I've added one solution from @thelatemail, which I missed in the original answer, and one solution from @akrun:

    f.thelatemail2 <- function(mydf) {
        data.frame(Map(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep="."))
    }
    
    f.akrun2 <- function(mydf) {    
        setDT(mydf)
        indx <- as.integer(seq(1, ncol(mydf), 2))
        mydf2 <- copy(mydf)
        for(j in indx){
            set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
                                                 mydf2[[j+1]], sep="."))
        }
        mydf2[,indx, with=FALSE]
    }
    

    Benchmark:

    library(rbenchmark)
    
    benchmark(f1(mydf),f.thelatemail(mydf), f.thelatemail2(mydf), f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf),f.akrun2(mydf))
    #                                 test replications elapsed relative user.self sys.self user.child sys.child
    # 6                      f.akrun(mydf)          100  13.247   69.356    12.897    0.340          0         0
    # 7                     f.akrun2(mydf)          100  12.746   66.733    12.405    0.339          0         0
    # 5                    f.jazurro(mydf)          100   0.327    1.712     0.254    0.073          0         0
    # 4 f.on_the_shores_of_linux_sea(mydf)          100  16.347   85.586    15.838    0.445          0         0
    # 2                f.thelatemail(mydf)          100  26.307  137.733    25.536    0.708          0         0
    # 3               f.thelatemail2(mydf)          100  15.938   83.445    15.136    0.750          0         0
    # 1                           f1(mydf)          100   0.191    1.000     0.156    0.036          0         0
    
    0 讨论(0)
  • 2021-01-25 05:20

    An option using set from data.table. It should be fast for large datasets as it modifies by reference and the overhead of [.data.table is avoided. Assuming that the columns are ordered for each pair of columns.

    library(data.table)
    res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
    indx <- seq(1, ncol(mydf), 2)
    setDT(mydf)
    for(j in seq_along(indx)){
       set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]], 
                               mydf[[indx[j]+1]], sep='.'))
     }
    head(res)
    #    V1  V2
    #1: a.26 a.1
    #2: b.25 b.2
    #3: c.24 c.3
    #4: d.23 d.4
    #5: e.22 e.5
    #6: f.21 f.6
    

    Instead of creating a new result dataset, we can also update the same or a copy of the original dataset. There will be some warnings about type conversion, but I guess this would be a bit faster (not benchmarked)

    setDT(mydf)
    mydf2 <- copy(mydf)
    for(j in indx){
      set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
       mydf2[[j+1]], sep="."))
     }
     mydf2[,indx, with=FALSE]
    

    Benchmarks

    I tried the benchmarks on a slightly bigger data with many columns.

    data

    set.seed(24)
    d1 <- as.data.frame(matrix(sample(letters,500*10000, replace=TRUE), 
        ncol=500), stringsAsFactors=FALSE)
    set.seed(4242)
    d2 <- as.data.frame(matrix(sample(1:200,500*10000,
                replace=TRUE), ncol=500))
    d3 <- cbind(d1,d2)
    mydf <- d3[,order(c(1:ncol(d1), 1:ncol(d2)))]
    mydf1 <- copy(mydf) 
    

    Compared f1, f.jazurro (fastest) (from @Marat Talipov's post) with f.akrun2

       microbenchmark(f1(mydf), f.jazurro(mydf), f.akrun2(mydf1),
             unit='relative', times=20L)
       #Unit: relative
       #        expr      min        lq     mean   median       uq      max neval
       #      f1(mydf) 3.420448 2.3217708 2.714495 2.653178 2.819952 2.736376    20
       #f.jazurro(mydf) 1.000000 1.0000000 1.000000 1.000000 1.000000 1.000000    20
       #f.akrun2(mydf1) 1.204488 0.8015648 1.031248 1.042262 1.097136 1.066671    20
       #cld
       #b
       #a 
       #a 
    

    In this, f.jazurro is slighly better than f.akrun2. I think if I increase the group size, nrows etc, it would be an interesting comparison

    0 讨论(0)
提交回复
热议问题