Is there an elegant/fastR way to combine all pairs of columns in a data.frame?
For example, using mapply()
and paste()
we can turn this data.fr
I'm not sure this is the best approach. See if the below code gives any speed improvement
require(dplyr)
transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2))
Answer updated based on comment :-)
It's amusing to note that the OP's solution appears to be the fastest one:
f1 <- function(mydf) {
mapply(function(x, y) {
paste(x, y, sep = ".")},
mydf[ ,seq(1, ncol(mydf), by = 2)],
mydf[ ,seq(2, ncol(mydf), by = 2)])
}
f.thelatemail <- function(mydf) {
mapply(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep=".")
}
require(dplyr)
f.on_the_shores_of_linux_sea <- function(mydf) {
transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2))
}
f.jazurro <- function(mydf) {
odd <- seq(1, ncol(mydf), 2);
lapply(odd, function(x) paste(mydf[,x], mydf[,x+1], sep = ".")) %>%
do.call(cbind,.)
}
library(data.table)
f.akrun <- function(mydf) {
res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
indx <- seq(1, ncol(mydf), 2)
setDT(mydf)
for(j in seq_along(indx)){
set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]],
mydf[[indx[j]+1]], sep='.'))
}
res
}
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf <- mydf[rep(1:nrow(mydf),5000),]
library(rbenchmark)
benchmark(f1(mydf),f.thelatemail(mydf),f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf))
Results:
# test replications elapsed relative user.self sys.self user.child sys.child
# 5 f.akrun(mydf) 100 14.000 75.269 13.673 0.296 0 0
# 4 f.jazurro(mydf) 100 0.388 2.086 0.314 0.071 0 0
# 3 f.on_the_shores_of_linux_sea(mydf) 100 15.585 83.790 15.293 0.280 0 0
# 2 f.thelatemail(mydf) 100 26.416 142.022 25.736 0.639 0 0
# 1 f1(mydf) 100 0.186 1.000 0.169 0.017 0 0
[Updated Benchmark]
I've added one solution from @thelatemail, which I missed in the original answer, and one solution from @akrun:
f.thelatemail2 <- function(mydf) {
data.frame(Map(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep="."))
}
f.akrun2 <- function(mydf) {
setDT(mydf)
indx <- as.integer(seq(1, ncol(mydf), 2))
mydf2 <- copy(mydf)
for(j in indx){
set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
mydf2[[j+1]], sep="."))
}
mydf2[,indx, with=FALSE]
}
Benchmark:
library(rbenchmark)
benchmark(f1(mydf),f.thelatemail(mydf), f.thelatemail2(mydf), f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf),f.akrun2(mydf))
# test replications elapsed relative user.self sys.self user.child sys.child
# 6 f.akrun(mydf) 100 13.247 69.356 12.897 0.340 0 0
# 7 f.akrun2(mydf) 100 12.746 66.733 12.405 0.339 0 0
# 5 f.jazurro(mydf) 100 0.327 1.712 0.254 0.073 0 0
# 4 f.on_the_shores_of_linux_sea(mydf) 100 16.347 85.586 15.838 0.445 0 0
# 2 f.thelatemail(mydf) 100 26.307 137.733 25.536 0.708 0 0
# 3 f.thelatemail2(mydf) 100 15.938 83.445 15.136 0.750 0 0
# 1 f1(mydf) 100 0.191 1.000 0.156 0.036 0 0
An option using set
from data.table
. It should be fast for large datasets as it modifies by reference and the overhead of [.data.table
is avoided. Assuming that the columns are ordered for each pair of columns.
library(data.table)
res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
indx <- seq(1, ncol(mydf), 2)
setDT(mydf)
for(j in seq_along(indx)){
set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]],
mydf[[indx[j]+1]], sep='.'))
}
head(res)
# V1 V2
#1: a.26 a.1
#2: b.25 b.2
#3: c.24 c.3
#4: d.23 d.4
#5: e.22 e.5
#6: f.21 f.6
Instead of creating a new result dataset, we can also update the same or a copy of the original dataset. There will be some warnings about type conversion, but I guess this would be a bit faster (not benchmarked)
setDT(mydf)
mydf2 <- copy(mydf)
for(j in indx){
set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
mydf2[[j+1]], sep="."))
}
mydf2[,indx, with=FALSE]
I tried the benchmarks on a slightly bigger data with many columns.
set.seed(24)
d1 <- as.data.frame(matrix(sample(letters,500*10000, replace=TRUE),
ncol=500), stringsAsFactors=FALSE)
set.seed(4242)
d2 <- as.data.frame(matrix(sample(1:200,500*10000,
replace=TRUE), ncol=500))
d3 <- cbind(d1,d2)
mydf <- d3[,order(c(1:ncol(d1), 1:ncol(d2)))]
mydf1 <- copy(mydf)
Compared f1
, f.jazurro
(fastest) (from @Marat Talipov's post) with f.akrun2
microbenchmark(f1(mydf), f.jazurro(mydf), f.akrun2(mydf1),
unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval
# f1(mydf) 3.420448 2.3217708 2.714495 2.653178 2.819952 2.736376 20
#f.jazurro(mydf) 1.000000 1.0000000 1.000000 1.000000 1.000000 1.000000 20
#f.akrun2(mydf1) 1.204488 0.8015648 1.031248 1.042262 1.097136 1.066671 20
#cld
#b
#a
#a
In this, f.jazurro
is slighly better than f.akrun2
. I think if I increase the group size, nrows etc, it would be an interesting comparison