I have a list of vectors as follows.
data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"),
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))
Here's another approach using only base R.
Updated after akrun's comment and with his sample data:
data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))
Modified function:
x <- lapply(seq_along(data), function(i) {
  if (!any(data[[i]] %in% unlist(data[-i]))) {
    # shares nothing with the other vectors: keep it as its own group
    data[[i]]
  } else if (any(data[[i]] %in% unlist(data[seq_len(i - 1)]))) {
    # already covered by an earlier vector: drop to avoid duplicate groups
    NULL
  } else {
    # merge with all later vectors that share at least one element
    z <- lapply(data[-seq_len(i)], intersect, data[[i]])
    z <- names(z[sapply(z, length) >= 1L])
    if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
  }
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"
This works well with the original sample data, MrFlick's sample data and akrun's sample data.
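As a quick re-run sketch on the original data (assuming `data` is reset to the list from the question), the same code returns the two expected groups:
data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"),
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))
# re-run the lapply() call above, then:
x[!sapply(x, is.null)]
#[[1]]
#[1] "a" "b" "c" "d" "n"
#
#[[2]]
#[1] "g" "h" "k" "i"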
In general, you cannot do much better or faster than the Floyd-Warshall algorithm, which here computes the transitive closure of the "shares an element" relation:
library(Rcpp)
cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
     int n = w.nrow();
     for( int k = 0; k < n; k++ )
       for( int i = 0; i < (n-1); i++ )
         for( int j = i+1; j < n; j++ )
           if( w(i,k) && w(k,j) ) {
             w(i,j) = true;
             w(j,i) = true;
           }
     return w;
  }")
fw.union <- function(x) {
  n <- length(x)
  # w[i, j] is TRUE if x[[i]] and x[[j]] share at least one element
  w <- matrix(FALSE, nrow = n, ncol = n)
  for (i in 1:n) {
    w[i, i] <- TRUE
  }
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      w[i, j] <- w[j, i] <- any(x[[i]] %in% x[[j]])
    }
  }
  # transitive closure, then union the vectors of each connected component
  apply(unique(floyd(w)), 1, function(y) Reduce(union, x[y]))
}
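A minimal usage sketch, assuming `data` holds the original list from the question:
fw.union(data)
#[[1]]
#[1] "a" "b" "c" "d" "n"
#
#[[2]]
#[1] "g" "h" "k" "i"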
Running benchmarks would be interesting, though. Preliminary tests suggest that my implementation is about 2-3 times faster than Vlo's.
This is essentially a graph problem, so I like to use the igraph library for it. Using your sample data, you can do:
library(igraph)
# build an edge list by linking consecutive elements within each vector
el <- do.call("rbind", lapply(data, embed, 2))
# make an undirected graph
gg <- graph.edgelist(el, directed = FALSE)
# partition the graph into disjoint sets (connected components)
split(V(gg)$name, clusters(gg)$membership)
# $`1`
# [1] "b" "a" "c" "d" "n"
#
# $`2`
# [1] "h" "g" "k" "i"
And we can view the results with:
V(gg)$color <- c("green", "purple")[clusters(gg)$membership]
plot(gg)
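In newer igraph releases, graph.edgelist and clusters have current equivalents; the same approach with the newer names (a sketch, same logic as above) would be:
library(igraph)
el <- do.call("rbind", lapply(data, embed, 2))
gg <- graph_from_edgelist(el, directed = FALSE)
split(V(gg)$name, components(gg)$membership)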
Efficiency be damned, and do you people even sleep? This is base R only and much slower than the fastest answer, but since I wrote it, I might as well post it.
f.union <- function(x) {
  repeat {
    n <- length(x)
    # m[i, j] is TRUE if x[[i]] and x[[j]] share at least one element
    m <- matrix(FALSE, nrow = n, ncol = n)
    for (i in 1:n) {
      for (j in 1:n) {
        m[i, j] <- any(x[[i]] %in% x[[j]])
      }
    }
    # union each group of overlapping vectors, then repeat until nothing overlaps
    o <- apply(m, 2, function(v) Reduce(union, x[v]))
    if (all(apply(m, 1, sum) == 1)) return(o) else x <- unique(o)
  }
}
f.union(data)
[[1]]
[1] "a" "b" "c" "d" "n"
[[2]]
[1] "g" "h" "k" "i"
Because I like being slow. (The library was loaded outside of the benchmark.)
Unit: microseconds
expr min lq mean median uq max neval
vlo() 896.435 1070.6540 1315.8194 1129.4710 1328.6630 7859.999 1000
akrun() 596.263 658.6590 789.9889 694.1360 804.9035 3470.158 1000
flick() 805.854 928.8160 1160.9509 1001.8345 1172.0965 5780.824 1000
josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909 1000 <- deleted :-(
doc() 254.462 288.9875 354.6084 302.6415 338.9565 2734.795 1000
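For reference, a benchmark of this shape can be produced with microbenchmark, where vlo(), akrun(), flick(), josh(), and doc() are assumed to be zero-argument wrappers around the respective answers applied to the same sample data (josh's answer has since been deleted):
library(microbenchmark)
microbenchmark(vlo(), akrun(), flick(), josh(), doc(), times = 1000L)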
One option would be to use combn and then find the intersections. There may be easier options.
# all pairwise combinations of vector names
indx <- combn(names(data), 2)
# intersect each pair of vectors
lst <- lapply(split(indx, col(indx)),
              function(i) Reduce(`intersect`, data[i]))
# keep only the pairs that share at least one element
indx1 <- names(lst[sapply(lst, length) > 0])
indx2 <- indx[, as.numeric(indx1)]
indx3 <- apply(indx2, 2, sort)
# group the overlapping pairs by their first member and union the vectors
lapply(split(1:ncol(indx3), indx3[1,]),
       function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"
#$v2
#[1] "g" "h" "k" "i"
You could use combnPrim from library(gRbase) to make this even faster. Using a slightly bigger dataset
library(gRbase)
set.seed(25)
data <- setNames(lapply(1:1e3,function(i)sample(letters,
sample(1:20), replace=FALSE)), paste0("v", 1:1000))
and comparing with the fastest approach. These are modified functions based on the OP's comments to @docendo discimus.
akrun2M <- function(){
  ind <- sapply(seq_along(data), function(i){ # copied from @docendo discimus
    !any(data[[i]] %in% unlist(data[-i]))
  })
  data1 <- data[!ind]
  indx <- combnPrim(names(data1), 2)
  lst <- lapply(split(indx, col(indx)),
                function(i) Reduce(`intersect`, data1[i]))
  indx1 <- names(lst[sapply(lst, length) > 0])
  indx2 <- indx[, as.numeric(indx1)]
  indx3 <- apply(indx2, 2, sort)
  c(data[ind], lapply(split(1:ncol(indx3), indx3[1,]),
      function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
}
doc2 <- function(){
  x <- lapply(seq_along(data), function(i) {
    if (!any(data[[i]] %in% unlist(data[-i]))) {
      data[[i]]
    } else {
      z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                           intersect, data[[i]])))])
      if (is.null(z)) z else union(data[[i]], z)
    }
  })
  x[!sapply(x, is.null)]
}
library(microbenchmark)
microbenchmark(doc2(), akrun2M(), times=10L)
# Unit: seconds
# expr min lq mean median uq max neval cld
# doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754 10 b
#akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158 10 a
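One way to check whether the two modified functions return the same groups on the larger dataset, up to element and group order (a sketch; the helper norm() is only for this comparison):
res_doc <- doc2()
res_akrun <- akrun2M()
# normalize each result: sort elements within groups, then sort the groups
norm <- function(l) sort(sapply(l, function(v) paste(sort(unique(v)), collapse = ",")))
identical(norm(res_doc), norm(res_akrun))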