Union of intersecting vectors in a list in R

后端 未结 5 1300
北恋
北恋 2020-12-06 05:19

I have a list of vectors as follows.

data <- list(v1=c(\"a\", \"b\", \"c\"), v2=c(\"g\", \"h\", \"k\"), 
             v3=c(\"c\", \"d\"), v4=c(\"n\", \"a\         


        
相关标签:
5条回答
  • 2020-12-06 05:35

    Here's another approach using only base R

    Update

    Next update after akrun's comment and with his sample data:

    data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))
    

    Modified function:

    x <- lapply(seq_along(data), function(i) {
      if(!any(data[[i]] %in% unlist(data[-i]))) {
        data[[i]]
      } else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
        NULL 
      } else {
        z <- lapply(data[-seq_len(i)], intersect,  data[[i]]) 
        z <- names(z[sapply(z, length) >= 1L])
        if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
      }
    })
    x[!sapply(x, is.null)]
    #[[1]]
    #[1] "g" "k"
    #
    #[[2]]
    #[1] "a" "b" "c" "d"
    

    This works well with the original sample data, MrFlick's sample data and akrun's sample data.

    0 讨论(0)
  • 2020-12-06 05:37

    In general, you cannot do much better/faster than Floyd-Warshall-Algorithm, which is as follows:

    library(Rcpp)
    
    cppFunction(
      "LogicalMatrix floyd(LogicalMatrix w){
        int n = w.nrow();
        for( int k = 0; k < n; k++ )
         for( int i = 0; i < (n-1); i++ )
          for( int j = i+1; j < n; j++ ) 
           if( w(i,k) && w(k,j) ) {
            w(i,j) = true;
            w(j,i) = true;
           }
       return w;
    }")
    
    fw.union<-function(x) {
      n<-length(x)
      w<-matrix(F,nrow=n,ncol=n)
      for( i in 1:n ) {
       w[i,i]<-T
      }
      for( i in 1:(n-1) ) {
       for( j in (i+1):n ) {
         w[i,j]<-w[j,i]<- any(x[[i]] %in% x[[j]])
       }
      }
     apply( unique( floyd(w) ), 1, function(y) { Reduce(union,x[y]) } )
    }
    

    Running benchmarks would be interesting, though. Preliminary tests suggest that my implementation is about 2-3 times faster than Vlo's.

    0 讨论(0)
  • 2020-12-06 05:49

    This is kind of like a graph problem so I like to use the igraph library for this, using your sample data, you can do

    library(igraph)
    #build edgelist
    el <- do.call("rbind",lapply(data, embed, 2))
    #make a graph
    gg <- graph.edgelist(el, directed=F)
    #partition the graph into disjoint sets
    split(V(gg)$name, clusters(gg)$membership)
    
    # $`1`
    # [1] "b" "a" "c" "d" "n"
    # 
    # $`2`
    # [1] "h" "g" "k" "i"
    

    And we can view the results with

    V(gg)$color=c("green","purple")[clusters(gg)$membership]
    plot(gg)
    

    enter image description here

    0 讨论(0)
  • 2020-12-06 05:57

    Efficiency be damned and do you people even sleep? Base R only and much slower than the fastest answer. Since I wrote it, might as well post it.

    f.union = function(x) {
      repeat{
        n = length(x)
        m = matrix(F, nrow = n, ncol = n)
        for (i in 1:n){
          for (j in 1:n) {
            m[i,j] = any(x[[i]] %in% x[[j]])
          }
        }
        o = apply(m, 2, function(v) Reduce(union, x[v]))
        if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
      }
    }
    
    f.union(data)
    
    [[1]]
    [1] "a" "b" "c" "d" "n"
    
    [[2]]
    [1] "g" "h" "k" "i"
    

    Because I like being slow. (loaded library outside of benchmark)

    Unit: microseconds
        expr      min        lq      mean    median        uq       max neval
       vlo()  896.435 1070.6540 1315.8194 1129.4710 1328.6630  7859.999  1000
     akrun()  596.263  658.6590  789.9889  694.1360  804.9035  3470.158  1000
     flick()  805.854  928.8160 1160.9509 1001.8345 1172.0965  5780.824  1000
      josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909  1000 <- deleted :-(
       doc()  254.462  288.9875  354.6084  302.6415  338.9565  2734.795  1000
    
    0 讨论(0)
  • 2020-12-06 05:59

    One option would be to use combn and then find the intersects. There would be easier options.

    indx <- combn(names(data),2)
    lst <- lapply(split(indx, col(indx)), 
            function(i) Reduce(`intersect`,data[i]))
    indx1 <- names(lst[sapply(lst, length)>0])
    indx2 <- indx[,as.numeric(indx1)]
    indx3 <- apply(indx2,2, sort)
    lapply(split(1:ncol(indx3), indx3[1,]),
       function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
    #$v1
    #[1] "a" "b" "c" "d" "n"
    
    #$v2
    #[1] "g" "h" "k" "i"
    

    Update

    You could use combnPrim from library(gRbase) to make this even faster. Using a slightly bigger dataset

    library(gRbase)
    set.seed(25)
    data <- setNames(lapply(1:1e3,function(i)sample(letters,
             sample(1:20), replace=FALSE)), paste0("v", 1:1000))
    

    and comparing with the fastest. These are modified functions based on OP's comments to @docendo discimus.

    akrun2M <- function(){
         ind <- sapply(seq_along(data), function(i){#copied from @docendo discimus
                !any(data[[i]] %in% unlist(data[-i]))
                  })
         data1 <- data[!ind] 
         indx <- combnPrim(names(data1),2)
         lst <- lapply(split(indx, col(indx)), 
                  function(i) Reduce(`intersect`,data1[i]))
         indx1 <- names(lst[sapply(lst, length)>0])
         indx2 <- indx[,as.numeric(indx1)]
         indx3 <- apply(indx2,2, sort)
         c(data[ind],lapply(split(1:ncol(indx3), indx3[1,]),
            function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
       } 
    
    doc2 <- function(){
          x <- lapply(seq_along(data), function(i) {
              if(!any(data[[i]] %in% unlist(data[-i]))) {
                   data[[i]]
               } 
              else {
                z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                         intersect, data[[i]])))]) 
              if (is.null(z)){ 
                   z
                   }
              else union(data[[i]], z)
            }
       })
    x[!sapply(x, is.null)]
    }
    

    Benchmarks

     microbenchmark(doc2(), akrun2M(), times=10L)
     # Unit: seconds
     #    expr      min       lq     mean   median       uq      max neval  cld
     #   doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754    10   b
     #akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158    10   a 
    
    0 讨论(0)
提交回复
热议问题