How to make R foreach loops efficient

后端 未结 1 906
北荒
北荒 2021-01-26 00:43

I am trying to compute a 300,000x300,000 matrix in R, my codes are working quite well but it\'s been running for days now, how can i make it more efficient and time saving?

1条回答
  •  时光取名叫无心
    2021-01-26 01:14

    The problem you have is that this is recursive. Each loop depends on the previous loop's results. Therefore, you can't really use vectorization to solve the problem.

    If you want to use R for this, you're best bet is to look into Rcpp. I'm not that good with Rcpp but I do have some suggestions.

    The easiest thing to do is to get rid of the foreach loop and replace it with a regular for loop. There's a lot of overhead to use parallel threads and when a function is recursive, it's hard for the workers to really do better on their own.

    # Before
    
    foreach(j = 1:(t-1),  .combine='c', .packages=c("Matrix", "foreach")) %do%
    { ... }
    
    # After
    for (j in 1:(t-1)) {
    ...
    }
    

    The next thing to do is to contemplate whether you really need a sparse matrix. If you're not having memory problems, you might as well use a regular matrix.

    A<-Matrix(0,nrow=n,ncol=n, sparse=TRUE)
    # to
    A<-matrix(0,nrow=n,ncol=n)
    

    The last thing to do is to rethink how you initialize everything. Parts of that code gets repeated multiple times like the assignment to the diag. Since we're summing separate elements, we can initialize the diag with the part common to all 3 code snippets 2 - 0.5^(fam[t, 'GEN'] - 1).

    A<-matrix(0,nrow=n,ncol=n)
    diag(A) <- 2-0.5^(fam[["GEN"]]-1)
    

    This is important because that allows us to skip ahead. Your original code snippet had like, 1,000 rows with 0s for 'mum' and 'dad'. With this initialization, we can skip right ahead to the first row with a non-zero result for 'mum' or 'dad':

      t_start <- min(which.max(fam$dad > 0), which.max(fam$mum > 0))
      t_end <- max(fam[['ID']])
    
      for (t in t_start:t_end) {
    ...
    }
    

    I decided in the interest of skipping if statements, I wanted to use sum(c(..., ...)) to sum up everything. That way, if the subset resulted in a NULL, I could still sum. Altogether:

      t_start <- min(which.max(fam$dad > 0), which.max(fam$mum > 0))
      t_end <- max(fam[['ID']])
    
      A<-matrix(0,nrow=t_end,ncol=t_end)
      diag(A) <- 2-0.5^(fam[["GEN"]]-1)
    
      for (t in t_start:t_end) {
        A[t,t]<- sum(c(A[t,t], 0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]))
    
        for(j in 1:(t-1))  {
          A[t,j]<- 0.5 * sum(c(A[j,fam[t,"dad"]],A[j,fam[t,"mum"]]))
          A[j,t]<- A[t,j]
        }
      }
      A
    

    Performance

    Unit: microseconds
                    expr       min         lq      mean    median        uq     max neval
                original 85759.901 86650.7515 88776.695 88740.050 90529.750 97433.2   100
             non_foreach 47912.601 48528.5010 50699.867 50220.901 51782.651 88355.1   100
     non_sparse_non_each  1423.701  1454.3015  1531.833  1471.451  1496.401  4126.3   100
            final_change   953.102   981.8015  1212.264  1010.500  1026.052 21350.1   100
    

    All code

    fam <- structure(list(ID = c(1L, 2L, 3L, 4L, 6L, 5L, 7L), dad = c(0L, 
                                                                      0L, 1L, 1L, 1L, 3L, 5L), mum = c(0L, 0L, 0L, 2L, 4L, 4L, 6L), 
                          GEN = c(1L, 1L, 2L, 2L, 3L, 3L, 4L)), class = "data.frame", row.names = c(NA, 
                                                                                                    -7L))
    A<-matrix(0,nrow=7,ncol=7)
    diag(A) <- 2-0.5^(fam[["GEN"]]-1)
    
    t_start <- min(which.max(fam$dad > 0), which.max(fam$mum > 0))
    t_end <- max(fam[['ID']])
    
    for (t in t_start:t_end) {
      A[t,t]<- sum(c(A[t,t], 0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]))
    
      for(j in 1:(t-1))  {
        A[t,j]<- 0.5 * sum(c(A[j,fam[t,"dad"]],A[j,fam[t,"mum"]]))
        A[j,t]<- A[t,j]
      }
    }
    A
    
    hom<-function(data) { 
      library(Matrix)
      library(foreach)
      n<-max(as.numeric(fam[,"ID"])) 
      t<-min(as.numeric(fam[,"ID"])) 
      A<-Matrix(0,nrow=n,ncol=n, sparse=TRUE)
    
      while(t <=n) {
    
        s<-max(fam[t,"dad"],fam[t,"mum"]) 
        d<-min(fam[t,"dad"],fam[t,"mum"])
        if (s>0 & d>0 ) 
        { 
          if (fam[t,"GEN"]==999 & s!=d) 
          { warning("both dad and mum should be the same, different for at least       one individual")
            NULL    
          }
    
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)+0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]
          foreach(j = 1:(t-1),  .combine='c', .packages=c("Matrix", "foreach")) %do%
    
          { 
            A[t,j]<- 0.5*(A[j,fam[t,"dad"]]+A[j,fam[t,"mum"]])
            A[j,t]<- A[t,j] 
          } 
        } 
        if (s>0 & d==0 )
        { 
          if ( fam[t,"GEN"]==999) 
          { warning("both dad and mum should be the same, one parent equal to zero for at least individual")
            NULL }
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1) 
          foreach(j = 1:(t-1),  .combine='c', .packages=c("Matrix", "foreach")) %do%  
          { 
            A[t,j]<-0.5*A[j,s]
            A[j,t]<-A[t,j] 
          } 
        } 
        if (s==0 )
        {
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)
        }
    
        # cat(" MatbyGEN: ", t ,"\n") 
        t <- t+1
    
    
      } 
    
      A
    
    }
    
    hom2<-function(data) { 
      library(Matrix)
      n<-max(as.numeric(fam[,"ID"])) 
      t<-min(as.numeric(fam[,"ID"])) 
      A<-Matrix(0,nrow=n,ncol=n, sparse = T)
    
      while(t <=n) {
    
        s<-max(fam[t,"dad"],fam[t,"mum"]) 
        d<-min(fam[t,"dad"],fam[t,"mum"])
        if (s>0 & d>0 ) 
        { 
          if (fam[t,"GEN"]==999 & s!=d) 
          { warning("both dad and mum should be the same, different for at least       one individual")
            NULL    
          }
    
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)+0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]
          for (j in 1:(t-1)) { 
            A[t,j]<- 0.5*(A[j,fam[t,"dad"]]+A[j,fam[t,"mum"]])
            A[j,t]<- A[t,j] 
          } 
        } 
        if (s>0 & d==0 )
        { 
          if ( fam[t,"GEN"]==999) 
          { warning("both dad and mum should be the same, one parent equal to zero for at least individual")
            NULL }
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1) 
          for (j in 1:(t-1)) { 
            A[t,j]<-0.5*A[j,s]
            A[j,t]<-A[t,j] 
          } 
        } 
        if (s==0 )
        {
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)
        }
    
        # cat(" MatbyGEN: ", t ,"\n") 
        t <- t+1
    
    
      } 
    
      A
    
    }
    
    hom3<-function(data) { 
      n<-max(as.numeric(fam[,"ID"])) 
      t<-min(as.numeric(fam[,"ID"])) 
      A<-matrix(0,nrow=n,ncol=n)
    
      while(t <=n) {
    
        s<-max(fam[t,"dad"],fam[t,"mum"]) 
        d<-min(fam[t,"dad"],fam[t,"mum"])
        if (s>0 & d>0 ) 
        { 
          if (fam[t,"GEN"]==999 & s!=d) 
          { warning("both dad and mum should be the same, different for at least       one individual")
            NULL    
          }
    
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)+0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]
          for (j in 1:(t-1)) { 
            A[t,j]<- 0.5*(A[j,fam[t,"dad"]]+A[j,fam[t,"mum"]])
            A[j,t]<- A[t,j] 
          } 
        } 
        if (s>0 & d==0 )
        { 
          if ( fam[t,"GEN"]==999) 
          { warning("both dad and mum should be the same, one parent equal to zero for at least individual")
            NULL }
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1) 
          for (j in 1:(t-1)) { 
            A[t,j]<-0.5*A[j,s]
            A[j,t]<-A[t,j] 
          } 
        } 
        if (s==0 )
        {
          A[t,t]<- 2-0.5^(fam[t,"GEN"]-1)
        }
    
        # cat(" MatbyGEN: ", t ,"\n") 
        t <- t+1
    
    
      } 
    
      A
    
    }
    
    library(microbenchmark)
    f_changed = function(fam) {
      t_start <- min(which.max(fam$dad > 0), which.max(fam$mum > 0))
      t_end <- max(fam[['ID']])
    
      A<-matrix(0,nrow=t_end,ncol=t_end)
      diag(A) <- 2-0.5^(fam[["GEN"]]-1)
    
      for (t in t_start:t_end) {
        A[t,t]<- sum(c(A[t,t], 0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]))
    
        for(j in 1:(t-1))  {
          A[t,j]<- 0.5 * sum(c(A[j,fam[t,"dad"]],A[j,fam[t,"mum"]]))
          A[j,t]<- A[t,j]
        }
      }
      A
    }
    microbenchmark(
      original = {
        hom(fam)
      }
      , non_foreach = {
        hom2(fam)
      }
      , non_sparse_non_each = {
        hom3(fam)
      }
      , final_change = {
      f_changed(fam)
      }
    ,times = 100
    )
    

    0 讨论(0)
提交回复
热议问题