R Optimizing double for loop, matrix manipulation

后端 未结 4 2089
感情败类
感情败类 2021-01-20 19:20

I am trying to manipulate column data in a two column matrix and output it as a data.frame.

The matrix that I have is in this format where both the values in the sta

相关标签:
4条回答
  • 2021-01-20 19:53

    Here's a solution built around findInterval(), split(), and paste():

    m <- matrix(c(1,2,3,7,8,11,12,14,6,9,15,NA,NA,NA,NA,NA),ncol=2,dimnames=list(NULL,c('Start','End')));
    data.frame(Start=sapply(split(m[,'Start'],findInterval(m[,'Start'],na.omit(m[,'End']))),paste,collapse=','),End=na.omit(m[,'End']));
    ##      Start End
    ## 0    1,2,3   6
    ## 1      7,8   9
    ## 2 11,12,14  15
    

    Edit: The problem you encountered was due to the fact that in your real data some intervals between input End values do not contain any input Start values. My solution above is incorrectly omitting those intervals from the output Start vector, which causes a length mismatch against the output End vector.

    Here is a fixed solution:

    end <- na.omit(m[,'End']);
    data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end);
    ##      Start End
    ## 1    1,2,3   6
    ## 2      7,8   9
    ## 3 11,12,14  15
    

    Here's a demonstration on a test matrix that has an empty interval:

    m <- matrix(c(1,2,3,11,12,14,6,9,15,NA,NA,NA),ncol=2,dimnames=list(NULL,c('Start','End')));
    m;
    ##      Start End
    ## [1,]     1   6
    ## [2,]     2   9
    ## [3,]     3  15
    ## [4,]    11  NA
    ## [5,]    12  NA
    ## [6,]    14  NA
    end <- na.omit(m[,'End']);
    data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end);
    ##      Start End
    ## 1    1,2,3   6
    ## 2            9
    ## 3 11,12,14  15
    

    As you can see, for an empty interval, the value that results in the output Start vector is the empty string, which I consider a sensible result. You can change the result afterward if desired.

    Finally, here's a demo using the real data you posted to dropbox:

    m <- read.table('start_end.txt',col.names=c('Start','End'));
    head(m);
    ##   Start   End
    ## 1 11165 10548
    ## 2 12416 11799
    ## 3 12466 11900
    ## 4 12691 11976
    ## 5 12834 13336
    ## 6 13320 14028
    end <- na.omit(m[,'End']);
    system.time({ out <- data.frame(Start=unname(sapply(split(m[,'Start'],findInterval(m[,'Start'],end))[as.character(0:c(length(end)-1))],paste,collapse=',')),End=end); });
    ##    user  system elapsed
    ##  21.234   0.015  21.251
    head(out);
    ##                           Start   End
    ## 1                               10548
    ## 2                         11165 11799
    ## 3                               11900
    ## 4                               11976
    ## 5 12416,12466,12691,12834,13320 13336
    ## 6       13425,13571,13703,13920 14028
    nrow(out);
    ## [1] 131668
    
    0 讨论(0)
  • 2021-01-20 19:57

    You could use Rcpp:

    start_end <- matrix(c(1, 6, 2, 9, 3, 15, 7, NA, 8, NA, 11, NA, 12, NA, 14, NA), 
                        nrow=8, 
                        ncol=2, byrow = TRUE)
    
    library(Rcpp)
    cppFunction('
                DataFrame fun(const IntegerMatrix& Mat) {
                  IntegerVector start = na_omit(Mat(_, 0)); // remove NAs from starts
                  std::sort(start.begin(), start.end()); // sort starts
                  IntegerVector end = na_omit(Mat(_, 1)); // remove NAs from ends
                  std::sort(end.begin(), end.end()); // sort ends
                  IntegerVector res = clone(start); // initialize vector for matching ends
                  int j = 0;
                  for (int i = 0; i < start.length(); i++) { // loop over starts
                    while (end(j) < start(i) && j < (end.length() - 1)) { // find corresponding end
                      j++;
                    }
                    if (end(j) >= start(i)) res(i) = end(j); // assign end
                    else res(i) = NA_INTEGER; // assign NA if no end >= start exists
                  }
                  return DataFrame::create(_["start"]= start, _["end"]= res); // return a data.frame
                }
                ')
    
    Res <- fun(start_end)
    
    library(data.table)
    setDT(Res)
    Res[, .(start = paste(start, collapse = ",")), by = end]
    #   end    start
    #1:   6    1,2,3
    #2:   9      7,8
    #3:  15 11,12,14
    
    0 讨论(0)
  • 2021-01-20 20:06

    An ugly dplyr solution:

    library(dplyr)
    df <- as.data.frame(df)
    
    df %>% mutate(End = V2[findInterval(V1, na.omit(V2)) + 1]) %>%
           group_by(End) %>%
           summarise(Start = paste(V1, collapse=", "))
    

    Edit - using findInterval thanks to @bgoldst

    0 讨论(0)
  • 2021-01-20 20:09

    Here is a simple base R version

    with(as.data.frame(dat), {
      data.frame(
        Start=tapply(Start, cut(Start, c(0, End)), c),
        End=na.omit(End)
      )
    })
    #        Start End
    # 1    1, 2, 3   6
    # 2       7, 8   9
    # 3 11, 12, 14  15
    

    Another

    with(as.data.frame(dat), {
      group <- as.integer(cut(Start, c(0, End)))                  # assign Start values to End groups
      data.frame(
        Start=unclass(by(dat, group, function(g) g[["Start"]])),  # combine Start groups
        End=unique(na.omit(End))                                  # Remove duplicate/NA End values
      )
    })
    
    0 讨论(0)
提交回复
热议问题