Fastest way to transpose a list in R / Rcpp

后端 未结 2 1567
轻奢々
轻奢々 2021-01-17 18:19

I have a list:

ls <- list(c(\"a\", \"b\", \"c\"), c(\"1\", \"2\", \"3\"), c(\"foo\", \"bar\", \"baz\"))
ls

#> [[1]]
#> [1] \"a\" \"b\" \"c\"

#>         


        
相关标签:
2条回答
  • 2021-01-17 19:13

    In the data.table package, there's a transpose() function which does exactly this. It is implemented in C for speed.

    require(data.table) # v1.9.6+
    transpose(ls)
    # [[1]]
    # [1] "a"   "1"   "foo"
    
    # [[2]]
    # [1] "b"   "2"   "bar"
    
    # [[3]]
    # [1] "c"   "3"   "baz"
    

    It also fills automatically with NA in case the list elements are not of equal lengths, and also coerces automatically to the highest SEXPTYPE. You can provide a different value to the fill argument if necessary. Check ?transpose.

    0 讨论(0)
  • 2021-01-17 19:19

    "list"s are R objects with no C equivalent, so manipulating them in C will gain efficiency only in terms of surrounding computations, since the actual transposing will need to come back and forth between R objects. Arun's transpose is a concise approach to this problem and, seemingly, can't get any better. I'll just provide some other alternatives just to show that transposing a "list" can be cranky and maybe adopting a different approach to achieve the final goal might be better.

    map = function(x) .mapply(c, x, NULL)
    
    lap = function(x) lapply(seq_along(x[[1]]), function(i) unlist(lapply(x, "[[", i)))
    
    library(data.table)
    DT = function(x) transpose(x)
    
    # very simple C loop that proves that `data.table::transpose` is as good as it gets
    loopC = inline::cfunction(sig = c(R_ls = "list"), body = '
        SEXPTYPE tp = 0;
        SEXP ans, tmp;
        PROTECT(ans = allocVector(VECSXP, LENGTH(VECTOR_ELT(R_ls, 0))));
        for(int i = 0; i < LENGTH(R_ls); i++) {
            tmp = VECTOR_ELT(R_ls, i);
            if(TYPEOF(tmp) > tp) tp = TYPEOF(tmp);
        }
        for(int i = 0; i < LENGTH(ans); i++) SET_VECTOR_ELT(ans, i, allocVector(tp, LENGTH(R_ls)));
    
        switch(tp) {
            case LGLSXP:
            case INTSXP: {
                for(int i = 0; i < LENGTH(R_ls); i++) {
                    PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
                    int *ptmp = INTEGER(tmp);
                    for(int j = 0; j < LENGTH(ans); j++) INTEGER(VECTOR_ELT(ans, j))[i] = ptmp[j];
                    UNPROTECT(1);
                }
    
                break;
            }
            case REALSXP: {
                for(int i = 0; i < LENGTH(R_ls); i++) {
                    PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
                    double *ptmp = REAL(tmp);
                    for(int j = 0; j < LENGTH(ans); j++) REAL(VECTOR_ELT(ans, j))[i] = ptmp[j];
                    UNPROTECT(1);
                }
    
                break;
            }
            case STRSXP: {
                for(int i = 0; i < LENGTH(R_ls); i++) {
                    PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
                    for(int j = 0; j < LENGTH(ans); j++) SET_STRING_ELT(VECTOR_ELT(ans, j), i, STRING_ELT(tmp, j));
                    UNPROTECT(1);
                }
    
                break;
            }
        }
    
        UNPROTECT(1);
        return(ans);
    ')
    
    spl = function(x) split(unlist(x), rep(seq_along(x[[1]]), length(x)))
    
    map(ls)
    #[[1]]
    #[1] "a"   "1"   "foo"
    #
    #[[2]]
    #[1] "b"   "2"   "bar"
    #
    #[[3]]
    #[1] "c"   "3"   "baz"
    #
    lap(ls)
    #[[1]]
    #[1] "a"   "1"   "foo"
    #
    #[[2]]
    #[1] "b"   "2"   "bar"
    #
    #[[3]]
    #[1] "c"   "3"   "baz"
    #
    DT(ls)
    #[[1]]
    #[1] "a"   "1"   "foo"
    #
    #[[2]]
    #[1] "b"   "2"   "bar"
    #
    #[[3]]
    #[1] "c"   "3"   "baz"
    #
    loopC(ls)
    #[[1]]
    #[1] "a"   "1"   "foo"
    #
    #[[2]]
    #[1] "b"   "2"   "bar"
    #
    #[[3]]
    #[1] "c"   "3"   "baz"
    #
    spl(ls)
    #$`1`
    #[1] "a"   "1"   "foo"
    #
    #$`2`
    #[1] "b"   "2"   "bar"
    #
    #$`3`
    #[1] "c"   "3"   "baz"
    

    And a benchmark:

    myls1 = rep_len(list(sample(1e3), runif(1e3), sample(letters, 1e3, T)), 1e3)  #1e3 x 1e3
    myls2 = rep_len(list(sample(1e5), runif(1e5), sample(letters, 1e5, T)), 1e1)  #10 x 1e5
    myls3 = rep_len(list(sample(1e1), runif(1e1), sample(letters, 1e1, T)), 1e5)  #1e5 x 10
    
    identical(map(myls1), lap(myls1))
    #[1] TRUE
    identical(map(myls1), DT(myls1))
    #[1] TRUE
    identical(map(myls1), loopC(myls1))
    #[1] TRUE
    identical(map(myls1), unname(spl(myls1)))
    #[1] TRUE
    
    microbenchmark::microbenchmark(map(myls1), lap(myls1), DT(myls1), loopC(myls1), spl(myls1),
                                    map(myls2), lap(myls2), DT(myls2), loopC(myls2), spl(myls2),
                                    map(myls3), lap(myls3), DT(myls3), loopC(myls3), spl(myls3), 
                                    times = 10)
    #Unit: milliseconds
    #         expr       min        lq    median        uq       max neval
    #   map(myls1) 1141.9477 1187.8107 1281.4314 1331.4490 1961.8452    10
    #   lap(myls1) 1082.7023 1104.6467 1182.8303 1219.5397 1695.6164    10
    #    DT(myls1)  378.0574  399.7339  433.4307  459.0293  495.2200    10
    # loopC(myls1)  390.0305  392.5139  405.6461  480.7480  638.9145    10
    #   spl(myls1)  676.2639  756.1798  786.8639  821.7699  869.0219    10
    #   map(myls2) 1241.1010 1304.2250 1386.1915 1439.5182 1546.3835    10
    #   lap(myls2) 1823.2029 1922.1878 1965.6653 2006.6102 2161.9819    10
    #    DT(myls2)  471.5797  521.7380  554.2221  578.3043  887.1452    10
    # loopC(myls2)  472.5713  494.9302  524.2538  591.0493  657.6087    10
    #   spl(myls2) 1108.1530 1117.7448 1212.0051 1297.8838 1336.8266    10
    #   map(myls3) 2005.1325 2178.3739 2214.1824 2451.7050 2539.5152    10
    #   lap(myls3) 1172.3033 1215.1297 1242.0294 1292.7345 1434.1707    10
    #    DT(myls3)  388.6679  393.5446  416.5494  479.1473  721.0758    10
    # loopC(myls3)  389.4098  396.6768  404.9609  432.4390  451.8912    10
    #   spl(myls3)  675.7749  704.3328  767.0548  817.7189  937.1469    10
    
    0 讨论(0)
提交回复
热议问题