Fast concatenation of data.table columns into one string column

后端 未结 3 1583
挽巷
挽巷 2021-01-31 18:35

Given an arbitrary list of column names in a data.table, I want to concatenate the contents of those columns into a single string stored in a new column. The column

相关标签:
3条回答
  • 2021-01-31 18:46

    I don't know how representative the sample data is for your actual data, but in the case of your sampled data you can achieve a substantial performance improvement by only concatenating each unique combination of ConcatCols once instead of multiple times.

    That means for the sample data, you'd be looking at ~500k concatenations vs 10 million if you do all the duplicates too.

    See the following code and timing example:

    system.time({
      setkeyv(DT, ConcatCols)
      DTunique <- unique(DT[, ConcatCols, with=FALSE], by = key(DT))
      DTunique[, State :=  do.call(paste, c(DTunique, sep = ""))]
      DT[DTunique, State := i.State, on = ConcatCols]
    })
    #       user      system     elapsed 
    #      7.448       0.462       4.618 
    

    About half the time is spent on the setkey part. In case your data is already keyed, the time is cut down further to just a bit more than 2 seconds.

    setkeyv(DT, ConcatCols)
    system.time({
      DTunique <- unique(DT[, ConcatCols, with=FALSE], by = key(DT))
      DTunique[, State :=  do.call(paste, c(DTunique, sep = ""))]
      DT[DTunique, State := i.State, on = ConcatCols]
    })
    #       user      system     elapsed 
    #      2.526       0.280       2.181 
    
    0 讨论(0)
  • 2021-01-31 18:57

    This uses unite from package tidyr. May not be the fastest, but it is probably faster than hand-coded R code.

    library(tidyr)
    system.time(
      DNew <- DT %>% unite(State, ConcatCols, sep = "", remove = FALSE)
    )
    # user  system elapsed 
    # 14.974   0.183  15.343 
    
    DNew[1:10]
    # State   x   y a b c d e f
    # 1: foo211621bar foo bar 2 1 1 6 2 1
    # 2: foo532735bar foo bar 5 3 2 7 3 5
    # 3: foo965776bar foo bar 9 6 5 7 7 6
    # 4: foo221284bar foo bar 2 2 1 2 8 4
    # 5: foo485976bar foo bar 4 8 5 9 7 6
    # 6: foo566778bar foo bar 5 6 6 7 7 8
    # 7: foo892636bar foo bar 8 9 2 6 3 6
    # 8: foo836672bar foo bar 8 3 6 6 7 2
    # 9: foo963926bar foo bar 9 6 3 9 2 6
    # 10: foo385216bar foo bar 3 8 5 2 1 6
    
    0 讨论(0)
  • 2021-01-31 19:06

    C to the rescue!

    Stealing some code from data.table we can write a C function that works way faster (and could be parallelized to be even faster).

    First make sure you have a working C++ toolchain with:

    library(inline)
    
    fx <- inline::cfunction( signature(x = "integer", y = "numeric" ) , '
        return ScalarReal( INTEGER(x)[0] * REAL(y)[0] ) ;
    ' )
    fx( 2L, 5 ) #Should return 10
    

    Then this should work (assuming integer-only data, but the code can be extended to other types):

    library(inline)
    library(data.table)
    library(stringi)
    
    header <- "
    
    //Taken from https://github.com/Rdatatable/data.table/blob/master/src/fwrite.c
    static inline void reverse(char *upp, char *low)
    {
      upp--;
      while (upp>low) {
      char tmp = *upp;
      *upp = *low;
      *low = tmp;
      upp--;
      low++;
      }
    }
    
    void writeInt32(int *col, size_t row, char **pch)
    {
      char *ch = *pch;
      int x = col[row];
      if (x == INT_MIN) {
      *ch++ = 'N';
      *ch++ = 'A';
      } else {
      if (x<0) { *ch++ = '-'; x=-x; }
      // Avoid log() for speed. Write backwards then reverse when we know how long.
      char *low = ch;
      do { *ch++ = '0'+x%10; x/=10; } while (x>0);
      reverse(ch, low);
      }
      *pch = ch;
    }
    
    //end of copied code 
    
    "
    
    
    
     worker_fun <- inline::cfunction( signature(x = "list", preallocated_target = "character", columns = "integer", start_row = "integer", end_row = "integer"), includes = header , "
      const size_t _start_row = INTEGER(start_row)[0] - 1;
      const size_t _end_row = INTEGER(end_row)[0];
    
      const int max_out_len = 256 * 256; //max length of the final string
      char buffer[max_out_len];
      const size_t num_elements = _end_row - _start_row;
      const size_t num_columns = LENGTH(columns);
      const int * _columns = INTEGER(columns);
    
      for(size_t i = _start_row; i < _end_row; ++i) {
        char *buf_pos = buffer;
        for(size_t c = 0; c < num_columns; ++c) {
          if(c > 0) {
            buf_pos[0] = ',';
            ++buf_pos;
          }
          writeInt32(INTEGER(VECTOR_ELT(x, _columns[c] - 1)), i, &buf_pos);
        }
        SET_STRING_ELT(preallocated_target,i, mkCharLen(buffer, buf_pos - buffer));
      }
    return preallocated_target;
    " )
    
    #Test with the same data
    
    RowCount <- 5e6
    DT <- data.table(x = "foo",
                     y = "bar",
                     a = sample.int(9, RowCount, TRUE),
                     b = sample.int(9, RowCount, TRUE),
                     c = sample.int(9, RowCount, TRUE),
                     d = sample.int(9, RowCount, TRUE),
                     e = sample.int(9, RowCount, TRUE),
                     f = sample.int(9, RowCount, TRUE))
    
    ## Generate an expression to paste an arbitrary list of columns together
    ConcatCols <- list("a","b","c","d","e","f")
    ## Do it 3x as many times
    ConcatCols <- c(ConcatCols,ConcatCols,ConcatCols)
    
    
    ptm <- proc.time()
    preallocated_target <- character(RowCount)
    column_indices <- sapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x )})
    x <- worker_fun(DT, preallocated_target, column_indices, as.integer(1), as.integer(RowCount))
    DT[, State := preallocated_target]
    proc.time() - ptm
    

    While your (integer only) example runs in about 20s on my PC, this runs in ~5s and can be easily parallelized.

    Some things to note:

    • The code is not production ready - a lot of sanity checks should be made on the function inputs (especially checking if all columns are the same length, checking column types, preallocated_target size etc.)
    • The function puts its output into a preallocated character vector, this is non-standard and ugly (R usually does not have pass-by-reference semantics) but allows for parallelization (see below).
    • The last two parameters are start and end rows to be processed, once again, this is for paralellization
    • The function accepts column indices not column names. All columns have to be of type integer.
    • Except for the input data.table and preallocated_target the inputs have to be integers
    • Compilation time for the function is not included (as you should compile it beforehand - maybe even make a package)

    Parallelization

    EDIT: The approach below would actually fail due to the way clusterExport and R string storage work. Paralellization thus probably needs to be done in C as well, similarly to the way it is achieved in data.table.

    Since you cannot pass inline-compiled functions across R processes, paralellization requires some more work. To be able to use the above function in parallel, you either need to compile it separately with R compiler and use dyn.load OR wrap it in a package OR use a forking backend for parallel (I don't have one, forking works only on UNIX).

    Running in parallel would then look something like (not tested):

    no_cores <- detectCores()
    
    # Initiate cluster
    cl <- makeCluster(no_cores)
    
    #Preallocated target and prepare params
    num_elements <- length(DT[[1]])
    preallocated_target <- character(num_elements)
    block_size <- 4096 #No of rows processed at once. Adjust for best performance
    column_indices <- sapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x )})
    
    num_blocks <- ceiling(num_elements / block_size)
    
    clusterExport(cl, 
       c("DT","preallocated_target","column_indices","num_elements", "block_size"))
    clusterEvalQ(cl, <CODE TO LOAD THE NATIVE FUNCTION HERE>)
    
    parLapply(cl, 1:num_blocks ,
              function(block_id)
              {
                throw_away <- 
                  worker_fun(DT, preallocated_target, columns, 
                  (block_id - 1) * block_size + 1, min(num_elements, block_id * block_size - 1))
                return(NULL)
              })
    
    
    
    stopCluster(cl)
    
    0 讨论(0)
提交回复
热议问题