Split a string column into several dummy variables

前端未结

关注

 6  667

As a relatively inexperienced user of the data.table package in R, I\'ve been trying to process one text column into a large number of indicator columns (dummy variables), w

Benchmarks:

Creating the sample data:

# sample data, using OP's exmple
set.seed(10)
N <- 1e6  # number of rows
elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))  
messy_string_vec <- random_string_fast(N, 2, 5, "$")   # Create the messy strings in a single shot. 
masterDT <- data.table(ID = c(1:N), messy_string = messy_string_vec, key="ID")   # create the data.table

Side Note It is significantly faster to create the random strings all at once and assign the results as a single column than to call the function N times and assign each, one by one.

  # Faster way to create the `messy_string` 's
  random_string_fast <- function(N, min_length, max_length, separator) {  
    ints <- seq(from=min_length, to=max_length)
    replicate(N, paste(sample(elements_list, sample(ints)), collapse=separator))
  }

Comparing Four Methods:

this answer -- "DT.RS"
@eddi's answer -- "Plyr.eddi"
@GeekTrader's answer -- DT.GT
GeekTrader's' answer with some modifications -- DT.GT_Mod

Here is the setup:

library(data.table); library(plyr); library(microbenchmark)

# data.table method - RS
usingDT.RS <- quote({DT <- copy(masterDT);
                    DT2 <- setkey(DT[, list(val=unlist(strsplit(messy_string, "\\$"))), by=list(ID, messy_string)], "val"); DT2[, c(elements_list) := 0L]
                    for (el in elements_list) DT2[el, c(el) := 1L]; DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]})

# data.table method - GeekTrader
usingDT.GT <- quote({dt <- copy(masterDT); myFunc()})

# data.table method - GeekTrader, modified by RS
usingDT.GT_Mod <- quote({dt <- copy(masterDT); myFunc.modified()})

# ply method from below
usingPlyr.eddi <- quote({dt <- copy(masterDT); indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i) dt[i, data.frame(t(as.matrix(table(strsplit(messy_string, split = "\\$"))))) ])); 
                    dt = cbind(dt, indicators); dt[is.na(dt)] = 0; dt })

Here are the benchmark results:

microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), usingPlyr.eddi=eval(usingPlyr.eddi), times=5L)


  On smaller data: 

  N = 600
  Unit: milliseconds
              expr       min        lq    median        uq       max
  1     usingDT.GT 1189.7549 1198.1481 1200.6731 1202.0972 1203.3683
  2 usingDT.GT_Mod  581.7003  591.5219  625.7251  630.8144  650.6701
  3     usingDT.RS 2586.0074 2602.7917 2637.5281 2819.9589 3517.4654
  4 usingPlyr.eddi 2072.4093 2127.4891 2225.5588 2242.8481 2349.6086


  N = 1,000 
  Unit: seconds
       expr      min       lq   median       uq      max
  1 usingDT.GT 1.941012 2.053190 2.196100 2.472543 3.096096
  2 usingDT.RS 3.107938 3.344764 3.903529 4.010292 4.724700
  3  usingPlyr 3.297803 3.435105 3.625319 3.812862 4.118307

  N = 2,500
  Unit: seconds
              expr      min       lq   median       uq       max
  1     usingDT.GT 4.711010 5.210061 5.291999 5.307689  7.118794
  2 usingDT.GT_Mod 2.037558 2.092953 2.608662 2.638984  3.616596
  3     usingDT.RS 5.253509 5.334890 6.474915 6.740323  7.275444
  4 usingPlyr.eddi 7.842623 8.612201 9.142636 9.420615 11.102888

  N = 5,000
              expr       min        lq    median        uq       max
  1     usingDT.GT  8.900226  9.058337  9.233387  9.622531 10.839409
  2 usingDT.GT_Mod  4.112934  4.293426  4.460745  4.584133  6.128176
  3     usingDT.RS  8.076821  8.097081  8.404799  8.800878  9.580892
  4 usingPlyr.eddi 13.260828 14.297614 14.523016 14.657193 16.698229

  # dropping the slower two from the tests:
  microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), times=6L)

  N = 10,000
  Unit: seconds
              expr       min        lq    median        uq       max
  1 usingDT.GT_Mod  8.426744  8.739659  8.750604  9.118382  9.848153
  2     usingDT.RS 15.260702 15.564495 15.742855 16.024293 16.249556

  N = 25,000
  ... (still running)

-----------------

Functions Used in benchmarking:

  # original random string function
  random_string <- function(min_length, max_length, separator) {  
      selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)  
      return(selection)  
  }  

  # GeekTrader's function
  myFunc <- function() {
    ll <- strsplit(dt[,messy_string], split="\\$")


    COLS <- do.call(rbind, 
                    lapply(1:length(ll), 
                           function(i) {
                             data.frame(
                               ID= rep(i, length(ll[[i]])),
                               COL = ll[[i]], 
                               VAL= rep(1, length(ll[[i]]))
                               )
                             }
                           )
                    )

    res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
    dt <- cbind(dt, res)
    for (j in names(dt))
      set(dt,which(is.na(dt[[j]])),j,0)
    return(dt)
  }


  # Improvements to @GeekTrader's `myFunc` -RS  '
  myFunc.modified <- function() {
    ll <- strsplit(dt[,messy_string], split="\\$")

    ## MODIFICATIONS: 
    # using `rbindlist` instead of `do.call(rbind.. )`
    COLS <- rbindlist( lapply(1:length(ll), 
                           function(i) {
                             data.frame(
                               ID= rep(i, length(ll[[i]])),
                               COL = ll[[i]], 
                               VAL= rep(1, length(ll[[i]])), 
  # MODICIATION:  Not coercing to factors                             
                               stringsAsFactors = FALSE
                               )
                             }
                           )
                    )

  # MODIFICATION: Preserve as matrix, the output of tapply
    res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )

  # FLATTEN into a data.table
    resdt <- data.table(r=c(res2))

  # FIND & REPLACE NA's of single column
    resdt[is.na(r), r:=0L]

  # cbind with dt, a matrix, with the same attributes as `res2`  
    cbind(dt, 
          matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
  }


  ### Benchmarks comparing the two versions of GeekTrader's function: 
  orig = quote({dt <- copy(masterDT); myFunc()})
  modified = quote({dt <- copy(masterDT); myFunc.modified()})
  microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)

  #  Unit: milliseconds
  #        expr      min        lq   median       uq      max
  #  1 Modified  895.025  971.0117 1011.216 1189.599 2476.972
  #  2     Orig 1953.638 2009.1838 2106.412 2230.326 2356.802

0 讨论(0)

醉酒成梦

2020-12-01 22:18

Here's a somewhat newer approach, using cSplit_e() from the splitstackshape package.

library(splitstackshape)
cSplit_e(dt, split.col = "String", sep = "$", type = "character", 
         mode = "binary", fixed = TRUE, fill = 0)
#  ID String String_a String_b String_c
#1  1    a$b        1        1        0
#2  2    b$c        0        1        1
#3  3      c        0        0        1

0 讨论(0)

执念已碎

2020-12-01 22:19

UPDATE : VERSION 3

Found even faster way. This function is also highly memory efficient. Primary reason previous function was slow because of copy/assignments happening inside lapply loop as well as rbinding of the result.

In following version, we preallocate matrix with appropriate size, and then change values at appropriate coordinates, which makes it very fast compared to other looping versions.

funcGT3 <- function() {
    #Get list of column names in result
    resCol <- unique(dt[, unlist(strsplit(messy_string, split="\\$"))])

    #Get dimension of result
    nresCol <- length(resCol)
    nresRow <- nrow(dt)

    #Create empty matrix with dimensions same as desired result
    mat <- matrix(rep(0, nresRow * nresCol), nrow = nresRow, dimnames = list(as.character(1:nresRow), resCol))

    #split each messy_string by $
    ll <- strsplit(dt[,messy_string], split="\\$")

    #Get coordinates of mat which we need to set to 1
    coords <- do.call(rbind, lapply(1:length(ll), function(i) cbind(rep(i, length(ll[[i]])), ll[[i]] )))

    #Set mat to 1 at appropriate coordinates
    mat[coords] <- 1    

    #Bind the mat to original data.table
    return(cbind(dt, mat))

}


result <- funcGT3()  #result for 1000 rows in dt
result
        ID   messy_string zn tc sv db yx st ze qs wq oe cv ut is kh kk im le qg rq po wd kc un ft ye if zl zt wy et rg iu
   1:    1 zn$tc$sv$db$yx  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   2:    2    st$ze$qs$wq  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   3:    3    oe$cv$ut$is  0  0  0  0  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   4:    4 kh$kk$im$le$qg  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   5:    5    rq$po$wd$kc  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0
  ---                                                                                                                    
 996:  996    rp$cr$tb$sa  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 997:  997    cz$wy$rj$he  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
 998:  998       cl$rr$bm  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 999:  999    sx$hq$zy$zd  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
1000: 1000    bw$cw$pw$rq  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0

Benchmark againt version 2 suggested by Ricardo (this is for 250K rows in data) :

Unit: seconds
 expr       min        lq    median        uq       max neval
  GT2 104.68672 104.68672 104.68672 104.68672 104.68672     1
  GT3  15.15321  15.15321  15.15321  15.15321  15.15321     1

VERSION 1 Following is version 1 of suggested answer

set.seed(10)  
elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))  
random_string <- function(min_length, max_length, separator) {  
  selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)  
  return(selection)  
}  
dt <- data.table(ID = c(1:1000), messy_string = "")  
dt[ , messy_string := random_string(2, 5, "$"), by = ID]  


myFunc <- function() {
  ll <- strsplit(dt[,messy_string], split="\\$")


  COLS <- do.call(rbind, 
                  lapply(1:length(ll), 
                         function(i) {
                           data.frame(
                             ID= rep(i, length(ll[[i]])),
                             COL = ll[[i]], 
                             VAL= rep(1, length(ll[[i]]))
                             )
                           }
                         )
                  )

  res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
  dt <- cbind(dt, res)
  for (j in names(dt))
    set(dt,which(is.na(dt[[j]])),j,0)
  return(dt)
}


create_indicators <- function(search_list, searched_string) {  
  y <- rep(0, length(search_list))  
  for(j in 1:length(search_list)) {  
    x <- regexpr(search_list[j], searched_string)  
    x <- x[1]  
    y[j] <- ifelse(x > 0, 1, 0)  
  }  
  return(y)  
}  
OPFunc <- function() {
indicators <- matrix(0, nrow = nrow(dt), ncol = length(elements_list))  
for(n in 1:nrow(dt)) {  
  indicators[n, ] <- dt[n, create_indicators(elements_list, messy_string)]  
}  
indicators <- data.table(indicators)  
setnames(indicators, elements_list)  
dt <- cbind(dt, indicators)
return(dt)
}



library(plyr)
plyrFunc <- function() {
  indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i)
    dt[i,
       data.frame(t(as.matrix(table(strsplit(messy_string,
                                             split = "\\$")))))
       ]))
  dt = cbind(dt, indicators)
  #dt[is.na(dt)] = 0 #THIS DOESN'T WORK. USING FOLLOWING INSTEAD

  for (j in names(dt))
    set(dt,which(is.na(dt[[j]])),j,0)

  return(dt)  
}

BENCHMARK

system.time(res <- myFunc())
## user  system elapsed 
## 1.01    0.00    1.01

system.time(res2 <- OPFunc())
## user  system elapsed 
## 21.58    0.00   21.61

system.time(res3 <- plyrFunc())
## user  system elapsed 
## 1.81    0.00    1.81

VERSION 2 : Suggested by Ricardo

I'm posting this here instead of in my answer as the framework is really @GeekTrader's -Rick_

  myFunc.modified <- function() {
    ll <- strsplit(dt[,messy_string], split="\\$")

    ## MODIFICATIONS: 
    # using `rbindlist` instead of `do.call(rbind.. )`
    COLS <- rbindlist( lapply(1:length(ll), 
                           function(i) {
                             data.frame(
                               ID= rep(i, length(ll[[i]])),
                               COL = ll[[i]], 
                               VAL= rep(1, length(ll[[i]])), 
  # MODICIATION:  Not coercing to factors                             
                               stringsAsFactors = FALSE
                               )
                             }
                           )
                    )

  # MODIFICATION: Preserve as matrix, the output of tapply
    res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )

  # FLATTEN into a data.table
    resdt <- data.table(r=c(res2))

  # FIND & REPLACE NA's of single column
    resdt[is.na(r), r:=0L]

  # cbind with dt, a matrix, with the same attributes as `res2`  
    cbind(dt, 
          matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
  }


 ### Benchmarks: 

orig = quote({dt <- copy(masterDT); myFunc()})
modified = quote({dt <- copy(masterDT); myFunc.modified()})
microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)

#  Unit: milliseconds
#        expr      min        lq   median       uq      max
#  1 Modified  895.025  971.0117 1011.216 1189.599 2476.972
#  2     Orig 1953.638 2009.1838 2106.412 2230.326 2356.802

0 讨论(0)

星月不相逢

2020-12-01 22:32

Here is an approach using rapply and table. I'm sure there would be a slightly faster approach than using table here, but it is still slightly faster than the myfunc.Modified from @ricardo;s answer

# a copy with enough column pointers available
dtr <- alloc.col(copy(dt)  ,1000L)

rapplyFun <- function(){
ll <- strsplit(dtr[, messy_string], '\\$')
Vals <- rapply(ll, classes = 'character', f= table, how = 'replace')
Names <- unique(rapply(Vals, names))

dtr[, (Names) := 0L]
for(ii in seq_along(Vals)){
  for(jj in names(Vals[[ii]])){
    set(dtr, i = ii, j = jj, value =Vals[[ii]][jj])
  }
}
}


microbenchmark(myFunc.modified(), rapplyFun(),times=5)
Unit: milliseconds
#             expr      min       lq   median       uq      max neval
# myFunc.modified() 395.1719 396.8706 399.3218 400.6353 401.1700     5
# rapplyFun()       308.9103 309.5763 309.9368 310.2971 310.3463     5

0 讨论(0)

耶瑟儿～

2020-12-01 22:33

Here's a ~10x faster version using rbind.fill.

library(plyr)
indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i)
                        dt[i,
                           data.frame(t(as.matrix(table(strsplit(messy_string,
                                                                 split = "\\$")))))
                          ]))
dt = cbind(dt, indicators)

# dt[is.na(dt)] = 0
# faster NA replace (thanks geektrader)
for (j in names(dt))
  set(dt, which(is.na(dt[[j]])), j, 0L)

0 讨论(0)

感动是毒

2020-12-01 22:38
Here's another solution, that constructs a sparse matrix object instead of what you have. This shaves off a lot of time AND memory.

It produces ordered results and even with conversion to data.table it's faster than GT3 with 0L and 1L and without reordering (this could be because I use a different method for arriving at the required coordinates - I didn't go through the GT3 algo), however if you don't convert and keep it as a sparse matrix it's about 10-20x faster than GT3 (and has a much smaller memory footprint).
```
library(Matrix)

strings = strsplit(dt$messy_string, split = "$", fixed = TRUE)
element.map = data.table(el = elements_list, n = seq_along(elements_list), key = "el")

tmp = data.table(n = seq_along(strings), each = unlist(lapply(strings, length)))

rows = tmp[, rep(n, each = each), by = n][, V1]
cols = element.map[J(unlist(strings))][,n]

dt.sparse = sparseMatrix(rows, cols, x = 1,
                         dims = c(max(rows), length(elements_list)))

# optional, should be avoided until absolutely necessary
dt = cbind(dt, as.data.table(as.matrix(dt.sparse)))
setnames(dt, c('id', 'messy_string', elements_list))
```
The idea is to split to strings, then use a data.table as a map object to map each substring to its correct column position. From there on it's just a matter of figuring out the rows correctly and filling in the matrix.
0 讨论(0)
发布评论:

提交评论
- 加载中...