Fastest way to replace NAs in a large data.table

后端 未结 10 974
走了就别回头了
走了就别回头了 2020-11-22 17:10

I have a large data.table, with many missing values scattered throughout its ~200k rows and 200 columns. I would like to recode those NA values to zeros as efficiently as possible.

相关标签:
10条回答
  • 2020-11-22 17:22
    > DT = data.table(a=LETTERS[c(1,1:3,4:7)],b=sample(c(15,51,NA,12,21),8,TRUE),key="a")
    > DT
       a  b
    1: A 12
    2: A NA
    3: B 15
    4: C NA
    5: D 51
    6: E NA
    7: F 15
    8: G 51
    > DT[is.na(b),b:=0]
    > DT
       a  b
    1: A 12
    2: A  0
    3: B 15
    4: C  0
    5: D 51
    6: E  0
    7: F 15
    8: G 51
    > 
    
    0 讨论(0)
  • 2020-11-22 17:23

    To generalize to many columns you could use this approach (using previous sample data but adding a column):

    # Two columns, each a random mix of NA and 1 (2e7 rows)
    z <- data.table(
      x = sample(c(NA_integer_, 1), 2e7, TRUE),
      y = sample(c(NA_integer_, 1), 2e7, TRUE)
    )
    
    # Overwrite every column with a copy whose NAs are replaced by 0
    z[, names(z) := lapply(.SD, function(col) fifelse(is.na(col), 0, col))]
    

    Didn't test for the speed though

    0 讨论(0)
  • 2020-11-22 17:25

    Using the fifelse function from the newest data.table version (1.12.6), this is even 10 times faster than NAToUnknown in the gdata package:

    z = data.table(x = sample(c(NA_integer_, 1), 2e7, TRUE))
    system.time(z[,x1 := gdata::NAToUnknown(x, 0)])
    
    #   user  system elapsed 
    #  0.798   0.323   1.173 
    
    system.time(z[,x2:= fifelse(is.na(x), 0, x)])
    
    #   user  system elapsed 
    #  0.172   0.093   0.113 
    
    0 讨论(0)
  • 2020-11-22 17:29

    Dedicated functions (nafill and setnafill) for that purpose are available in data.table package (version >= 1.12.4):

    They process columns in parallel, so they address the previously posted benchmarks well. Below are their timings versus the fastest approach so far, also at a scaled-up size, on a 40-core machine.

    library(data.table)
    # Build an nrow x ncol data.table of uniform random numbers in which a
    # randomly chosen share (propNA) of the cells has been set to NA.
    create_dt <- function(nrow = 5, ncol = 5, propNA = 0.5) {
      n_cells <- nrow * ncol
      values <- runif(n_cells)
      # draw the positions to blank out (after runif, so RNG order is unchanged)
      na_idx <- sample(seq_len(n_cells), propNA * nrow * ncol)
      values[na_idx] <- NA
      data.table(matrix(values, ncol = ncol))
    }
    # Replace every NA in DT with 0, one column at a time, using set().
    # Columns are addressed by position; DT itself is updated (callers below
    # do not use a return value).
    f_dowle3 = function(DT) {
      for (j in seq_len(ncol(DT)))
        # which() gives the row numbers of the NAs in column j
        set(DT,which(is.na(DT[[j]])),j,0)
    }
    
    set.seed(1)
    dt1 = create_dt(2e5, 200, 0.1)
    dim(dt1)
    #[1] 200000    200
    dt2 = copy(dt1)
    system.time(f_dowle3(dt1))
    #   user  system elapsed 
    #  0.193   0.062   0.254 
    system.time(setnafill(dt2, fill=0))
    #   user  system elapsed 
    #  0.633   0.000   0.020   ## setDTthreads(1) elapsed: 0.149
    all.equal(dt1, dt2)
    #[1] TRUE
    
    set.seed(1)
    dt1 = create_dt(2e7, 200, 0.1)
    dim(dt1)
    #[1] 20000000    200
    dt2 = copy(dt1)
    system.time(f_dowle3(dt1))
    #   user  system elapsed 
    # 22.997  18.179  41.496
    system.time(setnafill(dt2, fill=0))
    #   user  system elapsed 
    # 39.604  36.805   3.798 
    all.equal(dt1, dt2)
    #[1] TRUE
    
    0 讨论(0)
  • 2020-11-22 17:31

    Here's a solution using data.table's := operator, building on Andrie and Ramnath's answers.

    # library() stops with an error if a package is missing, whereas require()
    # only returns FALSE and lets the script fail later.
    library(data.table)  # v1.6.6
    library(gdata)       # v2.8.2
    
    set.seed(1)
    dt1 = create_dt(2e5, 200, 0.1)
    dim(dt1)
    [1] 200000    200    # more columns than Ramnath's answer which had 5 not 200
    
    # Andrie's approach: delegates to remove_na(), which is defined outside this snippet
    f_andrie = function(dt) remove_na(dt)
    
    # gdata approach: passes dt and `un` (default 0) to gdata::NAToUnknown
    f_gdata = function(dt, un = 0) gdata::NAToUnknown(dt, un)
    
    # Replace NAs with 0 in every column of dt, issuing one `:=` call per column.
    f_dowle = function(dt) {     # see EDIT later for more elegant solution
      # helper: return v with its NA entries replaced by value
      na.replace = function(v,value=0) { v[is.na(v)] = value; v }
      for (i in names(dt))
        # builds the text "dt[, <col> :=na.replace( <col> )]" and evaluates it;
        # NOTE(review): names are pasted in unquoted, so non-syntactic column
        # names would presumably fail to parse
        eval(parse(text=paste("dt[,",i,":=na.replace(",i,")]")))
    }
    
    # `<-` (not `=`) so that the assignment is the timed expression; with `=`
    # R treats a_gdata as an argument name of system.time() and errors.
    system.time(a_gdata <- f_gdata(dt1))
       user  system elapsed 
     18.805  12.301 134.985 
    
    # `<-` (not `=`) so that the assignment is the timed expression; with `=`
    # R treats a_andrie as an argument name of system.time() and errors.
    system.time(a_andrie <- f_andrie(dt1))
    Error: cannot allocate vector of size 305.2 Mb
    Timing stopped at: 14.541 7.764 68.285 
    
    system.time(f_dowle(dt1))
      user  system elapsed 
     7.452   4.144  19.590     # EDIT has faster than this
    
    identical(a_gdata, dt1)   
    [1] TRUE
    

    Note that f_dowle updated dt1 by reference. If a local copy is required then an explicit call to the copy function is needed to make a local copy of the whole dataset. data.table's setkey, key<- and := do not copy-on-write.

    Next, let's see where f_dowle is spending its time.

    Rprof()
    f_dowle(dt1)
    Rprof(NULL)
    summaryRprof()
    $by.self
                      self.time self.pct total.time total.pct
    "na.replace"           5.10    49.71       6.62     64.52
    "[.data.table"         2.48    24.17       9.86     96.10
    "is.na"                1.52    14.81       1.52     14.81
    "gc"                   0.22     2.14       0.22      2.14
    "unique"               0.14     1.36       0.16      1.56
    ... snip ...
    

    There, I would focus on na.replace and is.na, where there are a few vector copies and vector scans. Those can fairly easily be eliminated by writing a small na.replace C function that updates NA by reference in the vector. That would at least halve the 20 seconds I think. Does such a function exist in any R package?

    The reason f_andrie fails may be because it copies the whole of dt1, or creates a logical matrix as big as the whole of dt1, a few times. The other 2 methods work on one column at a time (although I only briefly looked at NAToUnknown).

    EDIT (more elegant solution as requested by Ramnath in comments) :

    # Replace NAs with 0 in every column of DT, using one `:=` call per column.
    f_dowle2 = function(DT) {
      for (i in names(DT))
        # get(i) looks the column up by its name; (i) makes := assign to the
        # column named by i rather than to a column literally called "i"
        DT[is.na(get(i)), (i):=0]
    }
    
    system.time(f_dowle2(dt1))
      user  system elapsed 
     6.468   0.760   7.250   # faster, too
    
    identical(a_gdata, dt1)   
    [1] TRUE
    

    I wish I did it that way to start with!

    EDIT2 (over 1 year later, now)

    There is also set(). This can be faster if there are a lot of columns being looped through, as it avoids the (small) overhead of calling [,:=,] in a loop. set is a loopable :=. See ?set.

    # Replace every NA in DT with 0 using set(), one column at a time.
    # DT is updated in place; the function is called for that side effect.
    f_dowle3 = function(DT) {
      # Only ONE of the two loops is needed; running both would scan every
      # column twice for no gain.
    
      # by number (slightly faster than by name) :
      for (j in seq_len(ncol(DT)))
        set(DT,which(is.na(DT[[j]])),j,0)
    
      # or, equivalently, by name :
      #   for (j in names(DT))
      #     set(DT,which(is.na(DT[[j]])),j,0)
    }
    
    0 讨论(0)
  • 2020-11-22 17:31

    Here is a solution using NAToUnknown in the gdata package. I have used Andrie's solution to create a huge data table and also included time comparisons with Andrie's solution.

    # CREATE DATA TABLE
    dt1 = create_dt(2e5, 200, 0.1)
    
    # FUNCTIONS TO SET NA TO ZERO
    # f_gdata: passes dt and `un` (default 0) to gdata::NAToUnknown
    f_gdata  = function(dt, un = 0) gdata::NAToUnknown(dt, un)
    # f_Andrie: wraps remove_na(), which is not defined in this snippet
    f_Andrie = function(dt) remove_na(dt)
    
    # COMPARE SOLUTIONS AND TIMES
    system.time(a_gdata  <- f_gdata(dt1))
    
    user  system elapsed 
    4.224   2.962   7.388 
    
    system.time(a_andrie <- f_Andrie(dt1))
    
     user  system elapsed 
    4.635   4.730  20.060 
    
    # compare the two results assigned above (a_gdata and a_andrie)
    identical(a_gdata, a_andrie)
    
    TRUE
    
    0 讨论(0)
提交回复
热议问题