Search multiple columns for string to set indicator variable

前端 未结 3 1827
一整个雨季
一整个雨季 2021-01-23 13:57

I am using R and RStudio for the first time to work with a very large dataset (15 million cases) with many columns of data. To facilitate analysis, I need to search a range of

相关标签:
3条回答
  • 2021-01-23 14:25

    We can use mtabulate from qdapTools

    library(qdapTools)
    res <- cbind(df1, mtabulate(as.data.frame(t(df1)))!=0)
    row.names(res) <- NULL
    names(res)[-(1:3)] <- paste0("Var", names(res)[-(1:3)])
    res
    #  Dx1 Dx2 Dx3 Var001 Var234 Var456 Var231 Var444 Var245 Var777
    #1 001 234 456   TRUE   TRUE   TRUE  FALSE  FALSE  FALSE  FALSE
    #2 231 001 444   TRUE  FALSE  FALSE   TRUE   TRUE  FALSE  FALSE
    #3 245 777 001   TRUE  FALSE  FALSE  FALSE  FALSE   TRUE   TRUE
    

    data

    df1 <- structure(list(Dx1 = c("001", "231", "245"), Dx2 = c("234", "001", 
    "777"), Dx3 = c("456", "444", "001")), .Names = c("Dx1", "Dx2", 
    "Dx3"), row.names = c(NA, -3L), class = "data.frame")
    
    0 讨论(0)
  • 2021-01-23 14:26

    In base R, we can implement a reusable function, in only a few lines of code, but requires some knowledge about which functions to use and how.

    I'll call the function bag, as in bag-of-words.

    bag <- function(..., prefix=".", levels=NULL, `NA`=NULL) {
    
      # Go from multiple columns to list of vectors
      bags <- mapply(c, ..., SIMPLIFY = FALSE, USE.NAMES = FALSE)
    
      # Find unique levels
      if(is.null(levels)) {
        levels <- sort(Reduce(union, bags))
    
        # names persist through outer
        names(levels) <- paste0(prefix, levels)
      }
    
      # Calculate out[level,bag] = level %in% bag 
      out <- outer(levels, bags, Vectorize(`%in%`))
    
      # Output a data structure
      structure(+t(out), class='bag', levels=levels)
    }
    

    Which would let us do:

    with(df1, bag(Dx1, Dx2, Dx3, prefix="Var"))
    #>      Var001 Var231 Var234 Var245 Var444 Var456 Var777
    #> [1,]      1      0      1      0      0      1      0
    #> [2,]      1      1      0      0      1      0      0
    #> [3,]      1      0      0      1      0      0      1
    #> attr(,"class")
    #> [1] "bag"
    #> attr(,"levels")
    #> Var001 Var231 Var234 Var245 Var444 Var456 Var777 
    #>  "001"  "231"  "234"  "245"  "444"  "456"  "777"
    

    This is probably not very performant, but it works. I've changed the output format from logical to numeric and included some metadata to make it easier to use in a model. We can add a function to enable modeling with bag directly:

    #' @export
    makepredictcall.bag <- function(var, call){
      # Stolen from splines package
      if (as.character(call)[1L] != "bag")
        return(call)
      args <- c("prefix", "levels")
    
    
      at <- attributes(var)[args]
      xxx <- call
      xxx[args] <- NULL
      xxx[names(at)] <- at
      xxx
    }
    

    Now, you can use it directly in a model formula. This has the advantage that the dummy coding is now incorporated in to the model and you won't need to preprocess when predicting on new data sets. Example:

    df2 <- as.data.frame(lapply(df1, sample, 20, TRUE), stringsAsFactors = FALSE)
    df3 <- as.data.frame(lapply(df1, sample, 20, TRUE), stringsAsFactors = FALSE)
    
    Y <- 1:nrow(df2)
    m <- lm(Y~bag(Dx1, Dx2, Dx3), df2)
    summary(m)
    #> 
    #> Call:
    #> lm(formula = Y ~ bag(Dx1, Dx2, Dx3), data = df2)
    #> 
    #> Residuals:
    #>     Min      1Q  Median      3Q     Max 
    #> -8.1110 -3.6765  0.1948  3.1899  8.7961 
    #> 
    #> Coefficients:
    #>                        Estimate Std. Error t value Pr(>|t|)
    #> (Intercept)             16.6709    10.3948   1.604    0.135
    #> bag(Dx1, Dx2, Dx3).001  -3.7385     5.6141  -0.666    0.518
    #> bag(Dx1, Dx2, Dx3).231  -3.7286     4.1728  -0.894    0.389
    #> bag(Dx1, Dx2, Dx3).234   3.1786     4.6528   0.683    0.507
    #> bag(Dx1, Dx2, Dx3).245  -7.2493     4.4900  -1.615    0.132
    #> bag(Dx1, Dx2, Dx3).444  -2.2936     4.3033  -0.533    0.604
    #> bag(Dx1, Dx2, Dx3).456   2.9979     4.3826   0.684    0.507
    #> bag(Dx1, Dx2, Dx3).777  -0.8608     4.5353  -0.190    0.853
    #> 
    #> Residual standard error: 5.971 on 12 degrees of freedom
    #> Multiple R-squared:  0.3566, Adjusted R-squared:  -0.01874 
    #> F-statistic: 0.9501 on 7 and 12 DF,  p-value: 0.5056
    predict(m, df3)
    #>         1         2         3         4         5         6         7 
    #>  8.681003 16.111016  4.822329 15.079445 19.108899 10.306611 13.817465 
    #>         8         9        10        11        12        13        14 
    #> 16.111016  9.788011 12.382454  9.778103  3.389569 12.382454  9.203882 
    #>        15        16        17        18        19        20 
    #> 13.817465  9.788011 12.071654  6.267249 13.827373 15.069537
    

    Created on 2019-08-06 by the reprex package (v0.3.0)

    EDIT:

    And some benchmarks for comparison

    microbenchmark::microbenchmark(mtab = mtabulate(as.data.frame(t(df1)))!=0,
                                   lapply = lapply(as.character(unique(melt(df1, id.vars = NULL)$value)), 
                                                   function(x) rowSums(df1==x) > 0),
                                   bag = do.call(bag, df1))
    #> Unit: microseconds
    #>    expr     min      lq     mean   median       uq      max neval
    #>    mtab 439.320 452.107 519.9429 462.9035 511.8710 1960.582   100
    #>  lapply 276.914 295.976 337.6020 300.7870 315.0135 2268.210   100
    #>     bag 121.996 130.305 146.6677 139.6990 145.3275  294.711   100
    
    0 讨论(0)
  • 2021-01-23 14:42

    Another idea using base R with lapply:

    uniq_dxs <- as.character(unique(melt(df1, id.vars = NULL)$value))
    df1[, paste0("var", uniq_dxs)] <- lapply(uniq_dxs, function(x) rowSums(df1==x) > 0)
    
    df1
    #  Dx1 Dx2 Dx3 var001 var231 var245 var234 var777 var456 var444
    #1 001 234 456   TRUE  FALSE  FALSE   TRUE  FALSE   TRUE  FALSE
    #2 231 001 444   TRUE   TRUE  FALSE  FALSE  FALSE  FALSE   TRUE
    #3 245 777 001   TRUE  FALSE   TRUE  FALSE   TRUE  FALSE  FALSE
    

    Benchmark on my machine since I was curious. Just wanted to compare the mtabulate to the lapply. Not including the <-:

    microbenchmark::microbenchmark(mtab = mtabulate(as.data.frame(t(df1)))!=0,
                                   lapply = lapply(uniq_dxs, function(x) rowSums(df1==x) > 0))
    Unit: microseconds
       expr      min        lq      mean   median       uq      max neval
       mtab 1039.317 1088.9120 1182.3375 1109.334 1145.255 5931.031   100
     lapply  742.838  795.7155  823.7991  813.220  843.488 1034.211   100
    
    0 讨论(0)
提交回复
热议问题