Apply a function to dataframe subsetted by all possible combinations of categorical variables

后端 未结 4 1050
野的像风
野的像风 2021-01-20 03:30

An example dataframe with categorical variables catA, catB, and catC. Obs is some observed value.

catA <- rep(factor(c(\"a\",\"b\",\"c\")), length.out=10         


        
相关标签:
4条回答
  • 2021-01-20 03:56

    Using only vectorized functions and base R

    # Find all possible subsets of your data
    combVars <- c("catA", "catB", "catC")
    subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
    subsets <- do.call(c, subsets)
    # Calculate means by each subset
    meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
    # Pull them all into one dataframe
    Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)
    
    0 讨论(0)
  • This isn't the cleanest solution, but I think it gets close to what you want.

    getAllSubs <- function(df, lookup, fun) {
    
      out <- lapply(1:nrow(lookup), function(i) {
    
        df_new <- df
    
        if(length(na.omit(unlist(lookup[i,]))) > 0) {
    
          for(j in colnames(lookup)[which(!is.na(unlist(lookup[i,])))]) {
            df_new <- df_new[df_new[,j] == lookup[i,j],]
          }  
        } 
        fun(df_new)  
      })
    
      if(mean(sapply(out, length) ==1) == 1) {
        out <- unlist(out)
      } else {
        out <- do.call("rbind", out)
      }
    
      final <- cbind(lookup, out)
      final[is.na(final)] <- NA
      final
    }
    

    As it is currently written you have to construct the lookup table beforehand, but you could just as easily move that construction into the function itself. I added a few lines at the end to make sure it could accomodate outputs of different lengths and so NaNs were turned into NAs, just because that seemed to create a cleaner output. As it is currently written, it applies the function to the entire original data frame in cases where all columns are NA.

    dat_out <- getAllSubs(dat, allsubs, function(x) mean(x$obs, na.rm = TRUE))
    
    head(dat_out,20)
    
       catA catB catC      out
    1  <NA> <NA> <NA> 47.25446
    2     a <NA> <NA> 51.54226
    3     b <NA> <NA> 46.45352
    4     c <NA> <NA> 43.63767
    5  <NA>    1 <NA> 47.23872
    6     a    1 <NA> 66.59281
    7     b    1 <NA> 32.03513
    8     c    1 <NA> 40.66896
    9  <NA>    2 <NA> 45.16588
    10    a    2 <NA> 50.59323
    11    b    2 <NA> 51.02013
    12    c    2 <NA> 33.15251
    13 <NA>    3 <NA> 51.67809
    14    a    3 <NA> 48.13645
    15    b    3 <NA> 57.92084
    16    c    3 <NA> 49.27710
    17 <NA>    4 <NA> 44.93515
    18    a    4 <NA> 40.36266
    19    b    4 <NA> 44.26717
    20    c    4 <NA> 50.74718
    
    0 讨论(0)
  • 2021-01-20 03:59
    ans <- with(dat, tapply(obs, list(catA, catB, catC), mean))
    ans <- data.frame(expand.grid(dimnames(ans)), results=c(ans))
    names(ans)[1:3] <- names(dat)[1:3]
    
    str(ans)
    # 'data.frame':  36 obs. of  4 variables:
    #  $ catA   : Factor w/ 3 levels "a","b","c": 1 2 3 1 2 3 1 2 3 1 ...
    #  $ catB   : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 2 2 3 3 3 4 ...
    #  $ catC   : Factor w/ 3 levels "d","e","f": 1 1 1 1 1 1 1 1 1 1 ...
    #  $ results: num  69.7 NA NA 55.3 NA ...
    
    0 讨论(0)
  • 2021-01-20 04:12

    An alternative approach, one function to get all combinations of variables and another to apply a function over all subsets. The combinations function was stolen from another post...

    ## return all combinations of vector up to maximum length n
    multicombn <- function(dat, n) {
        unlist(lapply(1:n, function(x) combn(dat, x, simplify=F)), recursive=F)
    }
    

    For allsubs, vars is of form c("catA","catB","catC"), out.name = "mean". func needs to be written in form that ddply would take,

    func=function(x) mean(x$obs, na.rm=TRUE)
    
    library(plyr)
    allsubs <- function(indat, vars, func=NULL, out.name=NULL) {
        results <- data.frame()
        nvars <- rev(multicombn(vars,length(vars)))
        for(i in 1:length(nvars)) {
            results <-
                rbind.fill(results, ddply(indat, unlist(nvars[i]), func))
        }
        if(!missing(out.name)) names(results)[length(vars)+1] <- out.name
        results
    }
    

    One difference between this answer and shwaund's, this does not return rows for empty subsets, so no NAs in results column.

    allsubs(dat, c("catA","catB","catc"), func, out.name="mean")
    > head(allsubs(dat, vars, func, out.name = "mean"),20)
       catA catB catC     mean
    1     a    1    d 56.65909
    2     a    2    d 54.98116
    3     a    3    d 37.52655
    4     a    4    d 58.29034
    5     b    1    e 52.88945
    6     b    2    e 50.43122
    7     b    3    e 52.57115
    8     b    4    e 59.45348
    9     c    1    f 52.41637
    10    c    2    f 34.58122
    11    c    3    f 46.80256
    12    c    4    f 51.58668
    13 <NA>    1    d 56.65909
    14 <NA>    1    e 52.88945
    15 <NA>    1    f 52.41637
    16 <NA>    2    d 54.98116
    17 <NA>    2    e 50.43122
    18 <NA>    2    f 34.58122
    19 <NA>    3    d 37.52655
    20 <NA>    3    e 52.57115
    
    0 讨论(0)
提交回复
热议问题