Create Combinations in R by Groups

后端 未结 7 1075
一整个雨季
一整个雨季 2021-02-08 19:23

I want to create a list for my classroom of every possible group of 4 students. If I have 20 students, how I can I create this, by group, in R where my rows are each combination

相关标签:
7条回答
  • 2021-02-08 19:39

    Currently, this is implemented in the development version of RcppAlgos and will be in the next official release on CRAN. This is now officially apart of the production version of RcppAlgos*.

    library(RcppAlgos)
    a <- comboGroups(10, numGroups = 2, retType = "3Darray")
    
    dim(a)
    [1] 126   5   2
    
    a[1,,]
         Grp1 Grp2
    [1,]    1    6
    [2,]    2    7
    [3,]    3    8
    [4,]    4    9
    [5,]    5   10
    
    a[126,,]
         Grp1 Grp2
    [1,]    1    2
    [2,]    7    3
    [3,]    8    4
    [4,]    9    5
    [5,]   10    6
    

    Or if you prefer matrices:

    a1 <- comboGroups(10, 2, retType = "matrix")
    
    head(a1)
         Grp1 Grp1 Grp1 Grp1 Grp1 Grp2 Grp2 Grp2 Grp2 Grp2
    [1,]    1    2    3    4    5    6    7    8    9   10
    [2,]    1    2    3    4    6    5    7    8    9   10
    [3,]    1    2    3    4    7    5    6    8    9   10
    [4,]    1    2    3    4    8    5    6    7    9   10
    [5,]    1    2    3    4    9    5    6    7    8   10
    [6,]    1    2    3    4   10    5    6    7    8    9
    

    It is also really fast. You can even generate in parallel with nThreads or Parallel = TRUE (the latter uses one minus the system max threads) for greater efficiency gains:

    comboGroupsCount(16, 4)
    [1] 2627625
    
    system.time(comboGroups(16, 4, "matrix"))
     user  system elapsed 
    0.107   0.030   0.137
    
    system.time(comboGroups(16, 4, "matrix", nThreads = 4))
     user  system elapsed 
    0.124   0.067   0.055
                                    ## 7 threads on my machine
    system.time(comboGroups(16, 4, "matrix", Parallel = TRUE))
     user  system elapsed 
    0.142   0.126   0.047
    

    A really nice feature is the ability to generate samples or specific lexicographical combination groups, especially when the number of results is large.

    comboGroupsCount(factor(state.abb), numGroups = 10)
    Big Integer ('bigz') :
    [1] 13536281554808237495608549953475109376
    
    mySamp <- comboGroupsSample(factor(state.abb), 
                                numGroups = 10, "3Darray", n = 5, seed = 42)
    
    mySamp[1,,]
         Grp1 Grp2 Grp3 Grp4 Grp5 Grp`6 Grp7 Grp8 Grp9 Grp10
    [1,] AL   AK   AR   CA   CO   CT   DE   FL   LA   MD   
    [2,] IA   AZ   ME   ID   GA   OR   IL   IN   MS   NM   
    [3,] KY   ND   MO   MI   HI   PA   MN   KS   MT   OH   
    [4,] TX   RI   SC   NH   NV   WI   NE   MA   NY   TN  
    [5,] VA   VT   UT   OK   NJ   WY   WA   NC   SD   WV   
    50 Levels: AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH ... WY
    
    firstAndLast <- comboGroupsSample(state.abb, 10, "3Darray",
                                      sampleVec = c("1",
                                                    "13536281554808237495608549953475109376"))
    
    firstAndLast[1,,]
         Grp1 Grp2 Grp3 Grp4 Grp5 Grp6 Grp7 Grp8 Grp9 Grp10
    [1,] "AL" "CO" "HI" "KS" "MA" "MT" "NM" "OK" "SD" "VA" 
    [2,] "AK" "CT" "ID" "KY" "MI" "NE" "NY" "OR" "TN" "WA" 
    [3,] "AZ" "DE" "IL" "LA" "MN" "NV" "NC" "PA" "TX" "WV" 
    [4,] "AR" "FL" "IN" "ME" "MS" "NH" "ND" "RI" "UT" "WI" 
    [5,] "CA" "GA" "IA" "MD" "MO" "NJ" "OH" "SC" "VT" "WY"
    
    firstAndLast[2,,]
         Grp1 Grp2 Grp3 Grp4 Grp5 Grp6 Grp7 Grp8 Grp9 Grp10
    [1,] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" 
    [2,] "WA" "TX" "RI" "OH" "NM" "NE" "MN" "ME" "IA" "HI" 
    [3,] "WV" "UT" "SC" "OK" "NY" "NV" "MS" "MD" "KS" "ID" 
    [4,] "WI" "VT" "SD" "OR" "NC" "NH" "MO" "MA" "KY" "IL" 
    [5,] "WY" "VA" "TN" "PA" "ND" "NJ" "MT" "MI" "LA" "IN"
    

    And finally, generating all 2,546,168,625 combinations groups of 20 people into 5 groups (what the OP asked for) can be achieved in under a minute using the lower and upper arguments:

    system.time(aPar <- parallel::mclapply(seq(1, 2546168625, 969969), function(x) {
         combs <- comboGroups(20, 5, "3Darray", lower = x, upper = x + 969968)
         ### do something
         dim(combs)
    }, mc.cores = 6))
       user  system elapsed 
    217.667  22.932  48.482
    
    sum(sapply(aPar, "[", 1))
    [1] 2546168625
    

    Although I started working on this problem over a year ago, this question was a huge inspiration for getting this formalized in a package.

    * I am the author of RcppAlgos

    0 讨论(0)
  • 2021-02-08 19:40

    This code below works.

    # Create list of the 20 records
    list <- c(1:20)
    
    # Generate all combinations including repetitions
    c <- data.frame(expand.grid(rep(list(list), 4))); rm(list)
    c$combo <- paste(c$Var1, c$Var2, c$Var3, c$Var4)
    # Remove repetitions
    c <- subset(c, c$Var1 != c$Var2 & c$Var1 != c$Var3 & c$Var1 != c$Var4 & c$Var2 != c$Var3 & c$Var2 != c$Var4 & c$Var3 != c$Var4)
    
    # Create common group labels (ex. abc, acb, bac, bca, cab, cba would all have "abc" as their group label).
    key <- data.frame(paste(c$Var1, c$Var2, c$Var3, c$Var4))
    key$group  <- apply(key, 1, function(x) paste(sort(unlist(strsplit(x, " "))), collapse = " "))
    c$group <- key$group; rm(key)
    
    # Sort by common group label and id combos by group
    c <- c[order(c$group),]
    c$Var1 <- NULL; c$Var2 <- NULL; c$Var3 <- NULL; c$Var4 <- NULL;
    c$rank <- rep(1:24)
    
    # Pivot
    c <- reshape(data=c,idvar="group", v.names = "combo", timevar = "rank", direction="wide")
    
    0 讨论(0)
  • 2021-02-08 19:42

    This code below gives all unique combinations for 4 selected from 20 without duplicates.

    x <- c(1:20)
    combinations <- data.frame(t(combn(x, 4)))
    
    0 讨论(0)
  • 2021-02-08 19:56

    This is a challenging problem computationally, since I believe there are 2.5 billion possibilities to enumerate. (If it's mistaken, I'd welcome any insight about where this approach goes wrong.)

    Depending on how it's stored, a table with all those groupings might require more RAM than most computers can handle. I'd be impressed to see an efficient way to create that. If we took a "create one combination at a time" approach, it would still take 41 minutes to generate all the possibilities if we could generate 1,000,000 per second, or a month if we could only generate 1,000 per second.

    EDIT - added partial implementation at the bottom to create any desired grouping from #1 to #2,546,168,625. For some purposes, this may be almost as good as actually storing the whole sequence, which is very large.


    Let's say we are going to make 5 groups of four students each: Group A, B, C, D, and E.

    Let's define Group A as the group Student #1 is in. They can be paired with any three of the other 19 students. I believe there are 969 such combinations of other students:

    > nrow(t(combn(1:19, 3)))
    [1] 969
    

    Now there are now 16 students left for other groups. Let's assign the first student not already in Group A into Group B. That might be student 2, 3, 4, or 5. It doesn't matter; all we need to know is that there are only 15 students that can be paired with that student. There are 455 such combinations:

    > nrow(t(combn(1:15, 3)))
    [1] 455
    

    Now there are 12 student left. Again, let's assign the first ungrouped student to Group C, and we have 165 combinations left for them with the other 11 students:

    > nrow(t(combn(1:11, 3)))
    [1] 165
    

    And we have 8 students left, 7 of whom can be paired with first ungrouped student into Group D in 35 ways:

    > nrow(t(combn(1:7, 3)))
    [1] 35
    

    And then, once our other groups are determined, there's only one group of four students left, three of whom can be paired with the first ungrouped student:

    > nrow(t(combn(1:3, 3)))
    [1] 1
    

    That implies 2.546B combinations:

    > 969*455*165*35*1
    [1] 2546168625
    

    Here's a work-in-progress function that produces a grouping based on any arbitrary sequence number.

    1) [in progress] Convert sequence number to a vector describing which # combination should be used for Group A, B, C, D, and E. For instance, this should convert #1 to c(1, 1, 1, 1, 1) and #2,546,168,625 to c(969, 455, 165, 35, 1).

    2) Convert the combinations to a specific output describing the students in each Group.

    groupings <- function(seq_nums) {
      students <- 20
      group_size = 4
      grouped <- NULL
      remaining <- 1:20
      seq_nums_pad <- c(seq_nums, 1) # Last group always uses the only possible combination
      for (g in 1:5) {
        group_relative <- 
          c(1, 1 + t(combn(1:(length(remaining) - 1), group_size - 1))[seq_nums_pad[g], ])
        group <- remaining[group_relative]
        print(group)
        grouped = c(grouped, group)
        remaining <-  setdiff(remaining, grouped)
      }
    }
    
    > groupings(c(1,1,1,1))
    #[1] 1 2 3 4
    #[1] 5 6 7 8
    #[1]  9 10 11 12
    #[1] 13 14 15 16
    #[1] 17 18 19 20
    > groupings(c(1,1,1,2))
    #[1] 1 2 3 4
    #[1] 5 6 7 8
    #[1]  9 10 11 12
    #[1] 13 14 15 17
    #[1] 16 18 19 20
    > groupings(c(969, 455, 165, 35))   # This one uses the last possibility for
    #[1]  1 18 19 20                    #   each grouping.
    #[1]  2 15 16 17
    #[1]  3 12 13 14
    #[1]  4  9 10 11
    #[1] 5 6 7 8
    
    0 讨论(0)
  • 2021-02-08 20:00

    This relies heavily on this answer:

    Algorithm that can create all combinations and all groups of those combinations

    One thing to note is that the answer is not that dynamic - it only included a solution for groups of 3. To make it more robust, we can create the code based on the input parameters. That is, the following recursive function is created on the fly for groups 3:

    group <- function(input, step){
     len <- length(input) 
     combination[1, step] <<- input[1] 
    
     for (i1 in 2:(len-1)) { 
       combination[2, step] <<- input[i1] 
    
       for (i2 in (i1+1):(len-0)) { 
         combination[3, step] <<- input[i2] 
    
         if (step == m) { 
           print(z); result[z, ,] <<- combination 
           z <<- z+1 
         } else { 
           rest <- setdiff(input, input[c(i1,i2, 1)]) 
           group(rest, step +1) #recursive if there are still additional possibilities
       }} 
     } 
    }
    

    This takes around 55 seconds to run for N = 16 and k = 4. I'd like to translate it into Rcpp but unfortunately I do not have that skillset.

    group_N <- function(input, k = 2) {
      N = length(input)
      m = N/k
      combos <- factorial(N) / (factorial(k)^m * factorial(m))
    
      result <- array(NA_integer_, dim = c(combos, m, k))
      combination = matrix(NA_integer_, nrow = k, ncol = m)
    
      z = 1
    
      group_f_start = 'group <- function(input, step){\n len <- length(input) \n combination[1,  step] <<- input[1] \n '
      i_s <- paste0('i', seq_len(k-1))
    
      group_f_fors = paste0('for (', i_s, ' in ', c('2', if (length(i_s) != 1) {paste0('(', i_s[-length(i_s)], '+1)')}), ':(len-', rev(seq_len(k)[-k])-1, ')) { \n combination[', seq_len(k)[-1], ', step] <<- input[', i_s, '] \n', collapse = '\n ')
    
      group_f_inner = paste0('if (step == m) { \n result[z, ,] <<- combination \n z <<- z+1 \n } else { \n rest <- setdiff(input, input[c(',
                             paste0(i_s, collapse = ','),
                             ', 1)]) \n group(rest, step +1) \n }')
    
      eval(parse(text = paste0(group_f_start, group_f_fors, group_f_inner, paste0(rep('}', times = k), collapse = ' \n '))))
    
      group(input, 1)
      return(result)
    }
    

    Performance

    system.time({test_1 <- group_N(seq_len(4), 2)})
    #   user  system elapsed 
    #   0.01    0.00    0.02
    library(data.table)
    
    #this funky step is just to better show the groups. the provided
    ## array is fine.
    
    as.data.table(t(rbindlist(as.data.table(apply(test_1, c(1,3), list)))))
    #    V1  V2
    #1: 1,2 3,4
    #2: 1,3 2,4
    #3: 1,4 2,3
    
    system.time({test_1 <- group_N(seq_len(16), 4)})
    #   user  system elapsed 
    #  55.00    0.19   55.29 
    
    as.data.table(t(rbindlist(as.data.table(apply(test_1, c(1,3), list)))))
    #very slow
    #                  V1          V2          V3          V4
    #      1:     1,2,3,4     5,6,7,8  9,10,11,12 13,14,15,16
    #      2:     1,2,3,4     5,6,7,8  9,10,11,13 12,14,15,16
    #      3:     1,2,3,4     5,6,7,8  9,10,11,14 12,13,15,16
    #      4:     1,2,3,4     5,6,7,8  9,10,11,15 12,13,14,16
    #      5:     1,2,3,4     5,6,7,8  9,10,11,16 12,13,14,15
    #     ---                                                
    #2627621:  1,14,15,16  2,11,12,13  3, 6, 9,10     4,5,7,8
    #2627622:  1,14,15,16  2,11,12,13     3,7,8,9  4, 5, 6,10
    #2627623:  1,14,15,16  2,11,12,13  3, 7, 8,10     4,5,6,9
    #2627624:  1,14,15,16  2,11,12,13  3, 7, 9,10     4,5,6,8
    #2627625:  1,14,15,16  2,11,12,13  3, 8, 9,10     4,5,6,7
    
    0 讨论(0)
  • 2021-02-08 20:00

    So you could get all the combinations with the expand.grid function just adding the vector of data four times. Then the result will have combinations like c(1,1,1,1) so i remove each row that have any duplicated value and the last part is just making the combinations. It is 2 loops and it is quite slow but it will get what you want. It could be speed up with the Rcpp package. The code is:

    ids = 1:20
    d2 = expand.grid(ids,ids,ids,ids)
    ## Remove rows with duplicated values
    pos_use = apply(apply(d2,1,duplicated),2,function(x) all(x == F))
    d2_temp = t(apply(d2[pos_use,],1,sort))
    list_temp = list()
    pos_quitar = NULL
    for(i in 1:nrow(d2_temp)){
      pos_quitar = c(pos_quitar,i)
      ini_comb = d2_temp[i,]
      d2_temp_use  = d2_temp[-pos_quitar,]
      temp_comb = ini_comb
      for(j in 2:5){
        pos_quitar_new = which(apply(d2_temp_use,1,function(x) !any(temp_comb%in%x)))[1]
        temp_comb = c(temp_comb,d2_temp_use[pos_quitar_new,])
      }
      pos_quitar = c(pos_quitar,pos_quitar_new)
      list_temp[[i]] = temp_comb
    }
    
    list_temp
    
    0 讨论(0)
提交回复
热议问题