Expand data.frame by creating duplicates based on group condition (3)

前端 未结 2 539
说谎
说谎 2021-01-25 02:04

Starting from this SO question.

Example data.frame:

df = read.table(text = \'ID  Day Count   Count_group
            18  1933    6   15
             


        
2条回答
  •  一生所求
    2021-01-25 02:22

    I attach a rather mechanical method, but I believe it is a good starting point. I have noticed that in your original table the entry

    ID Day Count Count_group 18 1933 6 14

    is duplicated; I have left it untouched for the sake of clarity.

    Structure of the approach:

    1. Read original data
    2. Generate list of data frames, for each Day
    3. Generate final data frame, collapsing the list in 2.

    1. Read original data

    We start with

    # Example data: one row per ID and Day; "Count" and "Count_group" are
    # repeated on every row of the same Day.
    df <- read.table(header = TRUE, text = 'ID  Day Count   Count_group
                    18  1933    6   14
                    33  1933    6   14
                    37  1933    6   14
                    18  1933    6   14
                    16  1933    6   14
                    11  1933    6   14
                    111 1932    5   9
                    34  1932    5   9
                    60  1932    5   9
                    88  1932    5   9
                    18  1932    5   9
                    33  1931    3   4
                    13  1931    3   4
                    56  1931    3   4
                    23  1930    1   1
                    6   1800    6   12
                    37  1800    6   12
                    98  1800    6   12
                    52  1800    6   12
                    18  1800    6   12
                    76  1800    6   12
                    55  1799    4   6
                    6   1799    4   6
                    52  1799    4   6
                    133 1799    4   6
                    112 1798    2   2
                    677 1798    2   2
                    778 888     4   7
                    111 888     4   7
                    88  888     4   7
                    10  888     4   7
                    37  887     2   4
                    26  887     2   4
                    8   886     1   2
                    56  885     1   1')

    # distinct "Day" values, smallest first
    ord_day <- sort(unique(df$Day))
    ord_day
     [1]  885  886  887  888 1798 1799 1800 1930 1931 1932 1933
    

    2. Generate list of data frames, for each Day

    For each element in ord_day we introduce a data.frame as element of a list called df_new_aug. Such data frames are defined through a for loop for all values in ord_day except ord_day[2] and ord_day[1] which are treated separately.

    Idea behind the loop: for each unique ord_day[i] with i > 2 we check whether the preceding days ord_day[i-1] and ord_day[i-2] (only the first, both, or neither) contribute (through the variable "Count") to the value "Count_group" at ord_day[i].

    We therefore introduce if else statements in the loop. Here we go

    # Iterative generation of the list of data.frames (ord_day[3] onwards)
    #-----------------------------------------------------------------
    # df_new[[i]]     : columns ID | Day | Count for every day whose "Count"
    #                   contributes to "Count_group" at ord_day[i]
    # df_new_aug[[i]] : df_new[[i]] plus the "Count_group" of ord_day[i]
    df_new <- list()
    df_new_aug <- list()

    # The single value that column `col` takes on all rows of `day`.
    day_value <- function(day, col) {
      value <- unique(df[df$Day == day, col])
      # the comparisons below need a scalar, so fail early if a day is inconsistent
      stopifnot(length(value) == 1L)
      value
    }

    # we exclude cases i = 1, 2: they are manually treated below.
    # seq_along(ord_day)[-(1:2)] is empty when there are fewer than three days,
    # whereas 3:length(ord_day) would count backwards (3, 2, ...).
    for (i in seq_along(ord_day)[-(1:2)]) {

      group_total <- day_value(ord_day[i], "Count_group")
      count_now <- day_value(ord_day[i], "Count")
      count_prev1 <- day_value(ord_day[i - 1], "Count")
      count_prev2 <- day_value(ord_day[i - 2], "Count")

      # which days contribute (through "Count") to "Count_group" at ord_day[i]?
      if (group_total == count_now + count_prev1 + count_prev2) {
        # "Count" at i, i-1 and i-2
        days_in_group <- ord_day[c(i, i - 1, i - 2)]
      } else if (group_total == count_now + count_prev1) {
        # only "Count" at i and i-1
        days_in_group <- ord_day[c(i, i - 1)]
      } else {
        # only "Count" at i
        days_in_group <- ord_day[i]
      }

      # we create columns ID | Day | Count (rows keep their order in df)
      df_new[[i]] <- data.frame(df[df$Day %in% days_in_group,
                                   c("ID", "Day", "Count")])

      # we append the Count_group of the Day in ord_day[i]; this is done once,
      # after the branch, instead of being repeated inside every branch
      df_new_aug[[i]] <- data.frame(df_new[[i]],
                                    Count_group = rep(group_total, nrow(df_new[[i]])))
    }
    
    
    # The two earliest days are handled by hand (the loop starts at i = 3)
    #-------------------------------------------------------------------------------------
    on_day_1 <- df$Day == ord_day[1]
    on_day_2 <- df$Day == ord_day[2]
    base_cols <- c("ID", "Day", "Count")

    # ord_day[1] = "885": only "Count" at i = 1 contributes to "Count_group" at i = 1
    df_new[[1]] <- data.frame(df[on_day_1, base_cols])
    df_new_aug[[1]] <- data.frame(
      df_new[[1]],
      Count_group = rep(unique(df[on_day_1, "Count_group"]), times = nrow(df_new[[1]]))
    )

    # ord_day[2] = "886": "Count" at i = 2 and at i = 1 both contribute to
    # "Count_group" at i = 2, so rows of both days are kept
    df_new[[2]] <- data.frame(df[on_day_2 | on_day_1, base_cols])
    df_new_aug[[2]] <- data.frame(
      df_new[[2]],
      Count_group = rep(unique(df[on_day_2, "Count_group"]), times = nrow(df_new[[2]]))
    )


    # display the completed list
    df_new_aug
    

    3. Generate final data frame, collapsing the list in 2.

    We collapse df_new_aug through an ugly loop, but other solutions (for example, with Reduce() and merge()) are possible:

    # merging the list: final result
    # Row-bind all elements of df_new_aug in a single call. This gives the same
    # rows, in the same order, as appending the elements one by one in a loop,
    # but it no longer hard-codes the number of list elements (previously fixed
    # at 11) and it avoids copying a growing data frame on every iteration.
    df_result <- do.call(rbind, df_new_aug)
    

    This yields df_result, which completes the analysis.

提交回复
热议问题