Expand data.frame by creating duplicates based on group condition (3)

前端未结

关注

 2  539

说谎 2021-01-25 02:04

Starting from this SO question.

Example data.frame:

df = read.table(text = \'ID  Day Count   Count_group
            18  1933    6   15


      
      
        
          2条回答        

        
                    
            
            
                         
                
              
              
                
                   一生所求
                                             
                
                
                (楼主)
            
              
              
                2021-01-25 02:22
              

            
            
                        
I attach a rather mechanical method, but I believe it is a good starting point.
I have noticed that in your original table the entry


ID  Day   Count Count_group
18  1933     6     14


is duplicated; I have left it untouched for sake of clarity.

Structure of the approach:


1. Read original data
2. Generate list of data frames, one for each Day
3. Generate final data frame, collapsing the list from step 2.


1. Read original data

We start with

# Example data with the columns ID, Day, Count and Count_group
# (the first line of the text is the header).
df <- read.table(header = TRUE, text = 'ID  Day Count   Count_group
                18  1933    6   14
                33  1933    6   14
                37  1933    6   14
                18  1933    6   14
                16  1933    6   14
                11  1933    6   14
                111 1932    5   9
                34  1932    5   9
                60  1932    5   9
                88  1932    5   9
                18  1932    5   9
                33  1931    3   4
                13  1931    3   4
                56  1931    3   4
                23  1930    1   1
                6   1800    6   12
                37  1800    6   12
                98  1800    6   12
                52  1800    6   12
                18  1800    6   12
                76  1800    6   12
                55  1799    4   6
                6   1799    4   6
                52  1799    4   6
                133 1799    4   6
                112 1798    2   2
                677 1798    2   2
                778 888     4   7
                111 888     4   7
                88  888     4   7
                10  888     4   7
                37  887     2   4
                26  887     2   4
                8   886     1   2
                56  885     1   1')

# distinct values of "Day", in increasing order
ord_day <- sort(unique(df$Day), na.last = TRUE)
ord_day
 [1]  885  886  887  888 1798 1799 1800 1930 1931 1932 1933


 2. Generate list of data frames, for each Day

For each element in ord_day we introduce a data.frame as element of a list called df_new_aug.
Such data frames are defined through a for loop for all values in ord_day except ord_day[2] and ord_day[1] which are treated separately.

Idea behind the looping: for each unique ord_day[i] with i > 2 we check which of the days ord_day[i-1] and ord_day[i-2] (or both!) contribute, through the variable "Count", to the value of "Count_group" at ord_day[i].

We therefore introduce if else statements in the loop.
Here we go

# Iterative generation of the list of data.frames (for days > 886)
#-----------------------------------------------------------------
# For every day ord_day[i] with i >= 3 we build a data.frame holding the rows
# of all days whose "Count" adds up to the "Count_group" of ord_day[i], and we
# tag every one of those rows with that "Count_group".
# The scalar comparisons below assume that "Count" and "Count_group" take a
# single value within each Day.
df_new <- list()
df_new_aug <- list()

# we exclude cases i = 1, 2: they are manually treated below.
# seq_along(...)[-(1:2)] is empty when there are fewer than three days,
# whereas 3:length(ord_day) would count backwards (3, 2, ...).
for (i in seq_along(ord_day)[-(1:2)]) {

  # "Count_group" at ord_day[i] and "Count" at ord_day[i], [i-1] and [i-2]
  group_i <- unique(df[df$Day == ord_day[i], "Count_group"])
  count_i <- unique(df[df$Day == ord_day[i], "Count"])
  count_prev1 <- unique(df[df$Day == ord_day[i - 1], "Count"])
  count_prev2 <- unique(df[df$Day == ord_day[i - 2], "Count"])

  # which days contribute to "Count_group" at ord_day[i]?
  if (group_i == count_i + count_prev1 + count_prev2) {
    # "Count" at i, i-1 and i-2 contribute
    days_used <- ord_day[c(i, i - 1, i - 2)]
  } else if (group_i == count_i + count_prev1) {
    # only "Count" at i and i-1 contribute
    days_used <- ord_day[c(i, i - 1)]
  } else {
    # only "Count" at i contributes
    days_used <- ord_day[i]
  }

  # we create columns ID | Day | Count for the contributing days
  df_new[[i]] <- data.frame(df[df$Day %in% days_used, c("ID", "Day", "Count")])

  # we append the Count_group of the Day in ord_day[i]
  df_new_aug[[i]] <- data.frame(df_new[[i]],
                                Count_group = rep(group_i, nrow(df_new[[i]])))

  #closing the for loop
}


# for ord_day[2] = "886" (both "Count" at i = 2 and i = 1 contribute to "Count_group" at i = 2)
#-------------------------------------------------------------------------------------
kept_columns <- c("ID", "Day", "Count")
in_first_two_days <- df$Day == ord_day[2] | df$Day == ord_day[1]
df_new[[2]] <- data.frame(df[in_first_two_days, kept_columns])

# every selected row receives the Count_group of the Day in ord_day[2]
group_day_2 <- unique(df[df$Day == ord_day[2], "Count_group"])
df_new_aug[[2]] <- data.frame(df_new[[2]],
                              Count_group = rep(group_day_2, nrow(df_new[[2]])))

# for ord_day[1] = "885" (only "Count" at i = 1 contributes to "Count_group" at i = 1)
#------------------------------------------------------------------------------------
on_first_day <- df$Day == ord_day[1]
df_new[[1]] <- data.frame(df[on_first_day, kept_columns])

# every selected row receives the Count_group of the Day in ord_day[1]
group_day_1 <- unique(df[on_first_day, "Count_group"])
df_new_aug[[1]] <- data.frame(df_new[[1]],
                              Count_group = rep(group_day_1, nrow(df_new[[1]])))


# produced list
df_new_aug


3. Generate final data frame, collapsing the list in 2.

We collapse df_new_aug through an ugly loop, but other solutions (for example with Reduce() and merge()) are possible:

# merging the list (mechanically): final result
# We start from the first data.frame and stack every remaining element below
# it. The loop runs over however many elements df_new_aug holds instead of a
# hard-coded 1:10, so it keeps working when the number of distinct days changes.
df_result <- df_new_aug[[1]]
for (i in seq_along(df_new_aug)[-1]) {
  df_result <- rbind(df_result, df_new_aug[[i]])
}


This yields df_result, which completes the analysis.
    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它2个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复