Expand data.frame by creating duplicates based on group condition (3)

前端 未结 2 538
说谎
说谎 2021-01-25 02:04

Starting from this SO question.

Example data.frame:

df = read.table(text = \'ID  Day Count   Count_group
            18  1933    6   15
             


        
相关标签:
2条回答
  • 2021-01-25 02:09

    A generalised version of my previous answer...

    # Expand `df`: every day is followed by copies of the rows of the earlier
    # days that fall inside a window of `days` calendar days and are present in
    # the data. The copied rows take over the Count_group of the day they are
    # attached to.
    # NOTE(review): the grouping below compares each row with the previous one
    # and treats a large *drop* in Day as a new block, so it presumably expects
    # `df` to be sorted by decreasing Day (as in the example data) - confirm.

    #first add grouping variables
    days <- 5 #grouping no of days
    day_step <- diff(df$Day) #change in Day from one row to the next
    df$smalldaygroup <- c(0, cumsum(day_step != 0)) #individual days
    df$bigdaygroup <- c(0, cumsum(day_step < 1 - days)) #blocks of linked days

    #duplicate days in each big group
    df2 <- lapply(split(df, df$bigdaygroup), function(x) {
      dayvec <- max(x$Day):min(x$Day) #possible days in range
      daylog <- dayvec[dayvec %in% x$Day] #actual days in range
      pattern <- data.frame(base = rep(dayvec, each = days))
      #day to repeat: base, base-1, ..., base-days+1 within each run of `days` rows
      pattern$rep <- pattern$base - rep(seq_len(days) - 1, times = length(dayvec))
      pattern$offset <- match(pattern$rep, daylog) - match(pattern$base, daylog) #offsets (used later)
      pattern <- pattern[(pattern$base %in% x$Day) & (pattern$rep %in% x$Day), ] #remove invalid elements
      #store pattern in list as offsets needed in next loop
      list(df = split(x, x$smalldaygroup)[match(pattern$rep, daylog)], pat = pattern)
    })

    #change the Count_group to previous value in added entries
    df2 <- lapply(df2, function(L) lapply(seq_along(L$df), function(i) {
      x <- L$df[[i]]
      offset <- L$pat$offset #pointer to day to copy Count_group from
      #entry i - offset[i] is the first entry generated for the same base day
      x$Count_group <- L$df[[i - offset[i]]]$Count_group[1]
      x
    }))

    df2 <- do.call(rbind, unlist(df2, recursive = FALSE)) #bind back together

    #remove grouping variables (by name, so extra columns in df are not dropped by mistake)
    df2[c("smalldaygroup", "bigdaygroup")] <- NULL

    head(df2, 30) #ignore rownames!
    
           ID  Day Count Count_group
    01.1   18 1933     6          15
    01.2   33 1933     6          15
    01.3   37 1933     6          15
    01.4   18 1933     6          15
    01.5   16 1933     6          15
    01.6   11 1933     6          15
    02.7  111 1932     5          15
    02.8   34 1932     5          15
    02.9   60 1932     5          15
    02.10  88 1932     5          15
    02.11  18 1932     5          15
    03.12  33 1931     3          15
    03.13  13 1931     3          15
    03.14  56 1931     3          15
    04     23 1930     1          15
    05.7  111 1932     5           9
    05.8   34 1932     5           9
    05.9   60 1932     5           9
    05.10  88 1932     5           9
    05.11  18 1932     5           9
    06.12  33 1931     3           9
    06.13  13 1931     3           9
    06.14  56 1931     3           9
    07     23 1930     1           9
    08.12  33 1931     3           4
    08.13  13 1931     3           4
    08.14  56 1931     3           4
    09     23 1930     1           4
    010    23 1930     1           1
    11.16   6 1800     6          12
    
    0 讨论(0)
  • 2021-01-25 02:22

    I attach a rather mechanical method, but I believe it is a good starting point. I have noticed that in your original table the entry

    ID Day Count Count_group 18 1933 6 14

    is duplicated; I have left it untouched for the sake of clarity.

    Structure of the approach:

    1. Read original data
    2. Generate list of data frames, for each Day
    3. Generate final data frame, collapsing the list in 2.

    1. Read original data

    We start with

    # Example input: one row per ID, with the Day it belongs to, the number of
    # IDs on that Day (Count) and the grouped total (Count_group).
    df <- read.table(header = TRUE, text = 'ID  Day Count   Count_group
                    18  1933    6   14
                    33  1933    6   14
                    37  1933    6   14
                    18  1933    6   14
                    16  1933    6   14
                    11  1933    6   14
                    111 1932    5   9
                    34  1932    5   9
                    60  1932    5   9
                    88  1932    5   9
                    18  1932    5   9
                    33  1931    3   4
                    13  1931    3   4
                    56  1931    3   4
                    23  1930    1   1
                    6   1800    6   12
                    37  1800    6   12
                    98  1800    6   12
                    52  1800    6   12
                    18  1800    6   12
                    76  1800    6   12
                    55  1799    4   6
                    6   1799    4   6
                    52  1799    4   6
                    133 1799    4   6
                    112 1798    2   2
                    677 1798    2   2
                    778 888     4   7
                    111 888     4   7
                    88  888     4   7
                    10  888     4   7
                    37  887     2   4
                    26  887     2   4
                    8   886     1   2
                    56  885     1   1')
    
    # distinct values of "Day", in increasing order
    ord_day <- sort(unique(df$Day))
    ord_day
     [1]  885  886  887  888 1798 1799 1800 1930 1931 1932 1933
    

    2. Generate list of data frames, for each Day

    For each element in ord_day we introduce a data.frame as element of a list called df_new_aug. Such data frames are defined through a for loop for all values in ord_day except ord_day[2] and ord_day[1] which are treated separately.

    Idea behind the loop: for each unique ord_day[i] with i > 2 we check which of the two preceding days, ord_day[i-1] and ord_day[i-2] (possibly both), contribute through the variable "Count" to the value of "Count_group" at ord_day[i].

    We therefore introduce if else statements in the loop. Here we go

    # Recursive generation of the list of data.frames (for days > 886)
    #-----------------------------------------------------------------
    # df_new[[i]]     : rows (ID | Day | Count) of every day that contributes to
    #                   the "Count_group" of ord_day[i]
    # df_new_aug[[i]] : df_new[[i]] plus a Count_group column filled with the
    #                   "Count_group" of ord_day[i]
    df_new <- list()
    df_new_aug <- list()

    # we exclude cases i = 1, 2: they are manually treated below
    # (seq_along()[-(1:2)] is empty when there are fewer than 3 days, whereas
    # 3:length(ord_day) would count backwards)
    for (i in seq_along(ord_day)[-(1:2)]) {

      # per-day values, looked up once instead of in every condition
      count_group_i <- unique(df[df$Day == ord_day[i], "Count_group"])
      count_i <- unique(df[df$Day == ord_day[i], "Count"])
      count_prev1 <- unique(df[df$Day == ord_day[i - 1], "Count"])
      count_prev2 <- unique(df[df$Day == ord_day[i - 2], "Count"])

      if (count_group_i == count_i + count_prev1 + count_prev2) {
        # "Count" at i, i-1 and i-2 all contribute to "Count_group" at i
        contributing_days <- ord_day[c(i, i - 1, i - 2)]
      } else if (count_group_i == count_i + count_prev1) {
        # only "Count" at i and i-1 contribute to "Count_group" at i
        contributing_days <- ord_day[c(i, i - 1)]
      } else {
        # only "Count" at i contributes to "Count_group" at i
        contributing_days <- ord_day[i]
      }

      # we create columns ID | Day | Count (rows keep their order in df)
      df_new[[i]] <- data.frame(df[df$Day %in% contributing_days,
                                   c("ID", "Day", "Count")])

      # we append the Count_group of the Day in ord_day[i]
      df_new_aug[[i]] <- data.frame(df_new[[i]],
                                    Count_group = rep(count_group_i, nrow(df_new[[i]])))
    }


    # for ord_day[2] = "886" (both "Count" at i = 2 and i = 1 contribute to "Count_group" at i = 2)
    # NOTE: this case is hard-wired for the example data; no sum check is made.
    #-------------------------------------------------------------------------------------
    if (length(ord_day) >= 2) {
      df_new[[2]] <- data.frame(df[df$Day %in% ord_day[1:2],
                                   c("ID", "Day", "Count")])

      # we append the Count_group of the Day in ord_day[2]
      df_new_aug[[2]] <- data.frame(df_new[[2]],
                                    Count_group = rep(unique(df[df$Day == ord_day[2], "Count_group"]), nrow(df_new[[2]])))
    }

    # for ord_day[1] = "885" (only "Count" at i = 1 contributes to "Count_group" at i = 1)
    #------------------------------------------------------------------------------------
    if (length(ord_day) >= 1) {
      df_new[[1]] <- data.frame(df[df$Day == ord_day[1], c("ID", "Day", "Count")])

      # we append the Count_group of the Day in ord_day[1]
      df_new_aug[[1]] <- data.frame(df_new[[1]],
                                    Count_group = rep(unique(df[df$Day == ord_day[1], "Count_group"]), nrow(df_new[[1]])))
    }


    # produced list
    df_new_aug
    

    3. Generate final data frame, collapsing the list in 2.

    We collapse df_new_aug through an ugly loop, but other solutions (for example with Reduce() and merge()) are possible:

    # merging the list (mechanically): final result
    # Start from the first data frame and append each remaining one in order.
    # Iterating over the list itself (instead of a fixed 1:10) keeps the loop
    # correct for any number of unique days, including a single one.
    df_result <- df_new_aug[[1]]
    for (i in seq_along(df_new_aug)[-1]) {
      df_result <- rbind(df_result, df_new_aug[[i]])
    }
    

    This yields df_result, which completes the analysis.

    0 讨论(0)
提交回复
热议问题