Remove rows based on factor-levels

前端未结

关注

 2  1729

I have a data.frame df in "long" format.

df <- data.frame(site = rep(c(\"A\",\"B\",\"C\"), 1, 7),
                 time = c(11,11,11,22,22,


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  有刺的猬        
                
              
                            
                2021-01-19 10:46
              
            
            
                                                                       
Would rle work for you?

# Drop every row whose time value occurs only once in the data.
# Sorting by time first makes equal values adjacent, so each rle() run
# length is the number of rows carrying that time value.
df <- df[order(df$time), ]
time_runs <- rle(df$time)
# Use %in% instead of != so the filter stays correct when zero or several
# time values occur exactly once (!= would recycle or return logical(0)).
df <- subset(df, !(time %in% time_runs$values[time_runs$lengths == 1]))
df <- df[order(df$site), ]
df
##   site time value
## 1    A   11    17
## 4    A   22    -3
## 2    B   11     8
## 5    B   22     5
## 3    C   11     0
## 6    C   22    13


Re-looking at your data, it seems that this solution might be too simple for your needs though....

Update

Here's an approach that should be better than the rle solution that I put above. Rather than looking for a run length of "1", it deletes rows that do not meet a condition on the result of table(df$site, df$time). To illustrate, I've also added some more fake data.

# Build two example data sets and stack them. `site` is created as a factor
# explicitly: since R 4.0 data.frame() keeps strings as character, and
# levels() of a character column is NULL, which would make the threshold
# below 0 and let every time value through.
df <- data.frame(site = rep(c("A", "B", "C"), length.out = 7),
                 time = c(11, 11, 11, 22, 22, 22, 33),
                 value = ceiling(rnorm(7) * 10),
                 stringsAsFactors = TRUE)
df2 <- data.frame(site = rep(c("A", "B", "C"), length.out = 7),
                  time = c(14, 14, 15, 15, 16, 16, 16),
                  value = ceiling(rnorm(7) * 10),
                  stringsAsFactors = TRUE)
df <- rbind(df, df2)
df <- df[order(df$site), ]

# Column sums of the site-by-time table give the number of rows per time
# value; keep the time values that reach the number of site levels.
temp <- as.numeric(names(which(colSums(with(df, table(site, time)))
                               >= length(levels(df$site)))))
# Inner join on time keeps only the rows whose time value is in `temp`.
df2 <- merge(df, data.frame(temp), by.x = "time", by.y = "temp")
df2 <- df2[order(df2$site), ]
df2
##   time site value
## 3   11    A    -2
## 4   16    A    -2
## 7   22    A     2
## 1   11    B   -16
## 5   16    B     3
## 8   22    B    -6
## 2   11    C     8
## 6   16    C    11
## 9   22    C   -10


Here's the result of tabulating and summing up the site/time combination:

colSums(with(df, table(site, time)))
## 11 14 15 16 22 33 
##  3  2  2  3  3  1 


Thus, if we were interested in keeping the timestamps shared by at least two sites, we could change the line >= length(levels(df$site)) (in this example, 3) to >= length(levels(df$site))-1 (here, 2).

Not sure if this solution is useful to you at all, but I thought I would share it to show the flexibility in solutions we have with R.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  没有蜡笔的小新        
                
              
                            
                2021-01-19 10:52
              
            
            
                                                                       
Here's another possible solution using the data.table package:

# All distinct time values present anywhere in the data.
unTime <- unique(df$time)

library(data.table)

DT <- data.table(df, key = "site")

# For each site, find the positions in unTime that the site lacks; the
# union over sites is the set of time values missing from at least one site.
(notInAll <- unique(DT[, list(ans = which(!unTime %in% time)), by = key(DT)]$ans))
# [1] 3

# Exclude by membership rather than by negative index: unTime[-integer(0)]
# is empty, which would drop every row when no time value is missing.
DT[!time %in% unTime[notInAll]]

#      site time value
# [1,]    A   11     3
# [2,]    A   22    11
# [3,]    B   11    -6
# [4,]    B   22    -2
# [5,]    C   11   -19
# [6,]    C   22   -14


EDIT from Matthew

Nice. Or a slightly more direct way :

# Count the distinct sites observed at each time value.
DT <- as.data.table(df)
tt <- DT[, length(unique(site)), by = time]
tt
   time V1
1:   11  3
2:   22  3
3:   33  1

# Keep the time values that have the largest site count.
tt <- tt[V1 == max(V1)]      # See * below
tt
   time V1
1:   11  3
2:   22  3

# Keep only the rows whose time value is listed in tt.
DT[time %in% tt[["time"]]]
   site time value
1:    A   11     7
2:    A   22    -2
3:    B   11     8
4:    B   22   -10
5:    C   11     3
6:    C   22     1


If no time is present in all sites, the final result should be empty (as Ben pointed out in the comments); in that case, the step marked * above could be:

# Require each time value to be present for every distinct site in DT.
tt <- tt[V1 == length(unique(DT$site))]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复