Split data.frame based on levels of a factor into new data.frames

前端未结

关注

 2  1715

I'm trying to create separate data.frame objects based on the levels of a factor. So if I have:

df <- data.frame(
  x=rnorm(25),
  y=rnorm(25),


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  北恋        
                
              
                            
                2020-11-21 17:31
              
            
            
                                                                       
I think that split does exactly what you want.

Notice that X is a list of data frames, as seen by str:

X <- split(df, df$g)
str(X)


If you want individual objects named after the levels of g, you could assign the elements of X returned by split to objects with those names, though this seems like extra work when you can just index the data frames in the list that split creates.

#I used lapply just to drop the third column g which is no longer needed.
Y <- lapply(seq_along(X), function(x) as.data.frame(X[[x]])[, 1:2]) 

#Assign the dataframes in the list Y to individual objects
A <- Y[[1]]
B <- Y[[2]]
C <- Y[[3]]
D <- Y[[4]]
E <- Y[[5]]

#Or use lapply with assign to assign each piece to an object all at once
lapply(seq_along(Y), function(x) {
    assign(c("A", "B", "C", "D", "E")[x], Y[[x]], envir=.GlobalEnv)
    }
)


Edit: Or, even better than using lapply to assign to the global environment, use list2env:

names(Y) <- c("A", "B", "C", "D", "E")
list2env(Y, envir = .GlobalEnv)
A

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  無奈伤痛        
                
              
                            
                2020-11-21 17:41
              
            
            
                                                                       
Since dplyr 0.8.0, we can also use group_split, which behaves similarly to base::split.
library(dplyr)
df %>% group_split(g)

#[[1]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1 -1.21  -1.45  A    
#2  0.506  1.10  A    
#3 -0.477 -1.17  A    
#4 -0.110  1.45  A    
#5  0.134 -0.969 A    

#[[2]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1  0.277  0.575 B    
#2 -0.575 -0.476 B    
#3 -0.998 -2.18  B    
#4 -0.511 -1.07  B    
#5 -0.491 -1.11  B  
#....

It also comes with the argument .keep (TRUE by default), which specifies whether or not the grouping column should be kept.
df %>% group_split(g, .keep = FALSE)

#[[1]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1 -1.21  -1.45 
#2  0.506  1.10 
#3 -0.477 -1.17 
#4 -0.110  1.45 
#5  0.134 -0.969

#[[2]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1  0.277  0.575
#2 -0.575 -0.476
#3 -0.998 -2.18 
#4 -0.511 -1.07 
#5 -0.491 -1.11 
#....

The difference between base::split and dplyr::group_split is that group_split does not name the elements of the list after the groups. So
df1 <- df %>% group_split(g)
names(df1) #gives 
NULL

whereas
df2 <- split(df, df$g)
names(df2) #gives
#[1] "A" "B" "C" "D" "E"

data
set.seed(1234)
df <- data.frame(
      x=rnorm(25),
      y=rnorm(25),
      g=rep(factor(LETTERS[1:5]), 5)
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复