Reshaping a data.frame so a column containing multiple features becomes multiple binary columns

后端未结

关注

 4  754

I have a dataframe like this

# Sample input: one row per id, with `features` holding a comma-separated
# list of feature labels that should become separate binary columns.
df <- data.frame(
  id = c(1, 2),
  value = c(25, 24),
  features = c("A,B,D,F", "C,B,E")
)

print(df)

i


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  执念已碎        
                
              
                            
                2021-01-27 08:46
              
            
            
                                                                       
Another one using splitstackshape and data.table (installation instructions here):

# library() stops immediately if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(splitstackshape)
library(data.table) # v1.9.5+

# Split the comma-separated `features` column into long format: one row per
# (id, value, feature) combination.
ans <- cSplit(df, "features", sep = ",", direction = "long")
# Cast back to wide: one column per feature, counting how many times each
# feature occurs for every id/value pair.
dcast(ans, id + value ~ features, fun.aggregate = length)
#    id value A B C D E F
# 1:  1    25 1 1 0 1 0 1
# 2:  2    24 0 1 1 0 1 0


If you're using data.table v1.9.4, then replace dcast with dcast.data.table.

Alternatively, you can use cSplit_e, like this:

# Expand `features` into one indicator column per distinct value, keeping the
# original column; positions for absent values are filled with 0.
cSplit_e(
  df,
  split.col = "features",
  sep = ",",
  type = "character",
  fill = 0
)
##   id value features features_A features_B features_C features_D features_E features_F
## 1  1    25  A,B,D,F          1          1          0          1          0          1
## 2  2    24    C,B,E          0          1          1          0          1          0

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  孤独总比滥情好        
                
              
                            
                2021-01-27 08:50
              
            
            
                                                                       
A dplyr/tidyr solution

library(dplyr)
library(tidyr)

# Works for any number of features per row and any set of feature labels:
# nothing about the width of `features` or the resulting column names is
# hard-coded.
df %>%
  # One row per (id, value, feature).
  separate_rows(features, sep = ",") %>%
  # A feature listed twice in the same row should still yield a single 1.
  distinct() %>%
  # Flag every observed combination.
  mutate(n = 1) %>%
  # One column per feature, in sorted order; combinations that were never
  # observed are filled with 0 instead of NA.
  pivot_wider(
    names_from = features,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  )

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  时光取名叫无心        
                
              
                            
                2021-01-27 09:02
              
            
            
                                                                       
This is yet another use case for merge after suitable transformation. 

library(reshape2)

# Long table of feature/id pairs: split every `features` string on commas,
# name each resulting vector after its row's id, and stack them into the
# two columns `values` (feature) and `ind` (id).
f <- with(
  df,
  stack(setNames(strsplit(as.character(features), ","), id))
)

# One row per id and one column per feature, holding occurrence counts.
d <- dcast(f, ind ~ values, fun.aggregate = length, value.var = "ind")

# Join the counts back onto the first two columns of the original data.
out <- merge(df[, 1:2], d, by.x = "id", by.y = "ind")

print(out)


  id value A B C D E F
1  1    25 1 1 0 1 0 1
2  2    24 0 1 1 0 1 0


This can also be done using only default libraries (without reshape2) in a variety of slightly messier ways. In the above, you can substitute the d and out lines with the following instead:

# Base-R replacement for the dcast step: add a constant count of 1 to every
# feature/id pair and cross-tabulate it by id (`ind`) and feature (`values`).
d <- xtabs(count ~ ind + values, data = transform(f, count = 1))

# Turn the contingency table into a data frame and join it to the first two
# columns of the original data via its row names.
out <- merge(
  df[, 1:2],
  as.data.frame.matrix(d),
  by.x = "id",
  by.y = "row.names"
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  青春惊慌失措        
                
              
                            
                2021-01-27 09:06
              
            
            
                                                                       
You can do:

library(splitstackshape)
library(qdapTools)

# Split `features` into one column per position without type conversion.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
df1 <- data.frame(cSplit(df, "features", sep = ",", type.convert = FALSE))

# Transpose the split columns so each original row becomes a column, tabulate
# the values found in each of those columns, and bind the resulting counts to
# the first two columns of the data.
cbind(df1[1:2], mtabulate(as.data.frame(t(df1[-c(1, 2)]))))

#   id value A B C D E F
#1:  1    25 1 1 0 1 0 1
#2:  2    24 0 1 1 0 1 0

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复