I have a data frame in R where one of the fields is a composite (delimited) field. Here's an example of what I have:
users = c(1,2,3)
items = c("23 77 49", "10 18 28", "20 31 84")
df = data.frame(users, items)
What is the easiest way to split the items up so that I get one user/item pair per row?
items <- strsplit(df$items, " ")
data.frame(user = rep(df$users, sapply(items, length)), item = unlist(items))
## user item
## 1 1 23
## 2 1 77
## 3 1 49
## 4 2 10
## 5 2 18
## 6 2 28
## 7 3 20
## 8 3 31
## 9 3 84
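The trick is the rep() call: sapply(items, length) counts how many pieces each row splits into, so each user id is repeated once per item. For example:
items <- strsplit(df$items, " ")
sapply(items, length)
## [1] 3 3 3
rep(df$users, sapply(items, length))
## [1] 1 1 1 2 2 2 3 3 3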
or
library(data.table)
DT <- data.table(df)
DT[, list(item = unlist(strsplit(items, " "))), by = users]
## users item
## 1: 1 23
## 2: 1 77
## 3: 1 49
## 4: 2 10
## 5: 2 18
## 6: 2 28
## 7: 3 20
## 8: 3 31
## 9: 3 84
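Note that strsplit() needs a character vector; if items comes through as a factor (the default for data.frame() in R versions before 4.0), wrap it in as.character() first, e.g.:
DT[, list(item = unlist(strsplit(as.character(items), " "))), by = users]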
If you're willing to install my "SOfun" package or load my concat.split.DT function, AND if there are the same number of items in each "item" string (in your example, there are 3), then the following might be an option:
library(reshape2)
library(data.table)
melt(concat.split.DT(indf, "items", " "), id.vars="users")
Here's an example.
I've added an "id" column so you can compare the output across the two options.
## your sample data.frame
df <- data.frame(users = c(1, 2, 3),
                 items = c("23 77 49", "10 18 28", "20 31 84"))
## extended to 3000 rows
df1k <- df[rep(rownames(df), 1000), ]
df1k$id <- sequence(nrow(df1k))
## extended to 3 million rows
df1m <- df1M <- df[rep(rownames(df), 1000000), ]  ## "id" is added to df1m only; df1M stays without it
df1m$id <- sequence(nrow(df1m))
This makes use of concat.split.DT, which uses fread from "data.table" to split the concatenated values, and melt from "reshape2" to reshape the result into long form.
# library(devtools)
# install_github("SOfun", "mrdwab")
library(SOfun)
library(data.table)
library(reshape2)
library(microbenchmark)
packageVersion("data.table")
# [1] ‘1.8.11’
Here are some functions to test the speed of Jake's answer and this one. Later I'll try to update with "dplyr" too.
fun1 <- function(indf) {
  DT <- melt(concat.split.DT(indf, "items", " "),
             id.vars = c("id", "users"))
  setkeyv(DT, c("id", "users"))
  DT
}
fun2 <- function(indf) {
  DT <- data.table(indf)
  DT[, list(item = unlist(strsplit(as.character(items), " "))),
     by = list(id, users)]
}
Testing on 3,000 rows
microbenchmark(fun1(df1k), fun2(df1k))
# Unit: milliseconds
#        expr       min        lq    median        uq      max neval
#  fun1(df1k)  17.64675  18.21658  18.79859  21.21943  71.7737   100
#  fun2(df1k) 152.97974 158.44148 163.12707 199.77297 345.7508   100
Testing (just once) on 3,000,000 rows
Timings here are in seconds....
system.time(fun1(df1m))
# user system elapsed
# 7.71 0.94 8.69
system.time(fun2(df1m))
# user system elapsed
# 177.80 0.50 178.97
@Jake makes a good point in the comments that adding an "id" column made a very big difference in the timings. I added it just so that the output of the two data.table approaches could easily be compared to see that the results were the same. Removing the "id" column, and removing the references to "id" in fun1 and fun2, gives us the following:
microbenchmark(fun1a(df1M), fun2a(df1M), fun3(df1M), times = 5)
# Unit: seconds
#        expr       min        lq    median        uq       max neval
# fun1a(df1M)  2.307313  2.420845  2.630284  2.822011  3.074464     5
# fun2a(df1M) 12.480502 12.491783 12.761392 13.069169 13.733686     5
#  fun3(df1M) 13.976329 14.281856 14.471252 15.041450 15.089593     5
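fun1a and fun2a aren't defined above; presumably they are just fun1 and fun2 with the "id" handling removed, along these lines (a sketch, not the original code):
fun1a <- function(indf) {
  DT <- melt(concat.split.DT(indf, "items", " "), id.vars = "users")
  setkeyv(DT, "users")
  DT
}
fun2a <- function(indf) {
  DT <- data.table(indf)
  DT[, list(item = unlist(strsplit(as.character(items), " "))), by = users]
}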
Also benchmarked above is fun3, which is @mnel's "dplyr" approach.
fun3 <- function(indf) {
  rbind_all(do(indf %.% group_by(users),
               .f = function(d) data.frame(
                 d[, 1, drop = FALSE],
                 items = unlist(strsplit(as.character(d[['items']]), ' ')),
                 stringsAsFactors = FALSE)))
}
Pretty nice performance from all the answers!
Here is a dplyr solution:
users=c(1,2,3)
items=c("23 77 49", "10 18 28", "20 31 84")
df = data.frame(users,items,stringsAsFactors=FALSE)
rbind_all(do(df %.% group_by(users),
             .f = function(d) data.frame(d[, 1, drop = FALSE],
                                         items = unlist(strsplit(d[['items']], ' ')),
                                         stringsAsFactors = FALSE)))
It would be really nice to have an expand function, i.e. the opposite of summarise, e.g. if the following would work:
df %.% group_by(users) %.% expand(unlist(strsplit(items,' ')))
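As an aside, more recent versions of tidyr provide essentially this "expand" operation as separate_rows(). Assuming a reasonably current tidyr is installed, something like this should give the same long result:
library(tidyr)
separate_rows(df, items, sep = " ")
## add convert = TRUE if you want the items column converted to numeric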