R data.table: subgroup weighted percent of group

前端未结

关注

 3  576

I have a data.table like:

library(data.table)
widgets <- data.table(serial_no=1:100, 
                      color=rep_len(c(\"red\",\"green\"


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  时光取名叫无心        
                
              
                            
                2020-12-16 19:59
              
            
            
                                                                       
This is almost a single step:

# A
widgets[,{
    totwt = .N
    .SD[,.(frac=.N/totwt),by=style]
},by=color]
    # color  style frac
 # 1:   red  round 0.36
 # 2:   red pointy 0.32
 # 3:   red   flat 0.32
 # 4: green pointy 0.36
 # 5: green   flat 0.32
 # 6: green  round 0.32
 # 7:  blue   flat 0.36
 # 8:  blue  round 0.32
 # 9:  blue pointy 0.32
# 10: black  round 0.36
# 11: black pointy 0.32
# 12: black   flat 0.32

# B
widgets[,{
    totwt = sum(weight)
    .SD[,.(frac=sum(weight)/totwt),by=style]
},by=color]
 #    color  style      frac
 # 1:   red  round 0.3466667
 # 2:   red pointy 0.3466667
 # 3:   red   flat 0.3066667
 # 4: green pointy 0.3333333
 # 5: green   flat 0.3200000
 # 6: green  round 0.3466667
 # 7:  blue   flat 0.3866667
 # 8:  blue  round 0.2933333
 # 9:  blue pointy 0.3200000
# 10: black  round 0.3733333
# 11: black pointy 0.3333333
# 12: black   flat 0.2933333


How it works: Construct your denominator for the top-level group (color) before going to the finer group (color with style) to tabulate.



Alternatives. If styles repeat within each color and this is only for display purposes, try a table:

# A
widgets[,
  prop.table(table(color,style),1)
]
#        style
# color   flat pointy round
#   black 0.32   0.32  0.36
#   blue  0.36   0.32  0.32
#   green 0.32   0.36  0.32
#   red   0.32   0.32  0.36

# B
widgets[,rep(1L,sum(weight)),by=.(color,style)][,
  prop.table(table(color,style),1)
]

#        style
# color        flat    pointy     round
#   black 0.2933333 0.3333333 0.3733333
#   blue  0.3866667 0.3200000 0.2933333
#   green 0.3200000 0.3333333 0.3466667
#   red   0.3066667 0.3466667 0.3466667


For B, this expands the data so that there is one observation for each unit of weight. With large data, such an expansion would be a bad idea (since it costs so much memory). Also, weight has to be an integer; otherwise, its sum will be silently truncated to one (e.g., try rep(1,2.5) # [1] 1 1).
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野的像风        
                
              
                            
                2020-12-16 20:18
              
            
            
                                                                       
Calculate a frequency table for each style within color and then for each row look up the frequency for that row's style in that table finally dividing by the number of rows within that color.  

widgets[, frac := table(style)[style] / .N, by = color]


giving:

  > widgets
     serial_no color  style weight frac
  1:         1   red  round      1 0.36
  2:         2 green pointy      2 0.36
  3:         3  blue   flat      3 0.36
  4:         4 black  round      4 0.36
  5:         5   red pointy      5 0.32
  6:         6 green   flat      1 0.32
  7:         7  blue  round      2 0.32
  8:         8 black pointy      3 0.32
  9:         9   red   flat      4 0.32
 10:        10 green  round      5 0.32
 ... etc ...


This could readily be translated to base or dplyr, if desired:

# base
prop <- function(x) table(x)[x] / length(x)
transform(widgets, frac = ave(style, color, FUN = prop))

# dplyr - uses prop function from above
library(dplyr)
widgets %>% group_by(color) %>% mutate(frac = prop(style)) %>% ungroup

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  南旧        
                
              
                            
                2020-12-16 20:23
              
            
            
                                                                       
it may be a good idea to use dplyr

df <- widgets %>% 
  group_by(color, style) %>%
  summarise(count = n()) %>%
  mutate(freq = count/sum(count))

df2 <- widgets %>% 
  group_by(color, style) %>%
  summarise(count_w = sum(weight)) %>%
  mutate(freq = count_w/sum(count_w))  

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复