How do I count the values from a pandas column which is a list of strings?

前端未结

关注

 5  2016

I have a DataFrame column in which each value is a list of strings:

df['colors']

0              ['blue','green','brown']
1              []
2              ['green','red','blue']


                      
              相关标签:


      
      
        
          5条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  天命终不由人        
                
              
                            
                2021-01-19 21:58
              
            
            
                                                                       
You can use Counter from the collections module:
import pandas as pd
from collections import Counter
from itertools import chain

# Sample data: each row of the 'colors' column holds a list of strings
# (the second row is deliberately an empty list).
df = pd.DataFrame({'colors':[['blue','green','brown'],
                             [],
                             ['green','red','blue'],
                             ['purple'],
                             ['brown']]})

# chain.from_iterable flattens the per-row lists lazily (no argument
# unpacking of the whole column), Counter tallies each color, and
# pd.Series turns the tally into a Series indexed by color.
# The result gets its own name so that `df` remains the DataFrame.
counts = pd.Series(Counter(chain.from_iterable(df.colors)))

print(counts)

Output:
blue      2
green     2
brown     2
red       1
purple    1
dtype: int64

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  清歌不尽        
                
              
                            
                2021-01-19 22:04
              
            
            
                                                                       
I would use .apply with pd.Series.value_counts to accomplish this:
# 1. Expand each row's list into one column per color and count them.
#    A color that does not occur in a row shows up as NaN, which is why
#    the counts below are floats.
df_temp = df["colors"].apply(pd.Series.value_counts)

    blue    brown   green   purple  red
0   1.0 1.0 1.0 NaN NaN
1   NaN NaN NaN NaN NaN
2   1.0 NaN 1.0 NaN 1.0
3   NaN NaN NaN 1.0 NaN
4   NaN 1.0 NaN NaN NaN

# 2. Get the value counts from this:
df_temp.sum()

blue      2.0
brown     2.0
green     2.0
purple    1.0
red       1.0

# Alternatively, convert to a dict
df_temp.sum().to_dict()
# {'blue': 2.0, 'brown': 2.0, 'green': 2.0, 'purple': 1.0, 'red': 1.0}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  失恋的感觉        
                
              
                            
                2021-01-19 22:18
              
            
            
                                                                       
Use a Counter + chain, which is meant to do exactly this. Then construct the Series from the Counter object.
import pandas as pd
from collections import Counter
from itertools import chain

# Sample data: one list of colors per row (the second row is an empty list).
s = pd.Series([['blue','green','brown'], [], ['green','red','blue']])

# chain.from_iterable flattens the lists, Counter tallies each color, and
# pd.Series turns the tally into a Series indexed by color.
pd.Series(Counter(chain.from_iterable(s)))
#blue     2
#green    2
#brown    1
#red      1
#dtype: int64


While explode + value_counts are the pandas way to do things, they're slower for shorter lists.
import perfplot
import pandas as pd
import numpy as np

from collections import Counter
from itertools import chain

def counter(s):
    """Return a Series mapping each element found in the lists of *s* to its count."""
    # Flatten the iterable of lists into one stream of elements.
    flattened = chain.from_iterable(s)
    # Tally the elements, then expose the tally as a Series keyed by element.
    tally = Counter(flattened)
    return pd.Series(tally)

def explode(s):
    """Return the value counts of *s* after exploding its lists into rows."""
    # One row per list element.
    one_per_row = s.explode()
    return one_per_row.value_counts()

# Benchmark both approaches on progressively longer inputs.
perfplot.show(
    # Input for size n: the three sample lists repeated n times (3 * n rows).
    setup=lambda n: pd.Series([['blue','green','brown'], [], ['green','red','blue']]*n), 
    kernels=[
        lambda s: counter(s),
        lambda s: explode(s),
    ],
    labels=['counter', 'explode'],
    # n takes the powers of two from 2**0 up to 2**16.
    n_range=[2 ** k for k in range(17)],
    equality_check=np.allclose,  # presumably how perfplot confirms both kernels agree -- verify against the perfplot docs
    xlabel='~len(s)'
)


                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野性不改        
                
              
                            
                2021-01-19 22:18
              
            
            
                                                                       
A quick and dirty solution would be something like the following.
Note that empty lists contribute nothing to the counts; you would still have to add a condition if you want to count them as well.
# Tally how often each color occurs across all rows of df.colors.
colors = df.colors.tolist()
d = {}
for row in colors:
    for color in row:
        # dict.get with a default of 0 covers both the first sighting of a
        # color and every later one, so no separate membership test is needed.
        d[color] = d.get(color, 0) + 1

This produces a dictionary that looks like this:
{'blue': 2, 'green': 2, 'brown': 2, 'red': 1, 'purple': 1}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  抹茶落季        
                
              
                            
                2021-01-19 22:22
              
            
            
                                                                       
Solution
Best option: df.colors.explode().dropna().value_counts().
However, if you also want to have counts for empty lists ([]), use Method-1.B/C similar to what was suggested by Quang Hoang in the comments.
You can use either of the following two methods.

Method-1: Use pandas methods alone ⭐⭐⭐

explode --> dropna --> value_counts


Method-2: Use list.extend --> pd.Series.value_counts

## Method-1
# A. If you don't want counts for empty []
df.colors.explode().dropna().value_counts()

# B. If you want counts for empty [] (classified as NaN)
df.colors.explode().value_counts(dropna=False)  # returns [] as NaN

# C. If you want counts for empty [] (classified as [])
df.colors.explode().fillna('[]').value_counts()  # returns [] as '[]'

## Method-2
# Flatten the lists with list.extend in a plain loop (a comprehension should
# not be used only for its side effects). Extending by an empty list is a
# no-op, so no length check is needed.
colors = []
for e in df.colors:
    colors.extend(e)
pd.Series(colors).value_counts()

Output:
green     2
blue      2
brown     2
red       1
purple    1
# NaN     1  ## For Method-1.B
# []      1  ## For Method-1.C
dtype: int64

Dummy Data
import pandas as pd

# Sample frame: a single 'colors' column whose values are lists of strings,
# including one empty list.
df = pd.DataFrame(
    {
        'colors': [
            ['blue', 'green', 'brown'],
            [],
            ['green', 'red', 'blue'],
            ['purple'],
            ['brown'],
        ],
    }
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复