create a bigram from a column in pandas df

前端未结

关注

 2  1157

I have this test table in a pandas DataFrame:

   Leaf_category_id  session_id  product_id
0               111           1         987
3               111


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  别那么骄傲        
                
              
                            
                2021-01-15 09:02
              
            
            
                                                                       
Try this code (consecutive bigrams within each group):

from itertools import combinations
import pandas as pd

# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 is
# the replacement (the first CSV column becomes the row index).
df = pd.read_csv("data.csv", index_col=0)

# consecutive: within each (Leaf_category_id, session_id) group, pair every
# product_id with the one that follows it. Each pair is sorted so that (a, b)
# and (b, a) are counted as the same bigram.
grouped_consecutive_product_ids = (
    df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    # .iloc makes the "skip the first element" slice explicitly positional.
    .apply(lambda x: [tuple(sorted(pair)) for pair in zip(x, x.iloc[1:])])
    .reset_index()
)

df1 = pd.DataFrame(grouped_consecutive_product_ids)
# Spread each group's list of bigrams across columns, then unstack into one
# long Series; groups with fewer bigrams contribute NaN, dropped below.
s = df1.product_id.apply(lambda x: pd.Series(x)).unstack()
df2 = pd.DataFrame(s.reset_index(level=0, drop=True)).dropna()
df2.rename(columns={0: 'Bigram'}, inplace=True)
# Attach to every row the number of times its bigram occurs overall.
df2["freq"] = df2.groupby('Bigram')['Bigram'].transform('count')
# One row per distinct bigram, ordered by bigram, with a fresh 0..n-1 index.
bigram_frequency_consecutive = (
    df2.drop_duplicates(keep="first")
    .sort_values("Bigram")
    .reset_index(drop=True)
)


For combinations (all possible bigrams within each group):

from itertools import combinations
import pandas as pd

# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 is
# the replacement (the first CSV column becomes the row index).
df = pd.read_csv("data.csv", index_col=0)

# combinations: within each (Leaf_category_id, session_id) group, build every
# unordered pair of product_ids. Each pair is sorted so that (a, b) and (b, a)
# are counted as the same bigram.
grouped_combination_product_ids = (
    df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    .apply(lambda x: [tuple(sorted(pair)) for pair in combinations(x, 2)])
    .reset_index()
)

df1 = pd.DataFrame(grouped_combination_product_ids)
# Spread each group's list of bigrams across columns, then unstack into one
# long Series; groups with fewer bigrams contribute NaN, dropped below.
s = df1.product_id.apply(lambda x: pd.Series(x)).unstack()
df2 = pd.DataFrame(s.reset_index(level=0, drop=True)).dropna()
df2.rename(columns={0: 'Bigram'}, inplace=True)
# Attach to every row the number of times its bigram occurs overall.
df2["freq"] = df2.groupby('Bigram')['Bigram'].transform('count')
# One row per distinct bigram, ordered by bigram, with a fresh 0..n-1 index.
bigram_frequency_combinations = (
    df2.drop_duplicates(keep="first")
    .sort_values("Bigram")
    .reset_index(drop=True)
)


where data.csv contains

Leaf_category_id,session_id,product_id
0,111,1,111
3,111,4,987
4,111,1,741
1,222,2,654
2,333,3,321
5,111,1,87
6,111,1,34
7,111,1,12
8,111,1,987
9,111,4,1232
10,222,2,12
11,222,2,324
12,222,2,465
13,222,2,342
14,222,2,32
15,333,3,321
16,333,3,741
17,333,3,987
18,333,3,324
19,333,3,654
20,333,3,862
21,222,1,123
22,222,1,987
23,222,1,741
24,222,1,34
25,222,1,12


The resultant bigram_frequency_consecutive will be 

         Bigram  freq
0      (12, 34)     2
1     (12, 324)     1
2     (12, 654)     1
3     (12, 987)     1
4     (32, 342)     1
5      (34, 87)     1
6     (34, 741)     1
7     (87, 741)     1
8    (111, 741)     1
9    (123, 987)     1
10   (321, 321)     1
11   (321, 741)     1
12   (324, 465)     1
13   (324, 654)     1
14   (324, 987)     1
15   (342, 465)     1
16   (654, 862)     1
17   (741, 987)     2
18  (987, 1232)     1


The resultant bigram_frequency_combinations will be 

           Bigram  freq
0      (12, 32)     1
1      (12, 34)     2
2      (12, 87)     1
3     (12, 111)     1
4     (12, 123)     1
5     (12, 324)     1
6     (12, 342)     1
7     (12, 465)     1
8     (12, 654)     1
9     (12, 741)     2
10    (12, 987)     2
11    (32, 324)     1
12    (32, 342)     1
13    (32, 465)     1
14    (32, 654)     1
15     (34, 87)     1
16    (34, 111)     1
17    (34, 123)     1
18    (34, 741)     2
19    (34, 987)     2
20    (87, 111)     1
21    (87, 741)     1
22    (87, 987)     1
23   (111, 741)     1
24   (111, 987)     1
25   (123, 741)     1
26   (123, 987)     1
27   (321, 321)     1
28   (321, 324)     2
29   (321, 654)     2
30   (321, 741)     2
31   (321, 862)     2
32   (321, 987)     2
33   (324, 342)     1
34   (324, 465)     1
35   (324, 654)     2
36   (324, 741)     1
37   (324, 862)     1
38   (324, 987)     1
39   (342, 465)     1
40   (342, 654)     1
41   (465, 654)     1
42   (654, 741)     1
43   (654, 862)     1
44   (654, 987)     1
45   (741, 862)     1
46   (741, 987)     3
47   (862, 987)     1
48  (987, 1232)     1


In the above case it groups by both Leaf_category_id and session_id.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  北海茫月        
                
              
                            
                2021-01-15 09:14
              
            
            
                                                                       
We are going to pull out the values from product_id, create bigrams that are sorted and thus deduplicated, and count them to get the frequency, and then populate a data frame.

from collections import Counter

# Assumes your data frame is called 'df', pandas is imported as 'pd', and
# product_id holds one scalar id per row.
# NOTE: bigrams are taken over the whole column in row order; rows are not
# grouped by session or category here.

product_ids = df.product_id.values.tolist()
# Pair each id with the one on the following row.
bigrams = list(zip(product_ids, product_ids[1:]))
# Sort within each pair so (a, b) and (b, a) count as the same bigram.
bigram_set = [tuple(sorted(pair)) for pair in bigrams]
freq_dict = Counter(bigram_set)
# .items() yields (bigram, count) rows; iterating the Counter directly would
# yield only the keys and lose the counts.
df_freq = pd.DataFrame(list(freq_dict.items()), columns=['bigram', 'freq'])

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复