create a bigram from a column in pandas df

前端 未结 2 1157
你的背包
你的背包 2021-01-15 08:21

I have this test table in a pandas DataFrame:

   Leaf_category_id  session_id  product_id
0               111           1         987
3               111                


        
相关标签:
2条回答
  • 2021-01-15 09:02

    try this code

    from collections import Counter
    from itertools import combinations
    import pandas as pd

    # DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
    # keeps the first CSV column as the row index, as from_csv did by default.
    df = pd.read_csv("data.csv", index_col=0)

    #consecutive
    def _consecutive_pairs(product_ids):
        """Return the sorted (a, b) tuple for every pair of adjacent ids in one group."""
        # Work on a plain list so that [1:] is always a positional slice,
        # regardless of the labels in the group's index.
        ids = product_ids.tolist()
        return [tuple(sorted(pair)) for pair in zip(ids, ids[1:])]

    # One row per (Leaf_category_id, session_id) group; the product_id column
    # holds that group's list of bigrams (empty for single-row groups).
    grouped_consecutive_product_ids = (
        df.groupby(['Leaf_category_id', 'session_id'])['product_id']
        .apply(_consecutive_pairs)
        .reset_index()
    )

    # Flatten the per-group lists and count how often each bigram occurs overall.
    bigram_counts = Counter(
        pair
        for pairs in grouped_consecutive_product_ids['product_id']
        for pair in pairs
    )

    # One row per distinct bigram, ordered by the bigram tuple.
    bigram_frequency_consecutive = pd.DataFrame(
        sorted(bigram_counts.items()), columns=['Bigram', 'freq']
    )
    

    For combinations (all possible bigrams within each group):

    from collections import Counter
    from itertools import combinations
    import pandas as pd

    # DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
    # keeps the first CSV column as the row index, as from_csv did by default.
    df = pd.read_csv("data.csv", index_col=0)

    #combinations
    def _all_pairs(product_ids):
        """Return the sorted (a, b) tuple for every 2-combination of ids in one group."""
        return [tuple(sorted(pair)) for pair in combinations(product_ids.tolist(), 2)]

    # One row per (Leaf_category_id, session_id) group; the product_id column
    # holds that group's list of bigrams (empty for single-row groups).
    grouped_combination_product_ids = (
        df.groupby(['Leaf_category_id', 'session_id'])['product_id']
        .apply(_all_pairs)
        .reset_index()
    )

    # Flatten the per-group lists and count how often each bigram occurs overall.
    bigram_counts = Counter(
        pair
        for pairs in grouped_combination_product_ids['product_id']
        for pair in pairs
    )

    # One row per distinct bigram, ordered by the bigram tuple.
    bigram_frequency_combinations = pd.DataFrame(
        sorted(bigram_counts.items()), columns=['Bigram', 'freq']
    )
    

    where data.csv contains

    Leaf_category_id,session_id,product_id
    0,111,1,111
    3,111,4,987
    4,111,1,741
    1,222,2,654
    2,333,3,321
    5,111,1,87
    6,111,1,34
    7,111,1,12
    8,111,1,987
    9,111,4,1232
    10,222,2,12
    11,222,2,324
    12,222,2,465
    13,222,2,342
    14,222,2,32
    15,333,3,321
    16,333,3,741
    17,333,3,987
    18,333,3,324
    19,333,3,654
    20,333,3,862
    21,222,1,123
    22,222,1,987
    23,222,1,741
    24,222,1,34
    25,222,1,12
    

    The resultant bigram_frequency_consecutive will be

             Bigram  freq
    0      (12, 34)     2
    1     (12, 324)     1
    2     (12, 654)     1
    3     (12, 987)     1
    4     (32, 342)     1
    5      (34, 87)     1
    6     (34, 741)     1
    7     (87, 741)     1
    8    (111, 741)     1
    9    (123, 987)     1
    10   (321, 321)     1
    11   (321, 741)     1
    12   (324, 465)     1
    13   (324, 654)     1
    14   (324, 987)     1
    15   (342, 465)     1
    16   (654, 862)     1
    17   (741, 987)     2
    18  (987, 1232)     1
    

    The resultant bigram_frequency_combinations will be

               Bigram  freq
    0      (12, 32)     1
    1      (12, 34)     2
    2      (12, 87)     1
    3     (12, 111)     1
    4     (12, 123)     1
    5     (12, 324)     1
    6     (12, 342)     1
    7     (12, 465)     1
    8     (12, 654)     1
    9     (12, 741)     2
    10    (12, 987)     2
    11    (32, 324)     1
    12    (32, 342)     1
    13    (32, 465)     1
    14    (32, 654)     1
    15     (34, 87)     1
    16    (34, 111)     1
    17    (34, 123)     1
    18    (34, 741)     2
    19    (34, 987)     2
    20    (87, 111)     1
    21    (87, 741)     1
    22    (87, 987)     1
    23   (111, 741)     1
    24   (111, 987)     1
    25   (123, 741)     1
    26   (123, 987)     1
    27   (321, 321)     1
    28   (321, 324)     2
    29   (321, 654)     2
    30   (321, 741)     2
    31   (321, 862)     2
    32   (321, 987)     2
    33   (324, 342)     1
    34   (324, 465)     1
    35   (324, 654)     2
    36   (324, 741)     1
    37   (324, 862)     1
    38   (324, 987)     1
    39   (342, 465)     1
    40   (342, 654)     1
    41   (465, 654)     1
    42   (654, 741)     1
    43   (654, 862)     1
    44   (654, 987)     1
    45   (741, 862)     1
    46   (741, 987)     3
    47   (862, 987)     1
    48  (987, 1232)     1
    

    In the above case it groups by both Leaf_category_id and session_id.

    0 讨论(0)
  • 2021-01-15 09:14

    We are going to pull out the values from product_id, create bigrams that are sorted and thus deduplicated, and count them to get the frequency, and then populate a data frame.

    from collections import Counter

    import pandas as pd

    # assuming your data frame is called 'df'

    # NOTE(review): zip(x, x[1:]) needs every entry of df.product_id to be a
    # sliceable sequence of ids (e.g. one list per session); scalar ids would
    # raise a TypeError here -- confirm the column layout before using this.
    bigrams = [list(zip(x, x[1:])) for x in df.product_id.values.tolist()]

    # Sort each pair so (a, b) and (b, a) count as the same bigram.
    # (The original line was missing the closing parenthesis after sorted(xx).)
    bigram_set = [tuple(sorted(xx)) for x in bigrams for xx in x]
    freq_dict = Counter(bigram_set)

    # Iterate over items(), not the Counter itself: iterating a Counter yields
    # only the keys, so the 'freq' column would never receive the counts.
    df_freq = pd.DataFrame(list(freq_dict.items()), columns=['bigram', 'freq'])
    
    0 讨论(0)
提交回复
热议问题