Constructing a co-occurrence matrix in python pandas

后端 未结 4 1649
南笙
南笙 2020-11-28 04:29

I know how to do this in R. But, is there any function in pandas that transforms a dataframe to an nxn co-occurrence matrix containing the counts of two aspects co-occurring

相关标签:
4条回答
  • 2020-11-28 04:49

    In case that you have larger corpus and term-frequency matrix, using sparse matrix multiplication might be more efficient. I use the same trick of matrix multiplication refered to algo answer on this page.

    import scipy.sparse as sp
    X = sp.csr_matrix(df.astype(int).values) # convert dataframe to sparse matrix
    Xc = X.T * X # multiply sparse matrix # 
    Xc.setdiag(0) # reset diagonal
    print(Xc.todense()) # to print co-occurence matrix in dense format
    

    Xc here will be the co-occurence matrix in sparse csr format

    0 讨论(0)
  • 2020-11-28 04:55

    It's a simple linear algebra, you multiply matrix with its transpose (your example contains strings, don't forget to convert them to integer):

    >>> df_asint = df.astype(int)
    >>> coocc = df_asint.T.dot(df_asint)
    >>> coocc
           Dop  Snack  Trans
    Dop      4      2      3
    Snack    2      3      2
    Trans    3      2      4
    

    if, as in R answer, you want to reset diagonal, you can use numpy's fill_diagonal:

    >>> import numpy as np
    >>> np.fill_diagonal(coocc.values, 0)
    >>> coocc
           Dop  Snack  Trans
    Dop      0      2      3
    Snack    2      0      2
    Trans    3      2      0
    
    0 讨论(0)
  • 2020-11-28 05:03

    Demo in NumPy:

    import numpy as np
    np.random.seed(3) # for reproducibility
    
    # Generate data: 5 labels, 10 examples, binary.
    label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
    label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.
    print('labels:\n{0}'.format(label_data))
    
    # Compute cooccurrence matrix 
    cooccurrence_matrix = np.dot(label_data.transpose(),label_data)
    print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix)) 
    
    # Compute cooccurrence matrix in percentage
    # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
    #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
    cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
    with np.errstate(divide='ignore', invalid='ignore'):
        cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
    print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))
    

    Output:

    labels:
    [[0 0 1 1 0]
     [0 0 1 1 1]
     [0 1 1 1 0]
     [1 1 0 0 0]
     [0 1 1 0 0]
     [0 1 0 0 0]
     [0 1 0 1 1]
     [0 1 0 0 1]
     [1 0 0 1 0]
     [1 0 1 1 1]]
    
    cooccurrence_matrix:
    [[3 1 1 2 1]
     [1 6 2 2 2]
     [1 2 5 4 2]
     [2 2 4 6 3]
     [1 2 2 3 4]]
    
    cooccurrence_matrix_percentage:
    [[ 1.          0.33333333  0.33333333  0.66666667  0.33333333]
     [ 0.16666667  1.          0.33333333  0.33333333  0.33333333]
     [ 0.2         0.4         1.          0.8         0.4       ]
     [ 0.33333333  0.33333333  0.66666667  1.          0.5       ]
     [ 0.25        0.5         0.5         0.75        1.        ]]
    

    With a heatmap using matplotlib:

    import numpy as np
    np.random.seed(3) # for reproducibility
    
    import matplotlib.pyplot as plt
    
    
    def show_values(pc, fmt="%.2f", **kw):
        '''
        Heatmap with text in each cell with matplotlib's pyplot
        Source: http://stackoverflow.com/a/25074150/395857 
        By HYRY
        '''
        from itertools import izip
        pc.update_scalarmappable()
        ax = pc.get_axes()
        for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
            x, y = p.vertices[:-2, :].mean(0)
            if np.all(color[:3] > 0.5):
                color = (0.0, 0.0, 0.0)
            else:
                color = (1.0, 1.0, 1.0)
            ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)
    
    def cm2inch(*tupl):
        '''
        Specify figure size in centimeter in matplotlib
        Source: http://stackoverflow.com/a/22787457/395857
        By gns-ank
        '''
        inch = 2.54
        if type(tupl[0]) == tuple:
            return tuple(i/inch for i in tupl[0])
        else:
            return tuple(i/inch for i in tupl)
    
    def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):
        '''
        Inspired by:
        - http://stackoverflow.com/a/16124677/395857 
        - http://stackoverflow.com/a/25074150/395857
        '''
    
        # Plot it out
        fig, ax = plt.subplots()    
        c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)
    
        # put the major ticks at the middle of each cell
        ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
        ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)
    
        # set tick labels
        #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)
        ax.set_xticklabels(xticklabels, minor=False)
        ax.set_yticklabels(yticklabels, minor=False)
    
        # set title and x/y labels
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)      
    
        # Remove last blank column
        plt.xlim( (0, AUC.shape[1]) )
    
        # Turn off all the ticks
        ax = plt.gca()    
        for t in ax.xaxis.get_major_ticks():
            t.tick1On = False
            t.tick2On = False
        for t in ax.yaxis.get_major_ticks():
            t.tick1On = False
            t.tick2On = False
    
        # Add color bar
        plt.colorbar(c)
    
        # Add text in each cell 
        show_values(c)
    
        # Proper orientation (origin at the top left instead of bottom left)
        ax.invert_yaxis()
        ax.xaxis.tick_top()
    
        # resize 
        fig = plt.gcf()
        fig.set_size_inches(cm2inch(40, 20))
    
    
    
    def main():
    
        # Generate data: 5 labels, 10 examples, binary.
        label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
        label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.
        print('labels:\n{0}'.format(label_data))
    
        # Compute cooccurrence matrix 
        cooccurrence_matrix = np.dot(label_data.transpose(),label_data)
        print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix)) 
    
        # Compute cooccurrence matrix in percentage
        # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
        #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
        cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
        with np.errstate(divide='ignore', invalid='ignore'):
            cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
        print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))
    
        # Add count in labels
        label_header_with_count = [ '{0} ({1})'.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)]  
        print('\nlabel_header_with_count: {0}'.format(label_header_with_count))
    
        # Plotting
        x_axis_size = cooccurrence_matrix_percentage.shape[0]
        y_axis_size = cooccurrence_matrix_percentage.shape[1]
        title = "Co-occurrence matrix\n"
        xlabel= ''#"Labels"
        ylabel= ''#"Labels"
        xticklabels = label_header_with_count
        yticklabels = label_header_with_count
        heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)
        plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight') # use format='svg' or 'pdf' for vectorial pictures
        #plt.show()
    
    
    if __name__ == "__main__":
        main()
        #cProfile.run('main()') # if you want to do some profiling
    

    (PS: a neat visualization of a co-occurrence matrix in D3.js.)

    0 讨论(0)
  • 2020-11-28 05:08

    To further elaborate this question, If you want to construct co-occurrence matrix from sentences you can do this:

    import numpy as np
    import pandas as pd
    
    def create_cooccurrence_matrix(sentences, window_size=2):
        """Create co occurrence matrix from given list of sentences.
    
        Returns:
        - vocabs: dictionary of word counts
        - co_occ_matrix_sparse: sparse co occurrence matrix
    
        Example:
        ===========
        sentences = ['I love nlp',    'I love to learn',
                     'nlp is future', 'nlp is cool']
    
        vocabs,co_occ = create_cooccurrence_matrix(sentences)
    
        df_co_occ  = pd.DataFrame(co_occ.todense(),
                                  index=vocabs.keys(),
                                  columns = vocabs.keys())
    
        df_co_occ = df_co_occ.sort_index()[sorted(vocabs.keys())]
    
        df_co_occ.style.applymap(lambda x: 'color: red' if x>0 else '')
    
        """
        import scipy
        import nltk
    
        vocabulary = {}
        data = []
        row = []
        col = []
    
        tokenizer = nltk.tokenize.word_tokenize
    
        for sentence in sentences:
            sentence = sentence.strip()
            tokens = [token for token in tokenizer(sentence) if token != u""]
            for pos, token in enumerate(tokens):
                i = vocabulary.setdefault(token, len(vocabulary))
                start = max(0, pos-window_size)
                end = min(len(tokens), pos+window_size+1)
                for pos2 in range(start, end):
                    if pos2 == pos:
                        continue
                    j = vocabulary.setdefault(tokens[pos2], len(vocabulary))
                    data.append(1.)
                    row.append(i)
                    col.append(j)
    
        cooccurrence_matrix_sparse = scipy.sparse.coo_matrix((data, (row, col)))
        return vocabulary, cooccurrence_matrix_sparse
    
    

    Usage:

    sentences = ['I love nlp',    'I love to learn',
                 'nlp is future', 'nlp is cool']
    
    vocabs,co_occ = create_cooccurrence_matrix(sentences)
    
    df_co_occ  = pd.DataFrame(co_occ.todense(),
                              index=vocabs.keys(),
                              columns = vocabs.keys())
    
    df_co_occ = df_co_occ.sort_index()[sorted(vocabs.keys())]
    
    df_co_occ.style.applymap(lambda x: 'color: red' if x>0 else '')
    
    # If not in jupyter notebook, print(df_co_occ)
    

    output

    0 讨论(0)
提交回复
热议问题