Co-occurrence matrix from nested list of words

前端 未结 8 769
无人及你
无人及你 2020-11-30 10:27

I have a list of names like:

names = ['A', 'B', 'C', 'D']

and a list of documents, that in each documents some of these names are m

相关标签:
8条回答
  • 2020-11-30 10:30

    Obviously this can be extended for your purposes, but it performs the general operation you have in mind:

    import math
    
    # For every ordered pair of names, tally how the documents pair them up.
    for a in 'ABCD':
        for b in 'ABCD':
            count = 0

            for x in document:
                if a != b:
                    # Distinct names: the document contributes once if it holds both.
                    if a in x and b in x:
                        count += 1
                else:
                    # Same name: count the unordered pairs among its n occurrences,
                    # n! / (n - 2)! / 2 == n * (n - 1) / 2. Integer arithmetic keeps
                    # the tally an int on Python 3 (true division would give a float).
                    n = x.count(a)
                    if n >= 2:
                        count += n * (n - 1) // 2

            # print() as a function: the statement form is a SyntaxError on Python 3.
            print('{} x {} = {}'.format(a, b, count))
    
    0 讨论(0)
  • 2020-11-30 10:31

    Another option is to use the constructor csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)]) from scipy.sparse.csr_matrix where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].

    The trick is to generate row_ind and col_ind by iterating over the documents and creating a list of tuples (doc_id, word_id). data would simply be a vector of ones of the same length.

    Multiplying the transpose of the docs-words matrix by the matrix itself gives you the co-occurrence matrix.

    Additionally, this is efficient in terms of both run times and memory usage, so it should also handle big corpuses.

    import numpy as np
    import itertools
    from scipy.sparse import csr_matrix
    
    
    def create_co_occurences_matrix(allowed_words, documents):
        """Build a word-by-word co-occurrence matrix over ``documents``.

        A sparse documents-by-words count matrix is built first and then
        multiplied by its own transpose, so entry ``(i, j)`` of the result sums,
        over all documents, the product of the occurrence counts of word ``i``
        and word ``j``. The diagonal is set to zero.

        Args:
            allowed_words: Sequence of words to track. A word's position in this
                sequence is its row/column index in the result.
            documents: Iterable of documents, each an iterable of words. Words
                that are not in ``allowed_words`` are ignored.

        Returns:
            A ``(words_cooc_matrix, word_to_id)`` tuple. ``words_cooc_matrix`` is
            a SciPy sparse matrix of shape
            ``(len(allowed_words), len(allowed_words))``; ``word_to_id`` maps each
            allowed word to its index in that matrix.
        """
        print(f"allowed_words:\n{allowed_words}")
        print(f"documents:\n{documents}")
        word_to_id = dict(zip(allowed_words, range(len(allowed_words))))
        documents_as_ids = [[word_to_id[w] for w in doc if w in word_to_id] for doc in documents]

        # Build the coordinate lists directly; unlike zip(*pairs) unpacking this
        # also works when no allowed word occurs in any document.
        row_ind = np.array([i for i, doc in enumerate(documents_as_ids) for _ in doc], dtype=np.int64)
        col_ind = np.array([w for doc in documents_as_ids for w in doc], dtype=np.int64)
        data = np.ones(len(row_ind), dtype='uint32')  # use unsigned int for better memory utilization

        # Size the word axis from the vocabulary, not from the largest id seen,
        # so every index in word_to_id is valid in the returned matrix.
        n_words = len(allowed_words)
        docs_words_matrix = csr_matrix(
            (data, (row_ind, col_ind)),
            shape=(len(documents_as_ids), n_words),
        )  # efficient arithmetic operations with CSR * CSR

        # (words x docs) * (docs x words) -> (words x words) co-occurrence counts
        words_cooc_matrix = docs_words_matrix.T * docs_words_matrix
        words_cooc_matrix.setdiag(0)  # a word does not co-occur with itself
        print(f"words_cooc_matrix:\n{words_cooc_matrix.todense()}")
        return words_cooc_matrix, word_to_id
    

    Run example:

    allowed_words = ['A', 'B', 'C', 'D']
    documents = [['A', 'B'], ['C', 'B', 'K'],['A', 'B', 'C', 'D', 'Z']]
    words_cooc_matrix, word_to_id = create_co_occurences_matrix(allowed_words, documents)
    

    Output:

    allowed_words:
    ['A', 'B', 'C', 'D']
    
    documents:
    [['A', 'B'], ['C', 'B', 'K'], ['A', 'B', 'C', 'D', 'Z']]
    
    words_cooc_matrix:
    [[0 2 1 1]
     [2 0 2 1]
     [1 2 0 1]
     [1 1 1 0]]
    
    0 讨论(0)
  • 2020-11-30 10:32

    '''Uses a window of 5 words on each side of the target word. data_corpus is the series of text data; words is the list of words for which the co-occurrence matrix is built.'''

    "co_oc is the co-occurrence matrix"

    # Start from zeros: a DataFrame created without data is all NaN, and
    # NaN + 1 stays NaN, so no count would ever be recorded.
    co_oc = pd.DataFrame(0, index=words, columns=words)

    # Number of neighbours inspected on each side of the target word.
    window = 5
    # Built once so membership tests are O(1); assumes `words` is a list of strings.
    word_set = set(words)

    for j in tqdm(data_corpus):

        k = j.split()

        for l, target in enumerate(k):
            if target not in word_set:
                continue

            # Clamp the window to the document so that short documents neither
            # wrap around through negative indices nor run past the end.
            start = max(0, l - window)
            stop = min(len(k), l + window + 1)

            for m in range(start, stop):
                if m == l:
                    continue
                if k[m] in word_set:
                    # Single .loc update (row = neighbour, column = target) instead
                    # of chained indexing, which may write to a temporary copy.
                    co_oc.loc[k[m], target] += 1
    print(co_oc.head())
    
    0 讨论(0)
  • 2020-11-30 10:35

    You can also use matrix operations to find the co-occurrence matrix. This should work well when you have a bigger vocabulary.

    import scipy.sparse as sp
    # Map each name to its column index in the document-by-name matrix.
    voc2id = dict(zip(names, range(len(names))))
    rows, cols, vals = [], [], []
    for r, d in enumerate(document):
        for e in d:
            # Entries that are not in `names` are skipped.
            if e in voc2id:
                rows.append(r)
                cols.append(voc2id[e])
                vals.append(1)
    # Pass the shape explicitly: without it the size is inferred from the largest
    # indices seen, so names that never occur (or documents with no known names)
    # would shrink the matrix, and an empty index list could not be sized at all.
    X = sp.csr_matrix((vals, (rows, cols)), shape=(len(document), len(names)))
    

    Now you can find the co-occurrence matrix by simply multiplying X.T by X:

    # Co-occurrence matrix: product of the transposed matrix with the matrix itself
    Xc = X.T @ X
    # Zero the diagonal
    Xc.setdiag(0)
    print(Xc.toarray())
    
    0 讨论(0)
  • 2020-11-30 10:36

    Here is another solution using itertools and the Counter class from the collections module.

    import numpy
    import itertools
    from collections import Counter
    
    document = [['A', 'B'], ['C', 'B'], ['A', 'B', 'C', 'D']]

    # Vocabulary: every distinct entry across all documents, in sorted order
    varnames = tuple(sorted({entry for doc in document for entry in doc}))

    # Every 2-combination inside each document, with each pair sorted so that
    # (A, B) and (B, A) land on the same key
    expanded = [
        tuple(sorted(pair))
        for doc in document
        for pair in itertools.combinations(doc, 2)
    ]

    # Tally how often each normalised pair occurs
    c = Counter(expanded)

    # Fill a symmetric table, visiting each unordered pair of names once
    size = len(varnames)
    table = numpy.zeros((size, size), dtype=int)

    for i in range(size):
        for j in range(i, size):
            pair_count = c[varnames[i], varnames[j]]
            table[i, j] = pair_count
            table[j, i] = pair_count

    # Show the table row by row
    for row in table:
        print(row)
    

    The output (which could easily be turned into a DataFrame) is:

    [0 2 1 1]
    [2 0 2 1]
    [1 2 0 1]
    [1 1 1 0]
    
    0 讨论(0)
  • 2020-11-30 10:41
    from collections import OrderedDict
    
    document = [['A', 'B'], ['C', 'B'], ['A', 'B', 'C', 'D']]
    names = ['A', 'B', 'C', 'D']

    # Nested ordered mapping: occurrences[row][column] starts at zero for every pair of names
    occurrences = OrderedDict()
    for row_name in names:
        occurrences[row_name] = OrderedDict.fromkeys(names, 0)

    # For each position in a document, credit every entry found at a different position
    for entries in document:
        for pos, current in enumerate(entries):
            for other_pos, other in enumerate(entries):
                if other_pos != pos:
                    occurrences[current][other] += 1

    # Header row first, then one line per name with its counts
    print(' ', ' '.join(occurrences.keys()))
    for row_name, counts in occurrences.items():
        print(row_name, ' '.join(str(value) for value in counts.values()))
    

    Output:

      A B C D
    A 0 2 1 1 
    B 2 0 2 1 
    C 1 2 0 1 
    D 1 1 1 0 
    
    0 讨论(0)
提交回复
热议问题