PyMC3 how to implement latent dirichlet allocation?

半腔热情 提交于 2019-12-11 02:19:24

问题


I am trying to implement lda using PyMC3.

However, when defining the last part of the model in which words are sampled based on their topics, I keep getting the error: TypeError: list indices must be integers, not TensorVariable

How to tackle the problem?

The code is as follows:

## Data Preparation

K = 2 # number of topics
N = 4 # number of words
D = 3 # number of documents

import numpy as np

# Each row is one document; each entry is a word id in {0, ..., N-1}.
data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])
Wd = [len(doc) for doc in data]  # length of each document

## Model Specification

from pymc3 import Model, Normal, HalfNormal, Dirichlet, Categorical, constant

lda_model = Model()

with lda_model:

    # Priors for unknown model parameters
    alpha = HalfNormal('alpha', sd=1)
    eta = HalfNormal('eta', sd=1)

    # Symmetric Dirichlet concentration vectors built from the scalar priors.
    a1 = eta*np.ones(shape=N)
    a2 = alpha*np.ones(shape=K)

    # beta[k]: word distribution of topic k; theta[d]: topic mixture of document d.
    beta = [Dirichlet('beta_%i' % i, a1, shape=N) for i in range(K)]
    theta = [Dirichlet('theta_%s' % i, a2, shape=K) for i in range(D)]

    # z[d]: per-position topic assignments for document d (one draw per word).
    z = [Categorical('z_%i' % d, p = theta[d], shape=Wd[d]) for d in range(D)]

    # That's when you get the error. It is caused by: beta[z[d][w]]
    # beta is a plain python list, so indexing it with the symbolic
    # TensorVariable z[d][w] raises the TypeError reported above.
    # NOTE(review): observed = data[i,j] references undefined i and j
    # (presumably data[d, w] was intended), and the comprehension variable
    # w shadows the list being assigned — both part of the asker's bug.
    w = [Categorical('w_%i_%i' % (d, w), p = beta[z[d][w]], observed = data[i,j]) for d in range(D) for w in range(Wd[d])]

Any help would be much appreciated!


回答1:


beta[z[d][w]] is naturally incorrect, because z[d][w] is a random variable stored by PyMC rather than a fixed integer index.

In pymc2 it is solved by lambda function

p=pm.Lambda("phi_z_%s_%s" % (d,i), 
             lambda z=z[d][w], beta=beta: beta[z])

In pymc3 it is supposed to be solved with

@theano.compile.ops.as_op
def your_function

But there is a problem here: it seems that Theano doesn't allow passing a python list of pymc variables, so t.lvector basically doesn't work.

More discussion is in this question: Unable to create lambda function in hierarchical pymc3 model




回答2:


Check out this blog post. I haven't tested it myself.

 import numpy as np  
 import pymc as pc  


 def wordDict(collection):  
  word_id  = {}  
  idCounter = 0  
  for d in collection:  
    for w in d:  
      if (w not in word_id):  
        word_id[w] = idCounter  
        idCounter+=1  
  return word_id  

 def toNpArray(word_id, collection):  
  ds = []  
  for d in collection:  
    ws = []  
    for w in d:  
      ws.append(word_id.get(w,0))  
    ds.append(ws)  
  return np.array(ds)  

 ###################################################  

 #doc1, doc2, ..., doc7  
 docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],  
         ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],  
         ["sepak","bola","sepak","bola","sepak","sepak"],  
         ["ekonomi","ekonomi","uang","uang"],  
         ["sepak","uang","ekonomi"],  
         ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],  
         ["teknologi","komputer","teknologi"]]  

 word_dict = wordDict(docs)  
 collection = toNpArray(word_dict,docs)  

 #number of topics  
 K = 3  

 #number of words (vocab)  
 V = len(word_dict)  

 #number of documents  
 D = len(collection)  

 #array([1, 1, 1, ..., 1]) K times — symmetric Dirichlet prior over topics  
 alpha = np.ones(K)  

 #array([1, 1, 1, ..., 1]) V times — symmetric Dirichlet prior over words  
 beta = np.ones(V)  

 #array containing the information about doc length in our collection
 Nd = [len(doc) for doc in collection]  


 ######################## LDA model ##################################  

 #topic distribution per-document  
 #pc.Container groups a python list of pymc variables into one node;  
 #CompletedDirichlet restores the last simplex coordinate that  
 #pc.Dirichlet drops, giving a full probability vector.  
 theta = pc.Container([pc.CompletedDirichlet("theta_%s" % i,   
                                             pc.Dirichlet("ptheta_%s"%i, theta=alpha))  
                      for i in range(D)])  

 #word distribution per-topic  
 phi = pc.Container([pc.CompletedDirichlet("phi_%s" % j,   
                                           pc.Dirichlet("pphi_%s" % j, theta=beta))  
                     for j in range(K)])  


 #Please note that this is the tricky part :)  
 #z_d holds one topic assignment per word position; `value` seeds the  
 #sampler with random topics in [0, K).  
 z = pc.Container([pc.Categorical("z_%i" % d,  
                                  p = theta[d],  
                                  size = Nd[d],  
                                  value = np.random.randint(K, size=Nd[d]))   
                   for d in range(D)])  

 #word generated from phi, given a topic z  
 #pc.Lambda defers the list indexing phi[z] until z has a concrete  
 #integer value during sampling — this is how pymc2 works around the  
 #"list indices must be integers" problem from the question.  
 w = pc.Container([pc.Categorical("w_%i_%i" % (d,i),  
                                  p = pc.Lambda("phi_z_%i_%i" % (d,i),  
                                                lambda z=z[d][i], phi=phi : phi[z]),
                                  value=collection[d][i],  
                                  observed=True)  
                   for d in range(D) for i in range(Nd[d])])  

 ####################################################################  

 model = pc.Model([theta, phi, z, w])  
 mcmc = pc.MCMC(model)  
 mcmc.sample(iter=5000, burn=1000)  


 #show the topic assignment for each word, using the last trace  
 #(index 3999 = last retained sample: 5000 iterations minus 1000 burn-in)  
 for d in range(D):  
    print(mcmc.trace('z_%i'%d)[3999])  



回答3:


The following code was adapted from what has been referenced by @Hanan. I've somehow made it work with pymc3.

import numpy as np
import pymc3 as pm

def get_word_dict(collection):
    """Map each distinct word in *collection* to a unique index in [0, V)."""
    # De-duplicate into a set, then pair each word with its position via
    # enumerate — same set, hence the same iteration order as the original
    # zip-of-parallel-lists construction.
    vocab = {word for doc in collection for word in doc}
    return {word: idx for idx, word in enumerate(vocab)}

def word_to_idx(dict_vocab_idx, collection):
    """Replace every word in *collection* with its index from *dict_vocab_idx*.

    Raises KeyError if a word is missing from the vocabulary mapping.
    """
    indexed_docs = []
    for doc in collection:
        indexed_docs.append([dict_vocab_idx[word] for word in doc])
    return indexed_docs

docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],  
         ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],  
         ["sepak","bola","sepak","bola","sepak","sepak"],  
         ["ekonomi","ekonomi","uang","uang"],  
         ["sepak","uang","ekonomi"],  
         ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],  
         ["teknologi","komputer","teknologi"]]  

# Build the vocabulary and convert each document to a list of word ids.
dict_vocab_idx = get_word_dict(docs)
idxed_collection = word_to_idx(dict_vocab_idx, docs)

n_topics = 3
n_vocab = len(dict_vocab_idx)
n_docs = len(idxed_collection)
length_docs = [len(doc) for doc in idxed_collection]  # words per document

# Symmetric Dirichlet hyperparameters: one row per document / per topic.
alpha = np.ones([n_docs, n_topics])
beta = np.ones([n_topics, n_vocab])

with pm.Model() as model:
    # theta[d]: topic mixture of document d; phi[k]: word distribution of
    # topic k — stacked into single matrix-valued Dirichlet variables.
    theta = pm.distributions.Dirichlet('theta', a=alpha, shape=(n_docs, n_topics))
    phi = pm.distributions.Dirichlet('phi', a=beta, shape=(n_topics, n_vocab))
    # z_d{d}: per-position topic assignments for document d.
    zs = [pm.Categorical("z_d{}".format(d), p=theta[d], shape=length_docs[d]) for d in range(n_docs)]
    # Key difference from the question: phi is a theano tensor, so indexing
    # it with the random variable zs[d][i] is legal symbolic indexing.
    ws = [pm.Categorical("w_{}_{}".format(d,i), p=phi[zs[d][i]], observed=idxed_collection[d][i]) 
    for d in range(n_docs) for i in range(length_docs[d])]
    trace = pm.sample(2000)

# Print the topic assigned to each word, taken from the last sample.
for d in range(n_docs):
    value_z=trace.get_values("z_d{}".format(d))
    print(value_z[1999])


来源:https://stackoverflow.com/questions/31473459/pymc3-how-to-implement-latent-dirichlet-allocation

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!