from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import csv
import jieba
import codecs
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaModel
def is_number(s):
    """Return True if *s* can be interpreted as a number, else False.

    Two checks are tried in order:

    1. ``float(s)`` -- covers ordinary numeric literals such as ``"12"``,
       ``"-3.5"`` or ``"1e3"``.  Note that ``float`` also accepts the
       spellings ``"nan"`` and ``"inf"``, so those count as numbers too.
    2. ``unicodedata.numeric(s)`` -- covers a single Unicode character that
       carries a numeric value (for example a CJK numeral or a vulgar
       fraction).

    Args:
        s: The value to test, normally a string token.

    Returns:
        bool: True when either check succeeds, False otherwise.  Values
        that are not strings or numbers (``None``, lists, ...) yield False
        instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not a string/number at all (e.g. None).
        # ValueError: s is a string that is not a float literal.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: s is not exactly one character.
        # ValueError: the character has no numeric value.
        pass

    return False
# Non-empty values of the ninth CSV column, one entry per data row.
# Filled in place by data_g().
info = []


def data_g(filename):
    """Append the ninth column of every data row in *filename* to ``info``.

    The first row is treated as a header and skipped.  Rows that have fewer
    than nine columns (including blank lines) and rows whose ninth column is
    empty are ignored.

    Args:
        filename: Path of the CSV file to read.  The file is opened with the
            platform default text encoding.

    Returns:
        None.  Results are accumulated in the module-level ``info`` list.
    """
    # newline='' is what the csv module expects so that it can handle line
    # endings embedded in quoted fields itself.
    with open(filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # skip the header row, if any
        for row in csv_reader:
            if len(row) > 8 and row[8]:
                info.append(row[8])
# Load the source documents from the CSV export into ``info``.
data_g('C:\\Users\\imac\\Desktop\\2018\\bigdata\\py\\data.csv')

# Read the GBK-encoded stop-word file, one entry per line.  The context
# manager makes sure the handle is closed once the lines are read.
with codecs.open('C:\\Users\\imac\\Desktop\\2018\\bigdata\\py\\stop.txt',
                 'r', encoding='GBK') as stop_file:
    stopwords = stop_file.readlines()

# Built-in stop words: punctuation/whitespace plus a few hand-picked terms.
stw = [',', '\n', ' ', '―', '副', '专业', '学习', '主任', '市', "委员",
       "书记", "其间", "干部", "成员", "工作", "政府"]
for wd in stopwords:
    # Strip only the line terminator.  Slicing off a fixed two characters
    # would damage entries that end in a bare '\n' or have no terminator
    # at all (typically the last line of the file).
    stw.append(wd.rstrip('\r\n'))
# Tokenized documents: one list of kept tokens per entry of ``info``.
# Filled in place by data_p().
train = []


def data_p():
    """Segment every document in ``info`` and append its tokens to ``train``.

    Each document is segmented with ``jieba.cut``.  A token is kept only if
    it is not a stop word (not in ``stw``) and ``is_number`` rejects it.
    One token list is appended to ``train`` per document, so ``train`` stays
    aligned with ``info``.

    Returns:
        None.  Results are accumulated in the module-level ``train`` list.
    """
    # Build the lookup set once: testing membership in the ``stw`` list
    # would scan the whole list for every single token.
    blocked = frozenset(stw)
    for line in info:
        train.append(
            [w for w in jieba.cut(line) if w not in blocked and not is_number(w)]
        )
# Tokenize every loaded document into ``train`` and dump the result.
data_p()
print(train)

# Build the token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(train)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in train]
# generate LDA model
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

# Fetch the topic summaries once and reuse them for the type, the count and
# the listing instead of asking the model three times.
topic_list = lda.print_topics(20)
print(type(topic_list))
print(len(topic_list))
for topic in topic_list:
    print(topic)

print('输出其主题分布')
# ``corpus`` already holds the bag-of-words of every training document, so
# it is reused here rather than converting each document a second time.
# NOTE(review): the original indentation was lost; printing the topic
# distribution once per document is assumed -- confirm.
for doc_bow in corpus:
    doc_lda = lda[doc_bow]  # topic distribution of this document
    print(doc_lda)
def test_lda():
    """Prepare the pyLDAvis view of the trained model and show it.

    Uses the module-level ``lda``, ``corpus`` and ``dictionary`` objects and
    asks pyLDAvis to open a browser for the result.
    """
    panel = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(panel, open_browser=True)


if __name__ == "__main__":
    test_lda()
# Source: CSDN
# Author: c_cl
# Link: https://blog.csdn.net/c_czl/article/details/80137448