```python
from pprint import pprint
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora

stopWordsList = set('for a of the and to in'.split())
with open('./Data/mycorpus.txt', encoding='utf-8') as f:
    texts = [[word for word in line.lower().split() if word not in stopWordsList]
             for line in f]
dictionary = corpora.Dictionary.load('./Data/sampleDict.dict')
corpus = [dictionary.doc2bow(doc) for doc in texts]
pprint(corpus)
```
```
[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]
```
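Each pair in these vectors is `(token_id, count)`. To see which word an id stands for, the dictionary loaded above can translate back; a quick check (which words these ids map to depends on the contents of `./Data/sampleDict.dict`):

```python
# Look a few token ids back up in the dictionary; ids 0-11 are
# the ones used by this corpus.
print(dictionary[0], dictionary[5], dictionary[9])
```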
So far we have the bag-of-words corpus `corpus`. However, this bag-of-words representation is clearly of limited use on its own. In this tutorial, we will transform this raw representation in order to:
- Bring out hidden structure in the corpus and discover relationships between words, so that documents can be described in a more semantic way.
- Make the document representation more compact (dimensionality reduction). This improves both efficiency (the new representation consumes fewer resources) and efficacy (marginal data trends are ignored, reducing noise).
In gensim, these transformations are defined in the `models` module, with names of the form `models.xxxModel`. Generally speaking, we feed the existing corpus `corpus` to one of these models for training, and the trained model then carries out the transformation.
```python
from gensim import models

# Train a model
tfidf = models.TfidfModel(corpus)
```
Suppose we now have a new document whose bag-of-words vector is `[(0, 1), (1, 1)]`; we can transform it as follows. Here we take the Tf-Idf model as an example: it extracts a document's keywords and assigns larger weights to highly representative words, so that the vector characterizes the document better:
```python
docBow = [(0, 1), (1, 1)]
print(tfidf[docBow])
```
```
[(0, 0.7071067811865476), (1, 0.7071067811865476)]
```
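A quick sanity check on these numbers: both token ids appear in exactly two of the nine training documents, so their idf weights are equal, and after L2 normalization each coordinate of this two-word document becomes 1/√2:

```python
import math

# Equal idf weights plus unit-length (L2) normalization
# give 1/sqrt(2) per token.
print(1 / math.sqrt(2))  # 0.7071067811865476
```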
As you can see, with the bracket syntax `model[vector]` we can conveniently apply the trained model to complete a transformation. Now let's try transforming the whole corpus:
```python
for doc in corpus:
    print(tfidf[doc])
```
```
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
```
```python
corpusTdIdf = tfidf[corpus]
for vec in corpusTdIdf:
    print(vec)
```
```
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
```
Calling `model[corpus]` only creates a wrapper around the old corpus document stream; the actual conversions are performed on the fly, while the documents are being iterated over. So `corpus_transform = model[corpus]` does not actually transform the whole corpus at once, because that would mean storing the result in main memory, which contradicts gensim's goal of memory independence.
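A quick way to convince yourself of this laziness; the class name below reflects current gensim internals and may differ between versions:

```python
# The result is a lazy wrapper object, not a list of vectors;
# nothing is computed until we iterate over it.
corpusLazy = tfidf[corpus]
print(type(corpusLazy))  # <class 'gensim.interfaces.TransformedCorpus'>
```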
If you are likely to iterate over the transformed `corpus_transform` multiple times, and the transformation is expensive, serialize the resulting corpus to disk first and keep using that (otherwise the transformation is repeated on every pass).
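A minimal sketch of that serialization, using gensim's Matrix Market format (the file path is an illustrative assumption):

```python
from gensim import corpora

# Persist the transformed corpus to disk once...
corpora.MmCorpus.serialize('./Data/corpusTfidf.mm', corpusTdIdf)
# ...then stream it back on every later pass instead of re-transforming.
corpusTdIdfDisk = corpora.MmCorpus('./Data/corpusTfidf.mm')
```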
Next we apply an LSI (Latent Semantic Indexing) model, also known as LSA (Latent Semantic Analysis), on top of the Tf-Idf corpus. With it we construct a latent 2-D space (2-D because we set `num_topics=2`):
```python
lsi = models.LsiModel(corpusTdIdf, num_topics=2, id2word=dictionary)
corpusLsi = lsi[corpusTdIdf]

with open('./Data/mycorpus.txt', encoding='utf-8') as f:
    rawDocs = f.readlines()
for i, v in enumerate(corpusLsi):
    print('-'.center(70, '-'))
    print(rawDocs[i])
    print(v)
```
```
----------------------------------------------------------------------
Human machine interface for lab abc computer applications
[(0, 0.06600783396090126), (1, 0.520070330636185)]
----------------------------------------------------------------------
A survey of user opinion of computer system response time
[(0, 0.19667592859142144), (1, 0.7609563167700049)]
----------------------------------------------------------------------
The EPS user interface management system
[(0, 0.08992639972446115), (1, 0.7241860626752514)]
----------------------------------------------------------------------
System and human system engineering testing of EPS
[(0, 0.07585847652177896), (1, 0.6320551586003434)]
----------------------------------------------------------------------
Relation of user perceived response time to error measurement
[(0, 0.10150299184979877), (1, 0.5737308483002953)]
----------------------------------------------------------------------
The generation of random binary unordered trees
[(0, 0.7032108939378318), (1, -0.16115180214025432)]
----------------------------------------------------------------------
The intersection graph of paths in trees
[(0, 0.8774787673119838), (1, -0.16758906864658976)]
----------------------------------------------------------------------
Graph minors IV Widths of trees and well quasi ordering
[(0, 0.9098624686818584), (1, -0.14086553628718593)]
----------------------------------------------------------------------
Graph minors A survey
[(0, 0.6165825350569278), (1, 0.05392907566389622)]
```
Note how the first five documents (about human-computer interaction) load mostly on the second topic, while the last four (about graphs) load on the first. We can use `models.LsiModel.print_topics()` to inspect what this process has actually produced:
```python
lsi.print_topics(2)
```
```
[(0, '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1, '0.460*"system" + 0.373*"user" + 0.332*"eps" + 0.328*"interface" + 0.320*"response" + 0.320*"time" + 0.293*"computer" + 0.280*"human" + 0.171*"survey" + -0.161*"trees"')]
```
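If you would rather work with a topic as structured data than parse the formatted string, `LsiModel.show_topic` returns (word, weight) pairs; a minimal sketch:

```python
from pprint import pprint

# Topic 0 as a list of (word, weight) pairs instead of a formatted string.
pprint(lsi.show_topic(0, topn=5))
```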
```python
# Persist the trained model to disk, then load it back
lsi.save(r'./Data/model.lsi')
lsi = models.LsiModel.load(r'./Data/model.lsi')
```
Besides LSI, gensim implements several other popular vector space transformations. In what follows, `bow_corpus` denotes a corpus in bag-of-words representation, and `tfidf_corpus` a corpus that has been transformed with Tf-Idf.
- Term Frequency * Inverse Document Frequency (Tf-Idf)
expects a bag-of-words (integer-valued) training corpus for initialization/training. During transformation, it takes a vector and returns another vector of the same dimensionality, with the weights of features that were rare in the training corpus increased. The transformation thus converts integer-valued (BoW) vectors into real-valued ones while leaving the dimensionality intact. You can also choose whether to normalize the resulting vectors to unit (L2) norm via the `normalize` flag:
```python
model = tfidfmodel.TfidfModel(bow_corpus, normalize=True)
```
- Latent Semantic Indexing (LSI, sometimes LSA)
transforms documents from either bag-of-words or (preferably) Tf-Idf-weighted space into a latent space of lower dimensionality. For the toy corpus above we used only two latent dimensions, but on real corpora a target dimensionality of 200-500 is the recommended standard.
```python
model = lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
```
Also note that LSI training is unique in that it supports so-called online learning: we only need to keep supplying more training documents. To do so, simply call the `add_documents` method:
```python
model.add_documents(another_tfidf_corpus)  # LSI has now been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec]  # converting a new document into LSI space does not affect the model
# ...
model.add_documents(more_documents)  # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
# ...
```
For more details, see the documentation of the `gensim.models.lsimodel` module.
- Random Projections (RP)
aims to reduce the dimensionality of the vector space. This is a very efficient approach (both CPU- and memory-wise) that approximates the Tf-Idf distances between documents by throwing in a little randomness. The recommended target dimensionality is again in the hundreds to thousands, depending on the size of your dataset.
```python
model = rpmodel.RpModel(tfidf_corpus, num_topics=500)
```
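To see the "approximates Tf-Idf distances" claim in action on the toy corpus from earlier, one can compare cosine similarities before and after projection. A hedged sketch (RP is randomized, and with only two dimensions the approximation is very rough):

```python
from gensim import matutils, models

# Materialize the toy Tf-Idf corpus from earlier and train an RP model on it.
tfidfList = list(corpusTdIdf)
rp = models.RpModel(tfidfList, num_topics=2)

# Project two documents and compare similarities before and after projection.
vec1, vec2 = rp[tfidfList[0]], rp[tfidfList[1]]
print(matutils.cossim(tfidfList[0], tfidfList[1]))  # similarity in Tf-Idf space
print(matutils.cossim(vec1, vec2))                  # rough RP approximation
```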
- Latent Dirichlet Allocation (LDA)
is yet another transformation from bag-of-words counts into a topic space of lower dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA), so LDA's topics can be interpreted as probability distributions over words. These distributions are, just like with LSA, inferred automatically from the training corpus. Documents are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).
```python
model = ldamodel.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
```
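Applying the trained LDA model follows the same indexing pattern as before, and `LdaModel` also provides `print_topics` for inspecting the learned word distributions. A minimal sketch, where `bow_vec` stands for any bag-of-words vector such as `[(0, 1), (1, 1)]`:

```python
doc_lda = model[bow_vec]  # (topic_id, probability) pairs for this document
model.print_topics(5)     # formatted top words of 5 topics
```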
- Hierarchical Dirichlet Process (HDP)
is a non-parametric Bayesian method (note the missing `num_topics` parameter):
```python
model = hdpmodel.HdpModel(bow_corpus, id2word=dictionary)
```
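Because HDP is non-parametric, the number of topics is inferred from the data rather than fixed up front. A minimal sketch for peeking at what it found (`print_topics` exists on `HdpModel`, though its output format differs across gensim versions):

```python
# Show the top words of the first few inferred topics.
model.print_topics(num_topics=5, num_words=5)
```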