问题
I'm running the following python script on a large dataset (around 100 000 items). Currently the execution is unacceptably slow, it would probably take a month to finish at least (no exaggeration). Obviously I would like it to run faster.
I've added a comment belong to highlight where I think the bottleneck is. I have written my own database functions which are imported.
Any help is appreciated!
# -*- coding: utf-8 -*-
import database
from gensim import corpora, models, similarities, matutils
from gensim.models.ldamulticore import LdaMulticore
import pandas as pd
from sklearn import preprocessing
def getTopFiveSimilarAuthors(author, authors, ldamodel, dictionary):
vec_bow = dictionary.doc2bow([researcher['full_proposal_text']])
vec_lda = ldamodel[vec_bow]
# normalization
try:
vec_lda = preprocessing.normalize(vec_lda)
except:
pass
similar_authors = []
for index, other_author in authors.iterrows():
if(other_author['id'] != author['id']):
other_vec_bow = dictionary.doc2bow([other_author['full_proposal_text']])
other_vec_lda = ldamodel[other_vec_bow]
# normalization
try:
other_vec_lda = preprocessing.normalize(vec_lda)
except:
pass
sim = matutils.cossim(vec_lda, other_vec_lda)
similar_authors.append({'id': other_author['id'], 'cosim': sim})
similar_authors = sorted(similar_authors, key=lambda k: k['cosim'], reverse=True)
return similar_authors[:5]
def get_top_five_similar(author, authors, ldamodel, dictionary):
top_five_similar_authors = getTopFiveSimilarAuthors(author, authors, ldamodel, dictionary)
database.insert_top_five_similar_authors(author['id'], top_five_similar_authors, cursor)
connection = database.connect()
authors = []
authors = pd.read_sql("SELECT id, full_text FROM author WHERE full_text IS NOT NULL;", connection)
# create the dictionary
dictionary = corpora.Dictionary([authors["full_text"].tolist()])
# create the corpus/ldamodel
author_text = []
for text in author_text['full_text'].tolist():
word_list = []
for word in text:
word_list.append(word)
author_text.append(word_list)
corpus = [dictionary.doc2bow(text) for text in author_text]
ldamodel = LdaMulticore(corpus, num_topics=50, id2word = dictionary, workers=30)
#BOTTLENECK: the script hangs after this point.
authors.apply(lambda x: get_top_five_similar(x, authors, ldamodel, dictionary), axis=1)
回答1:
I noticed these problems in your code.. but I'm not sure the they are the reason for the slow execution.. this loop here is useless it well never run:
for text in author_text['full_text'].tolist():
word_list = []
for word in text:
word_list.append(word)
author_text.append(word_list)
also there is no need to loop the words of the text it is enough to use split function on it and it will be a list of words, by lopping authors courser..
try to write it like this: first:
all_authors_text = []
for author in authors:
all_authors_text.append(author['full_text'].split())
and after that make the dictionary:
dictionary = corpora.Dictionary(all_authors_text)
来源:https://stackoverflow.com/questions/54431187/gensim-lda-multicore-python-script-runs-much-too-slow