Question
Machines can't understand text directly, only numbers, so in NLP we convert text to some numeric representation; one such representation is bag-of-words (BOW). My objective is to convert every document to a numeric representation and save it for future use. Below is my current approach: I convert each document to a BOW vector and dump it to a pickle file. My question is whether there is a better, more reliable way to do this, so that every document is saved as a vector to a file and new documents can be appended in the same way without losing any structure or information.
from gensim import corpora
import pickle
tokenized_corpus = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time', 'survey'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
    ['hello', 'system', 'i', 'love', 'graph', 'minor', 'trees']
]
file_name = 'corpus_sparse_rep.pkl'
bow = []
dct = corpora.Dictionary([tokenized_corpus[0]]) # added first doc as it needs corpus as argument
with open(file_name, 'wb+') as fp:
    # adding each doc sequentially
    for doc in tokenized_corpus:
        dct.add_documents([doc])  # updating vocab in dictionary
        bow.append(dct.doc2bow(doc))  # keeping the doc representation in memory to compare contents before and after pickling
        pickle.dump(dct.doc2bow(doc), fp)
print(f'Saving bow data to pickle = {bow}')
print(f'Dictionary = {dct}')
# To load bow data from pickle file
pickle_data = []
with open(file_name, 'rb') as fr:
    while True:
        try:
            pickle_data.append(pickle.load(fr))
        except EOFError:
            break
print(f'Loading bow data from pickle = {pickle_data}')
# corpora.MmCorpus.serialize('t.mm', bow) # serialize data and save in Matrix Market (.mm) format
# Output
# Saving bow data to pickle = [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)], [(5, 1), (9, 1), (10, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]
# Dictionary = Dictionary(16 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
# Loading bow data from pickle = [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)], [(5, 1), (9, 1), (10, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]
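For reference, here is a rough sketch of the MmCorpus alternative mentioned in the commented-out line above, which I am considering instead of pickle: save the Dictionary with its own save() method and serialize the BOW corpus in Matrix Market format, then stream it back later. The file names 'bow_dict.dict' and 'corpus.mm' are just placeholders; `dct` and `bow` are the objects built in the snippet above. I am not sure whether this is the recommended approach.

from gensim import corpora

# assuming `dct` (gensim Dictionary) and `bow` (list of BOW vectors) from the code above
dct.save('bow_dict.dict')                      # persist the token <-> id mapping
corpora.MmCorpus.serialize('corpus.mm', bow)   # persist the BOW corpus in Matrix Market format

# later, or in another process
dct_loaded = corpora.Dictionary.load('bow_dict.dict')
corpus_loaded = corpora.MmCorpus('corpus.mm')  # iterated lazily, not loaded fully into memory
for doc_bow in corpus_loaded:
    print(doc_bow)                             # list of (token_id, count) pairs per document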
Source: https://stackoverflow.com/questions/64820037/what-is-the-reliable-way-to-convert-text-data-document-to-numerical-data-vect