问题
I want to find the lemma of each word using NLTK's WordNetLemmatizer, and I also need to compute the frequency of each word.
When I run my code, I get the following error:
TypeError: unhashable type: 'list'
Note: The corpus is available in the `nltk` package itself.
What I have tried so far is as follows:
import nltk, re
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer
def remove_punctuation(from_text):
    """Return a list with punctuation stripped from every item of *from_text*.

    Each item is passed through ``str.translate`` with a table that deletes
    all characters found in ``string.punctuation``. Items that consist only
    of punctuation become empty strings; they are not removed from the list.
    """
    # Mapping a code point to None tells str.translate to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    cleaned = []
    for word in from_text:
        cleaned.append(word.translate(deletions))
    return cleaned
# Question's version of the pipeline: split Hamlet into sentences, lower-case
# and split each sentence into words, strip punctuation, drop stop words and
# non-alphanumeric tokens, then attempt to lemmatize.
# This is the code that produces the TypeError quoted above.
def preprocessing():
raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
tokens_sentences = sent_tokenize(raw_data)
# One list of lower-cased, whitespace-separated words per sentence.
tokens = [[word.lower() for word in line.split()] for line in tokens_sentences]
print(len(tokens))
global stripped_tokens
stripped_tokens = [remove_punctuation(i) for i in tokens]
sw = (stopwords.words('english'))
# Still a list of lists: one inner list of surviving tokens per sentence.
filter_set = [[token for token in sentence if (token.lower() not in sw and token.isalnum())] for sentence in stripped_tokens]
lemma= WordNetLemmatizer()
global lem
lem = []
for w in filter_set:
# NOTE: w is a whole sentence (a list of tokens), not a single string;
# passing it to lemmatize() is what raises "TypeError: unhashable type: 'list'".
lem.append(lemma.lemmatize(w))
# Run the pipeline; results are left in the globals stripped_tokens and lem.
preprocessing()
Please help me in resolving the issue.
回答1:
The problem is that `lemma.lemmatize` expects a string and you are passing a list. The elements of `filter_set` are lists. You need to change the line:
lem.append(lemma.lemmatize(w))
to something like this:
lem.append([wi for wi in map(lemma.lemmatize, w)])
The above code applies `lemma.lemmatize` to each token (`wi`) in `w`. Full code:
import nltk, re
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer
def remove_punctuation(from_text):
    """Strip punctuation characters from each string in *from_text*.

    Returns a new list of the same length as the input; every character
    listed in ``string.punctuation`` is deleted from each item, so an item
    made up solely of punctuation is returned as an empty string.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return list(map(lambda item: item.translate(drop_punctuation), from_text))
def preprocessing(fileid='shakespeare-hamlet.txt'):
    """Return the lemmatized, stop-word-free tokens of a Gutenberg text.

    The text is split into sentences and each sentence into lower-cased,
    whitespace-separated words. Punctuation is stripped from every word,
    English stop words and non-alphanumeric tokens are dropped, and each
    remaining token is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        fileid: Identifier of the NLTK Gutenberg corpus file to process.
            Defaults to Hamlet, the text this function was originally
            hard-wired to.

    Returns:
        A list with one entry per sentence; each entry is the list of
        lemmas (strings) kept from that sentence. Sentences in which no
        token survives the filter yield an empty list.
    """
    raw_data = gutenberg.raw(fileid)
    sentences = sent_tokenize(raw_data)
    tokens = [[word.lower() for word in line.split()] for line in sentences]
    print(len(tokens))  # number of sentences found
    stripped_tokens = [remove_punctuation(sentence) for sentence in tokens]

    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    # Tokens were lower-cased above, so they can be compared directly.
    filtered = [
        [token for token in sentence
         if token not in stop_words and token.isalnum()]
        for sentence in stripped_tokens
    ]

    # lemmatize() takes a single string, so it is applied token by token
    # rather than to a whole sentence list.
    lemmatize = WordNetLemmatizer().lemmatize
    return [[lemmatize(token) for token in sentence] for sentence in filtered]
# Run the pipeline once, then show the lemma lists of the first ten sentences.
result = preprocessing()
preview = result[:10]
for e in preview:
    print(e)
Output
['tragedie', 'hamlet', 'william', 'shakespeare', '1599', 'actus', 'primus']
['scoena', 'prima']
['enter', 'barnardo', 'francisco', 'two', 'centinels']
['barnardo']
['who']
['fran']
['nay', 'answer', 'stand', 'vnfold', 'selfe', 'bar']
['long', 'liue', 'king', 'fran']
['barnardo']
['bar']
UPDATE
To get the frequencies you can use `Counter`:
# Tally how often each lemma occurs across all sentences.
result = preprocessing()
frequencies = Counter()
for sentence in result:
    frequencies.update(sentence)

# Report the ten most frequent lemmas together with their counts.
for word, frequency in frequencies.most_common(10):
    print(word, frequency)
Output
ham 337
lord 217
king 180
haue 175
come 127
let 107
shall 107
hamlet 107
thou 105
good 98
来源:https://stackoverflow.com/questions/52860350/how-to-find-the-lemmas-and-frequency-count-of-each-word-in-list-of-sentences-in