问题
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import wordcloud
from wordcloud import WordCloud,STOPWORDS
# Read the whole text.
remarks = open(r'C:\Users\marmar\Remarks.txt').read().split()
#Create words over an image
mask = np.array(Image.open(r'C:\users\marmar\Documents\cloud.png'))
#set the stopwords list
stopwords= set(STOPWORDS)
#append new words to the stopwords list
new_words =open(r'C:\Users\marmar\Documents\comments.txt').read().split()
new_stopwords=stopwords.union(new_words)
#generate the word cloud with parameters
wc =
WordCloud(
background_color="white", max_words=2000,
mask=mask,min_font_size =15, max_font_size=40, relative_scaling =
0.5, stopwords=new_stopwords,normalize_plurals= True)
wc.generate(remarks)
plt.figure(figsize=(25,25))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
#Show the wordcloud
plt.show()
So, if I remove .read()
.split()
from my Remarks text file, than it actually works and returns the wordcloud. However, I want to be able to tokenize the words and actually have the wordcloud be accurate. ( It's not showing the words split apart). However, every time I do it, I get this error.
TypeError Traceback (most recent call last)
<ipython-input-7-76f0df420fc2> in <module>()
19 wc = WordCloud(background_color="white", max_words=2000,
mask=mask,min_font_size =15, max_font_size=40, relative_scaling = 0.5,
stopwords=new_stopwords,
20 normalize_plurals= True)
---> 21 wc.generate(remarks)
22 plt.figure(figsize=(25,25))
23 plt.imshow(wc, interpolation="bilinear")
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\wordcloud\wordcloud.py in generate(self, text)
563 self
564 """
--> 565 return self.generate_from_text(text)
566
567 def _check_generated(self):
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\wordcloud\wordcloud.py in generate_from_text(self, text)
544 self
545 """
--> 546 words = self.process_text(text)
547 self.generate_from_frequencies(words)
548 return self
~\AppData\Local\Continuum\anaconda3\lib\site-packages\wordcloud\wordcloud.py
in process_text(self, text)
511 regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
512
--> 513 words = re.findall(regexp, text, flags)
514 # remove stopwords
515 words = [word for word in words if word.lower() not in
stopwords]
~\AppData\Local\Continuum\anaconda3\lib\re.py in findall(pattern, string,
flags)
220
221 Empty matches are included in the result."""
--> 222 return _compile(pattern, flags).findall(string)
223
224 def finditer(pattern, string, flags=0):
TypeError: expected string or bytes-like object
What I tried to do is actually remove all punctuation from my Remarks text file as well as convert the text file to unicode. The comments text file is working fine, not sure why my Remarks file isn't.
Thanks!
来源:https://stackoverflow.com/questions/47779462/typeerror-expected-string-or-bytes-like-object-python-3-wordcloud