问题
# Search Twitter for a term (requires an already authenticated twitteR session).
rdmTweets <- searchTwitteR("machine learning", n = 500, lang = "en")

# Pre-processing options handed to DocumentTermMatrix().
dtm.control <- list(
  tolower = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  # tm's documented option name is `stopwords`; the original `removestopWords`
  # is not an option tm documents, so stop words were not being removed.
  stopwords = TRUE,
  stemming = TRUE, # false for sentiment
  # Bounds must be numeric: c(3, "inf") coerces the whole vector to character.
  wordLengths = c(3, Inf)
)

# Create a data frame around the results (one row per tweet).
df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
# Here are the columns
names(df)
# And some example content
head(df, 10)

# Number of tweets per user, plotted as received from Twitter.
counts <- table(df$screenName)
barplot(counts)
# Same plot restricted to users with more than one tweet; labels are rotated
# and shrunk so they fit.
cc <- subset(counts, counts > 1)
barplot(cc, las = 2, cex.names = 0.3)

# The most commonly cited words in the tweets.
# vapply() guarantees a character vector even when no tweets are returned.
rdm_texts <- vapply(rdmTweets, function(x) x$getText(), character(1))
rdm_corpus <- Corpus(VectorSource(rdm_texts))
# This is the call that raised the error quoted below in the original report.
dtm <- DocumentTermMatrix(rdm_corpus, control = dtm.control)
The last call throws the following error:
Error in simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow = length(allTerms), :
'i, j, v' different lengths
In addition: Warning messages:
1: In mclapply(unname(content(x)), termFreq, control) :
all scheduled cores encountered errors in user code
2: In simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow = length(allTerms), :
NAs introduced by coercion
I am trying to search Twitter for a keyword and then create a word cloud from the results. I am removing all punctuation, stop words and numbers, but it still does not seem to work.
Any help will be appreciated.
回答1:
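The failure came from `utf8towcs` breaking on the tweet text (invalid multibyte input); converting the text encoding with `iconv` first, as in the code below, solved the issue.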
# Fetch up to 500 English tweets for the hashtag
# (requires an already authenticated twitteR session).
r_stats <- searchTwitter("#IpadPro", n = 500, lang = "en")
# Should be 500 if that many tweets were available
length(r_stats)

# Extract the tweet text; vapply() guarantees a character vector.
r_stats_text <- vapply(r_stats, function(x) x$getText(), character(1))

# Re-encode the text before building the corpus so that later steps such as
# tolower() do not choke on invalid multibyte input; unconvertible bytes are
# kept as "<xx>" escapes (sub = "byte").
# "UTF-8-MAC" is generally only available with macOS's iconv, so fall back to
# plain "UTF-8" on other platforms.
# Doing this on the character vector (iconv is vectorised) avoids passing
# `mc.cores` through tm_map(), which forwards extra arguments to the
# transformation function.
target_enc <- if (identical(Sys.info()[["sysname"]], "Darwin")) {
  "UTF-8-MAC"
} else {
  "UTF-8"
}
r_stats_text <- iconv(r_stats_text, to = target_enc, sub = "byte")

# Build and clean the corpus.
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords())

# Create a term-document matrix (terms in rows, documents in columns).
tdm <- TermDocumentMatrix(r_stats_text_corpus)
m <- as.matrix(tdm)
# Total frequency of each term across all tweets, most frequent first.
word_freqs <- sort(rowSums(m), decreasing = TRUE)

# Create the data frame with the words and their frequencies.
dm <- data.frame(
  word = names(word_freqs),
  freq = word_freqs,
  stringsAsFactors = FALSE
)
wordcloud(
  dm$word,
  dm$freq,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
来源:https://stackoverflow.com/questions/33722202/twitter-data-error-in-termdocumentmatrix