1. 下载jre最新版
2. 打开环境变量设置:控制面板 → 系统 → 环境变量,或者 我的电脑 → 属性
a 新增变量“JAVA”,变量值=C:\Program Files\Java\jre6\bin
加载rJava包和Rwordseg包
代码:
# Install and load rJava and Rwordseg. Rwordseg is fetched from R-Forge and
# built from source; rJava relies on the Java runtime set up in the steps
# above.
install.packages("rJava")
library(rJava)
install.packages(
  "Rwordseg",
  repos = "http://R-Forge.R-project.org",
  type = "source"
)
library(Rwordseg)

# Smoke test: segment one sample sentence to confirm the setup works.
teststring1 <- "李建督促你将R语言学习到底。"
word1 <- segmentCN(teststring1)

# Part 2: import text, clean text, count word frequencies, draw word cloud ----
library(Rwordseg) # Chinese word segmentation package

# Import data ----
# Each file.choose() call opens an interactive file picker.
# NOTE(review): `sale` and `neg` are not used in the visible part of this
# script; they are kept in case later code relies on them.
sale <- read.csv(file.choose())
neg <- readLines(file.choose(), encoding = "UTF-8")
# Alternative to the picker: read the comments from a fixed path instead.
# data <- read.csv("E:/111/wuli.csv", stringsAsFactors = FALSE)
data <- read.csv(file.choose(), stringsAsFactors = FALSE)

# Remove digits, ASCII letters and underscores ----
# gsub() coerces its input with as.character(); on a data frame that deparses
# each whole column into a single string (e.g. 'c("a", "b")'), so flatten the
# cells into a plain character vector first and drop missing cells.
data <- unlist(lapply(data, as.character), use.names = FALSE)
data <- data[!is.na(data)]
data <- gsub("[a-z0-9A-Z_]", "", data)

# Word segmentation with the Rwordseg package ----
# NOTE(review): the original note lists D:/R-3.3.3/library/Rwordseg,
# presumably the package's install location -- confirm.
words <- segmentCN(data)

# Stop-word list (stopwordsCN.txt) ----
# Read the list and make sure it is UTF-8 encoded; E:/111/321123.txt is the
# path of the stop-word file.
stopwordsCN <- readLines("E:/111/321123.txt", encoding = "UTF-8")
stopwordsCN <- enc2utf8(stopwordsCN)
# Keep only entries with a declared encoding. Pure-ASCII strings always
# report "unknown", so ASCII-only stop words are dropped here.
stopwordsCN <- stopwordsCN[Encoding(stopwordsCN) != "unknown"]
#编写去停词函数
#' Remove stop words from a vector of segmented words
#'
#' @param x Vector of words, e.g. one segmented document.
#' @param stopwords Vector of stop words to drop from `x`.
#' @return Character vector holding the elements of `x` that do not occur in
#'   `stopwords`, in their original order (`character(0)` if none remain).
removeStopWords <- function(x, stopwords) {
  # `%in%` never yields NA, so an NA entry in `stopwords` cannot make every
  # word look like a match the way an `==` comparison does.
  as.character(x[!(x %in% stopwords)])
}

# Remove stop words ----
# Apply the stop-word filter to every segmented document.
words <- lapply(words, removeStopWords, stopwordsCN)

# Word cloud ----
install.packages("wordcloud")
library(wordcloud)

# Count word frequencies and sort them in ascending order.
wordsnum <- table(unlist(words))
wordsnum <- sort(wordsnum)

# Keep the 250 most frequent words (the tail of the ascending sort).
wordsnum <- tail(wordsnum, 250)

# Draw the word cloud.
# NOTE(review): brewer.pal() is not defined in the visible script (presumably
# it comes from RColorBrewer, attached together with wordcloud), and the font
# family "myFont3" is not registered anywhere in view -- confirm both.
wordcloud(
  names(wordsnum),
  as.vector(wordsnum),
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(8, "Dark2"),
  family = "myFont3"
)
下面两张图是用 Python 爬虫抓取的华为手机与三星手机评论文本数据制作的词云图:
华为评论数据
三星评论数据
文章来源: Rwordseg中文分词 画词云图