机器学习实战——朴素贝叶斯法
朴素贝叶斯就是在假设各个特征相互独立的条件下运用贝叶斯准则通过先验概率计算出后验概率,输出后验概率最大的类别。
先讲讲在下代码时候遇到的问题,代码参考机器学习实战这本书。
问题一:
《机器学习实战》P66中运行程序清单4-5时,总是报错:
UnicodeDecodeError: ‘gbk’ codec can’t decode byte 0xae in position 199: illegal multib
解决办法:
打开23.txt 文件发现,包含一个?字符。“SciFinance?is”,删除该“?”即可。
问题二:
del(trainingSet[randIndex])这段代码报错
出现错误 ‘range’ object doesn’t support item deletion
主要原因:版本不同,书上python2版本,在python3中range返回的是range对象,而不是列表
解决办法:把 trainingSet = range(50) 改为 trainingSet = list(range(50))
问题三:
在文本解析时
strings = 'Hello python,the best language!'
print(strings)
import re
listOfTokens = re.split(r'\W*',strings)
print(listOfTokens)
输出结果
Hello python,the best language!
['', 'H', 'e', 'l', 'l', 'o', '', 'p', 'y', 't', 'h', 'o', 'n', '', 't', 'h', 'e', '', 'b', 'e', 's', 't', '', 'l', 'a', 'n', 'g', 'u', 'a', 'g', 'e', '', '']
可见文件解析不对。
解决办法:
将listOfTokens = re.split(r’\W*’,strings)改为listOfTokens = re.split(r’\W’,strings) 去掉*即可
代码如下注释中给出了自己的理解
from numpy import *
"""从文本中构建词向量"""
def loadDataSet():
"""创建实验样本"""
postingList=[
['my','dog','has','flea','probiems','help','please'],
['maybe','not','take','him','to','dog','park','stupid'],
['my','dalmation','is','so','cute','I','love','him'],
['stop','posting','stupid','worthless','garbage'],
['mr','licke','ate','my','steak','how','to','stop','him'],
['quit','buying','worthless','dog','food','atupid']
]
classVec = [0,1,0,1,0,1] #1 代表侮辱性文字,0 代表正常言论
return postingList,classVec
def creatVocabList(dataSet):
"""创建一个包含所有文档且不出现重复词的列表"""
vocabSet = set([])
for document in dataSet:
vocabSet = vocabSet | set(document) #set()去掉列表中的重复词
return list(vocabSet)
def setOfWords2Vec(vocabList,inputSet):
#词集模型
"""输入为词汇表和文档,检查文档中的单词是否在词汇表中"""
returnVet = [0]*len(vocabList) #创建一个长度为vocabList的0向量
for word in inputSet:
if word in vocabList:
returnVet[vocabList.index(word)] = 1
else:
print("the word: " + word + " is not in my Vocabulary!")
return returnVet
def bagOfWords2Vec(vocabList,inputSet):
#词袋模型
"""输入为词汇表和文档,检查文档中的单词在词汇表中出现的次数"""
returnVet = [0]*len(vocabList) #创建一个长度为vocabList的0向量
for word in inputSet:
if word in vocabList:
returnVet[vocabList.index(word)] += 1
else:
print("the word: " + word + " is not in my Vocabulary!")
return returnVet
"""从词向量计算概率"""
def trainNB0(trainMatrix,trainCategory):
"""输入:文档矩阵,文档类别标签向量"""
numTrainDocs = len(trainMatrix) #文档数
numWords = len(trainMatrix[0]) #文档中的单词数
pAbusive = sum(trainCategory)/float(numTrainDocs) # P(1)
#初始化概率
p0Nnm = ones(numWords) #类别为0的文档中每个单词在词汇表中出现的次数向量
p1Num = ones(numWords) #类别为1的文件
p0Denom = 2.0 #类别为0的文档且出现在词汇表中的单词总数
p1Denom = 2.0 #类别为1的文档且出现在词汇表中的单词总数
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Denom += sum(trainMatrix[i])
else:
p0Nnm += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
p1Vect = log(p1Num/p1Denom) #p(w|c1) 其中w是每个文档的单词向量
p0Vect = log(p0Nnm/p0Denom) #p(w|c0)
return p0Vect,p1Vect,pAbusive
def classfyNB(vec2Classify,p0Vec,p1Vec,pClass1):
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
#p(w1|c1)*p(w2|c1)*...*p(wn|c1)*p(1)
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
#p(w1|c1)*p(w2|c1)*...*p(wn|c1)*p(0)
if p1 > p0:
return 1
else:
return 0
def testingNB():
listOPosts,listClasses = loadDataSet()
myVocabList = creatVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
testEntry = ['him','to','dog']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print(testEntry,'classfied as:',classfyNB(thisDoc,p0V,p1V,pAb))
testEntry = ['stupid','garbage']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print(testEntry,'classfied as:',classfyNB(thisDoc,p0V,p1V,pAb))
def textParse(bigString):
"""文本解析"""
import re
listOfTokens = re.split(r'\W',bigString)
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
docList = []; classList = []; fullText = []
for i in range(1,26): #解析文本
wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1) # 分类标签
wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = creatVocabList(docList) #词汇表
trainingSet = list(range(50)) #训练数据集坐标
testSet = [] #测试数据集坐标
for i in range(10): #随机选10个文本作为测试集
randIndex = int(random.uniform(0,len(trainingSet))) #随机选取文本
testSet.append(trainingSet[randIndex]) #加入测试集
del(trainingSet[randIndex]) #将测试数据从训练集中删除
trainMat = []; trainClasses = []
for docIndex in trainingSet:
trainMat.append(setOfWords2Vec(vocabList,docList[docIndex])) #训练集
trainClasses.append(classList[docIndex]) #训练集类别
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) #计算概率
errorCount = 0
for docIndex in testSet: #测试
wordVector = setOfWords2Vec(vocabList,docList[docIndex])
if classfyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
print('classcation error:',docList[docIndex])
print('the error rate is:',float(errorCount)/len(testSet))
if __name__ == "__main__":
spamTest()
运行结果
classcation error: ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don抰', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is: 0.1
来源:CSDN
作者:QianLong_
链接:https://blog.csdn.net/QianLong_/article/details/104135983