训练数据
-0.214824 0.662756 -1.000000
-0.061569 -0.091875 1.000000
0.406933 0.648055 -1.000000
0.223650 0.130142 1.000000
0.231317 0.766906 -1.000000
-0.748800 -0.531637 -1.000000
-0.557789 0.375797 -1.000000
0.207123 -0.019463 1.000000
0.286462 0.719470 -1.000000
0.195300 -0.179039 1.000000
-0.152696 -0.153030 1.000000
0.384471 0.653336 -1.000000
-0.117280 -0.153217 1.000000
-0.238076 0.000583 1.000000
-0.413576 0.145681 1.000000
0.490767 -0.680029 -1.000000
0.199894 -0.199381 1.000000
-0.356048 0.537960 -1.000000
-0.392868 -0.125261 1.000000
0.353588 -0.070617 1.000000
0.020984 0.925720 -1.000000
-0.475167 -0.346247 -1.000000
0.074952 0.042783 1.000000
0.394164 -0.058217 1.000000
0.663418 0.436525 -1.000000
0.402158 0.577744 -1.000000
-0.449349 -0.038074 1.000000
0.619080 -0.088188 -1.000000
0.268066 -0.071621 1.000000
-0.015165 0.359326 1.000000
0.539368 -0.374972 -1.000000
-0.319153 0.629673 -1.000000
0.694424 0.641180 -1.000000
0.079522 0.193198 1.000000
0.253289 -0.285861 1.000000
-0.035558 -0.010086 1.000000
-0.403483 0.474466 -1.000000
-0.034312 0.995685 -1.000000
-0.590657 0.438051 -1.000000
-0.098871 -0.023953 1.000000
-0.250001 0.141621 1.000000
-0.012998 0.525985 -1.000000
0.153738 0.491531 -1.000000
0.388215 -0.656567 -1.000000
0.049008 0.013499 1.000000
0.068286 0.392741 1.000000
0.747800 -0.066630 -1.000000
0.004621 -0.042932 1.000000
-0.701600 0.190983 -1.000000
0.055413 -0.024380 1.000000
0.035398 -0.333682 1.000000
0.211795 0.024689 1.000000
-0.045677 0.172907 1.000000
0.595222 0.209570 -1.000000
0.229465 0.250409 1.000000
-0.089293 0.068198 1.000000
0.384300 -0.176570 1.000000
0.834912 -0.110321 -1.000000
-0.307768 0.503038 -1.000000
-0.777063 -0.348066 -1.000000
0.017390 0.152441 1.000000
-0.293382 -0.139778 1.000000
-0.203272 0.286855 1.000000
0.957812 -0.152444 -1.000000
0.004609 -0.070617 1.000000
-0.755431 0.096711 -1.000000
-0.526487 0.547282 -1.000000
-0.246873 0.833713 -1.000000
0.185639 -0.066162 1.000000
0.851934 0.456603 -1.000000
-0.827912 0.117122 -1.000000
0.233512 -0.106274 1.000000
0.583671 -0.709033 -1.000000
-0.487023 0.625140 -1.000000
-0.448939 0.176725 1.000000
0.155907 -0.166371 1.000000
0.334204 0.381237 -1.000000
0.081536 -0.106212 1.000000
0.227222 0.527437 -1.000000
0.759290 0.330720 -1.000000
0.204177 -0.023516 1.000000
0.577939 0.403784 -1.000000
-0.568534 0.442948 -1.000000
-0.011520 0.021165 1.000000
0.875720 0.422476 -1.000000
0.297885 -0.632874 -1.000000
-0.015821 0.031226 1.000000
0.541359 -0.205969 -1.000000
-0.689946 -0.508674 -1.000000
-0.343049 0.841653 -1.000000
0.523902 -0.436156 -1.000000
0.249281 -0.711840 -1.000000
0.193449 0.574598 -1.000000
-0.257542 -0.753885 -1.000000
-0.021605 0.158080 1.000000
0.601559 -0.727041 -1.000000
-0.791603 0.095651 -1.000000
-0.908298 -0.053376 -1.000000
0.122020 0.850966 -1.000000
-0.725568 -0.292022 -1.000000
测试数据
3.542485 1.977398 -1
3.018896 2.556416 -1
7.551510 -1.580030 1
2.114999 -0.004466 -1
8.127113 1.274372 1
7.108772 -0.986906 1
8.610639 2.046708 1
2.326297 0.265213 -1
3.634009 1.730537 -1
0.341367 -0.894998 -1
3.125951 0.293251 -1
2.123252 -0.783563 -1
0.887835 -2.797792 -1
7.139979 -2.329896 1
1.696414 -1.212496 -1
8.117032 0.623493 1
8.497162 -0.266649 1
4.658191 3.507396 -1
8.197181 1.545132 1
1.208047 0.213100 -1
1.928486 -0.321870 -1
2.175808 -0.014527 -1
7.886608 0.461755 1
3.223038 -0.552392 -1
3.628502 2.190585 -1
7.407860 -0.121961 1
7.286357 0.251077 1
2.301095 -0.533988 -1
-0.232542 -0.547690 -1
3.457096 -0.082216 -1
3.023938 -0.057392 -1
8.015003 0.885325 1
8.991748 0.923154 1
7.916831 -1.781735 1
7.616862 -0.217958 1
2.450939 0.744967 -1
7.270337 -2.507834 1
1.749721 -0.961902 -1
1.803111 -0.176349 -1
8.804461 3.044301 1
1.231257 -0.568573 -1
2.074915 1.410550 -1
-0.743036 -1.736103 -1
3.536555 3.964960 -1
8.410143 0.025606 1
7.382988 -0.478764 1
6.960661 -0.245353 1
8.234460 0.701868 1
8.168618 -0.903835 1
1.534187 -0.622492 -1
9.229518 2.066088 1
7.886242 0.191813 1
2.893743 -1.643468 -1
1.870457 -1.040420 -1
5.286862 -2.358286 1
6.080573 0.418886 1
2.544314 1.714165 -1
6.016004 -3.753712 1
0.926310 -0.564359 -1
0.870296 -0.109952 -1
2.369345 1.375695 -1
1.363782 -0.254082 -1
7.279460 -0.189572 1
1.896005 0.515080 -1
8.102154 -0.603875 1
2.529893 0.662657 -1
1.963874 -0.365233 -1
8.132048 0.785914 1
8.245938 0.372366 1
6.543888 0.433164 1
-0.236713 -5.766721 -1
8.112593 0.295839 1
9.803425 1.495167 1
1.497407 -0.552916 -1
1.336267 -1.632889 -1
9.205805 -0.586480 1
1.966279 -1.840439 -1
8.398012 1.584918 1
7.239953 -1.764292 1
7.556201 0.241185 1
9.015509 0.345019 1
8.266085 -0.230977 1
8.545620 2.788799 1
9.295969 1.346332 1
2.404234 0.570278 -1
2.037772 0.021919 -1
1.727631 -0.453143 -1
1.979395 -0.050773 -1
8.092288 -1.372433 1
1.667645 0.239204 -1
9.854303 1.365116 1
7.921057 -1.327587 1
8.500757 1.492372 1
1.339746 -0.291183 -1
3.107511 0.758367 -1
2.609525 0.902979 -1
3.263585 1.367898 -1
2.912122 -0.202359 -1
1.731786 0.589096 -1
2.387003 1.573131 -1
原始测试数据
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 16:58:16 2018
支持向量机代码实现
SMO(Sequential Minimal Optimization)最小序列优化
@author: weixw
"""
import numpy as np
#核转换函数(一个特征空间映射到另一个特征空间,低维空间映射到高维空间)
#高维空间解决线性问题,低维空间解决非线性问题
#线性内核 = 原始数据矩阵(100*2)与原始数据第一行矩阵转秩乘积(2*1) =>(100*1)
#非线性内核公式:k(x,y) = exp(-||x - y||**2/2*(e**2))
#1.原始数据每一行与原始数据第一行作差,
#2.平方
def kernelTrans(dataMat, rowDataMat, kTup):
m,n=np.shape(dataMat)
#初始化核矩阵 m*1
K = np.mat(np.zeros((m,1)))
if kTup[0] == 'lin': #线性核
K = dataMat*rowDataMat.T
elif kTup[0] == 'rbf':#非线性核
for j in range(m):
#xi - xj
deltaRow = dataMat[j,:] - rowDataMat
K[j] = deltaRow*deltaRow.T
#1*m m*1 => 1*1
K = np.exp(K/(-2*kTup[1]**2))
else: raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
return K
#定义数据结构体,用于缓存,提高运行速度
class optStruct:
def __init__(self, dataSet, labelSet, C, toler, kTup):
self.dataMat = np.mat(dataSet) #原始数据,转换成m*n矩阵
self.labelMat = np.mat(labelSet).T #标签数据 m*1矩阵
self.C = C #惩罚参数,C越大,容忍噪声度小,需要优化;反之,容忍噪声度高,不需要优化;
#所有的拉格朗日乘子都被限制在了以C为边长的矩形里
self.toler = toler #容忍度
self.m = np.shape(self.dataMat)[0] #原始数据行长度
self.alphas = np.mat(np.zeros((self.m,1))) # alpha系数,m*1矩阵
self.b = 0 #偏置
self.eCache = np.mat(np.zeros((self.m,2))) # 保存原始数据每行的预测值
self.K = np.mat(np.zeros((self.m,self.m))) # 核转换矩阵 m*m
for i in range(self.m):
self.K[:,i] = kernelTrans(self.dataMat, self.dataMat[i,:], kTup)
#计算原始数据第k项对应的预测误差 1*m m*1 =>1*1
#oS:结构数据
#k: 原始数据行索引
def calEk(oS, k):
#f(x) = w*x + b
fXk = float(np.multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)
Ek = fXk - float(oS.labelMat[k])
return Ek
#在alpha有改变都要更新缓存
def updateEk(oS, k):
Ek = calEk(oS, k)
oS.eCache[k] = [1, Ek]
#第一次通过selectJrand()随机选取j,之后选取与i对应预测误差最大的j(步长最大)
def selectJ(i, oS, Ei):
#初始化
maxK = -1 #误差最大时对应索引
maxDeltaE = 0 #最大误差
Ej = 0 # j索引对应预测误差
#保存每一行的预测误差值 1相对于初始化为0的更改
oS.eCache[i] = [1,Ei]
#获取数据缓存结构中非0的索引列表(先将矩阵第0列转化为数组)
validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]
#遍历索引列表,寻找最大误差对应索引
if len(validEcacheList) > 1:
for k in validEcacheList:
if k == i:
continue
Ek = calEk(oS, k)
deltaE = abs(Ei - Ek)
if(deltaE > maxDeltaE):
maxK = k
maxDeltaE = deltaE
Ej = Ek
return maxK, Ej
else:
#随机选取一个不等于i的j
j = selectJrand(i, oS.m)
Ej = calEk(oS, j)
return j,Ej
#随机选取一个不等于i的索引
def selectJrand(i, m):
j = i
while (j == i):
j = int(np.random.uniform(0, m))
return j
#alpha范围剪辑
def clipAlpha(aj, L, H):
if aj > H:
aj = H
if aj < L:
aj = L
return aj
#从文件获取特征数据,标签数据
def loadDataSet(fileName):
dataSet = []; labelSet = []
fr = open(fileName)
for line in fr.readlines():
#分割
lineArr = line.strip().split('\t')
dataSet.append([float(lineArr[0]), float(lineArr[1])])
labelSet.append(float(lineArr[2]))
return dataSet, labelSet
#计算 w 权重系数
def calWs(alphas, dataSet, labelSet):
dataMat = np.mat(dataSet)
#1*100 => 100*1
labelMat = np.mat(labelSet).T
m, n = np.shape(dataMat)
w = np.zeros((n, 1))
for i in range(m):
w += np.multiply(alphas[i]*labelMat[i], dataMat[i,:].T)
return w
#计算原始数据每一行alpha,b,保存到数据结构中,有变化及时更新
def innerL(i, oS):
#计算预测误差
Ei = calEk(oS, i)
#选择第一个alpha,违背KKT条件2
#正间隔,负间隔
if ((oS.labelMat[i] * Ei < -oS.toler) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.toler) and (oS.alphas[i] > 0)):
#第一次随机选取不等于i的数据项,其后根据误差最大选取数据项
j, Ej = selectJ(i, oS, Ei)
#初始化,开辟新的内存
alphaIold = oS.alphas[i].copy()
alphaJold = oS.alphas[j].copy()
#通过 a1y1 + a2y2 = 常量
# 0 <= a1,a2 <= C 求出L,H
if oS.labelMat[i] != oS.labelMat[j]:
L = max(0, oS.alphas[j] - oS.alphas[i])
H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
else:
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
H = min(oS.C, oS.alphas[j] + oS.alphas[i])
if L == H :
print ("L == H")
return 0
#内核分母 K11 + K22 - 2K12
eta = oS.K[i, i] + oS.K[j, j] - 2.0*oS.K[i, j]
if eta <= 0:
print ("eta <= 0")
return 0
#计算第一个alpha j
oS.alphas[j] += oS.labelMat[j]*(Ei - Ej)/eta
#修正alpha j的范围
oS.alphas[j] = clipAlpha(oS.alphas[j], L, H)
#alpha有改变,就需要更新缓存数据
updateEk(oS, j)
#如果优化后的alpha 与之前的alpha变化很小,则舍弃,并重新选择数据项的alpha
if (abs(oS.alphas[j] - alphaJold) < 0.00001):
print ("j not moving enough, abandon it.")
return 0
#计算alpha对的另一个alpha i
# ai_new*yi + aj_new*yj = 常量
# ai_old*yi + ai_old*yj = 常量
# 作差=> ai = ai_old + yi*yj*(aj_old - aj_new)
oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
#alpha有改变,就需要更新缓存数据
updateEk(oS, i)
#计算b1,b2
# y(x) = w*x + b => b = y(x) - w*x
# w = aiyixi(i= 1->N求和)
#b1_new = y1_new - (a1_new*y1*k11 + a2_new*y2*k21 + ai*yi*ki1(i = 3 ->N求和 常量))
#b1_old = y1_old - (a1_old*y1*k11 + a2_old*y2*k21 + ai*yi*ki1(i = 3 ->N求和 常量))
#作差=> b1_new = b1_old + (y1_new - y1_old) - y1*k11*(a1_new - a1_old) - y2*k21*(a2_new - a2_old)
# => b1_new = b1_old + Ei - yi*(ai_new - ai_old)*kii - yj*(aj_new - aj_old)*kij
#同样可推得 b2_new = b2_old + Ej - yi*(ai_new - ai_old)*kij - yj*(aj_new - aj_old)*kjj
bi = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[i,j]
bj = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[j,j]
#首选alpha i,相对alpha j 更准确
if (0 < oS.alphas[i]) and (oS.alphas[i] < oS.C):
oS.b = bi
elif (0 < oS.alphas[j]) and (oS.alphas[j] < oS.C):
oS.b = bj
else:
oS.b = (bi + bj)/2.0
return 1
else:
return 0
#完整SMO核心算法,包含线性核核非线性核,返回alpha,b
#dataSet 原始特征数据
#labelSet 标签数据
#C 凸二次规划参数
#toler 容忍度
#maxInter 循环次数
#kTup 指定核方式
#程序逻辑:
#第一次全部遍历,遍历后根据alpha对是否有修改判断,
#如果alpha对没有修改,外循环终止;如果alpha对有修改,则继续遍历属于支持向量的数据。
#直至外循环次数达到maxIter
#相比简单SMO算法,运行速度更快,原因是:
#1.不是每一次都全量遍历原始数据,第一次遍历原始数据,
#如果alpha有优化,就遍历支持向量数据,直至alpha没有优化,然后再转全量遍历,这是如果alpha没有优化,循环结束;
#2.外循环不需要达到maxInter次数就终止;
def smoP(dataSet, labelSet, C, toler, maxInter, kTup = ('lin', 0)):
#初始化结构体类,获取实例
oS = optStruct(dataSet, labelSet, C, toler, kTup)
iter = 0
#全量遍历标志
entireSet = True
#alpha对是否优化标志
alphaPairsChanged = 0
#外循环 终止条件:1.达到最大次数 或者 2.alpha对没有优化
while (iter < maxInter) and ((alphaPairsChanged > 0) or (entireSet)):
alphaPairsChanged = 0
#全量遍历 ,遍历每一行数据 alpha对有修改,alphaPairsChanged累加
if entireSet:
for i in range(oS.m):
alphaPairsChanged += innerL(i, oS)
print ("fullSet, iter: %d i:%d, pairs changed %d" %(iter, i, alphaPairsChanged))
iter += 1
else:
#获取(0,C)范围内数据索引列表,也就是只遍历属于支持向量的数据
nonBounds = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
for i in nonBounds:
alphaPairsChanged += innerL(i, oS)
print ("non-bound, iter: %d i:%d, pairs changed %d" %(iter, i, alphaPairsChanged))
iter += 1
#全量遍历->支持向量遍历
if entireSet:
entireSet = False
#支持向量遍历->全量遍历
elif alphaPairsChanged == 0:
entireSet = True
print ("iteation number: %d"% iter)
print ("entireSet :%s"% entireSet)
print ("alphaPairsChanged :%d"% alphaPairsChanged)
return oS.b,oS.alphas
#绘制支持向量
def drawDataMap(dataArr,labelArr,b,alphas):
import matplotlib.pyplot as plt
#alphas.A>0 获取大于0的索引列表,只有>0的alpha才对分类起作用
svInd=np.nonzero(alphas.A>0)[0]
#分类数据点
classified_pts = {'+1':[],'-1':[]}
for point,label in zip(dataArr,labelArr):
if label == 1.0:
classified_pts['+1'].append(point)
else:
classified_pts['-1'].append(point)
fig = plt.figure()
ax = fig.add_subplot(111)
#绘制数据点
for label,pts in classified_pts.items():
pts = np.array(pts)
ax.scatter(pts[:, 0], pts[:, 1], label = label)
#绘制分割线
w = calWs(alphas, dataArr, labelArr)
#函数形式:max( x ,key=lambda a : b ) # x可以是任何数值,可以有多个x值
#先把x值带入lambda函数转换成b值,然后再将b值进行比较
x1, _=max(dataArr, key=lambda x:x[0])
x2, _=min(dataArr, key=lambda x:x[0])
a1, a2 = w
y1, y2 = (-b - a1*x1)/a2, (-b - a1*x2)/a2
#矩阵转化为数组.A
ax.plot([x1, x2],[y1.A[0][0], y2.A[0][0]])
#绘制支持向量
for i in svInd:
x, y= dataArr[i]
ax.scatter([x], [y], s=150, c ='none', alpha=0.7, linewidth=1.5, edgecolor = '#AB3319')
plt.show()
#alpha>0对应的数据才是支持向量,过滤不是支持向量的数据
sVs= np.mat(dataArr)[svInd] #get matrix of only support vectors
print ("there are %d Support Vectors.\n" % np.shape(sVs)[0])
#训练结果
def getTrainingDataResult(dataSet, labelSet, b, alphas, k1=1.3):
datMat = np.mat(dataSet)
#100*1
labelMat = np.mat(labelSet).T
#alphas.A>0 获取大于0的索引列表,只有>0的alpha才对分类起作用
svInd=np.nonzero(alphas.A>0)[0]
sVs=datMat[svInd]
labelSV = labelMat[svInd];
m,n = np.shape(datMat)
errorCount = 0
for i in range(m):
kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
# y(x) = w*x + b => b = y(x) - w*x
# w = aiyixi(i= 1->N求和)
predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
if np.sign(predict)!=np.sign(labelSet[i]): errorCount += 1
print ("the training error rate is: %f" % (float(errorCount)/m))
def getTestDataResult(dataSet, labelSet, b, alphas, k1=1.3):
datMat = np.mat(dataSet)
#100*1
labelMat = np.mat(labelSet).T
#alphas.A>0 获取大于0的索引列表,只有>0的alpha才对分类起作用
svInd=np.nonzero(alphas.A>0)[0]
sVs=datMat[svInd]
labelSV = labelMat[svInd];
m,n = np.shape(datMat)
errorCount = 0
for i in range(m):
kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
# y(x) = w*x + b => b = y(x) - w*x
# w = aiyixi(i= 1->N求和)
predict=kernelEval.T * np.multiply(labelSV,alphas[svInd]) + b
if np.sign(predict)!=np.sign(labelSet[i]): errorCount += 1
print ("the test error rate is: %f" % (float(errorCount)/m))
SMO算法实现
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 15:22:26 2018
@author: weixw
"""
import mySVMMLiA as sm
#通过训练数据计算 b, alphas
dataArr,labelArr = sm.loadDataSet('trainingData.txt')
b, alphas = sm.smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', 0.10))
sm.drawDataMap(dataArr,labelArr,b,alphas)
sm.getTrainingDataResult(dataArr, labelArr, b, alphas, 0.10)
dataArr1,labelArr1 = sm.loadDataSet('testData.txt')
#测试结果
sm.getTestDataResult(dataArr1, labelArr1, b, alphas, 0.10)
测试代码
来源:CSDN
作者:刘根生
链接:https://blog.csdn.net/qq_24163555/article/details/103605222