支持向量机python代码实现

*爱你&永不变心* 提交于 2019-12-19 01:05:06

训练数据

-0.214824    0.662756    -1.000000
-0.061569    -0.091875    1.000000
0.406933    0.648055    -1.000000
0.223650    0.130142    1.000000
0.231317    0.766906    -1.000000
-0.748800    -0.531637    -1.000000
-0.557789    0.375797    -1.000000
0.207123    -0.019463    1.000000
0.286462    0.719470    -1.000000
0.195300    -0.179039    1.000000
-0.152696    -0.153030    1.000000
0.384471    0.653336    -1.000000
-0.117280    -0.153217    1.000000
-0.238076    0.000583    1.000000
-0.413576    0.145681    1.000000
0.490767    -0.680029    -1.000000
0.199894    -0.199381    1.000000
-0.356048    0.537960    -1.000000
-0.392868    -0.125261    1.000000
0.353588    -0.070617    1.000000
0.020984    0.925720    -1.000000
-0.475167    -0.346247    -1.000000
0.074952    0.042783    1.000000
0.394164    -0.058217    1.000000
0.663418    0.436525    -1.000000
0.402158    0.577744    -1.000000
-0.449349    -0.038074    1.000000
0.619080    -0.088188    -1.000000
0.268066    -0.071621    1.000000
-0.015165    0.359326    1.000000
0.539368    -0.374972    -1.000000
-0.319153    0.629673    -1.000000
0.694424    0.641180    -1.000000
0.079522    0.193198    1.000000
0.253289    -0.285861    1.000000
-0.035558    -0.010086    1.000000
-0.403483    0.474466    -1.000000
-0.034312    0.995685    -1.000000
-0.590657    0.438051    -1.000000
-0.098871    -0.023953    1.000000
-0.250001    0.141621    1.000000
-0.012998    0.525985    -1.000000
0.153738    0.491531    -1.000000
0.388215    -0.656567    -1.000000
0.049008    0.013499    1.000000
0.068286    0.392741    1.000000
0.747800    -0.066630    -1.000000
0.004621    -0.042932    1.000000
-0.701600    0.190983    -1.000000
0.055413    -0.024380    1.000000
0.035398    -0.333682    1.000000
0.211795    0.024689    1.000000
-0.045677    0.172907    1.000000
0.595222    0.209570    -1.000000
0.229465    0.250409    1.000000
-0.089293    0.068198    1.000000
0.384300    -0.176570    1.000000
0.834912    -0.110321    -1.000000
-0.307768    0.503038    -1.000000
-0.777063    -0.348066    -1.000000
0.017390    0.152441    1.000000
-0.293382    -0.139778    1.000000
-0.203272    0.286855    1.000000
0.957812    -0.152444    -1.000000
0.004609    -0.070617    1.000000
-0.755431    0.096711    -1.000000
-0.526487    0.547282    -1.000000
-0.246873    0.833713    -1.000000
0.185639    -0.066162    1.000000
0.851934    0.456603    -1.000000
-0.827912    0.117122    -1.000000
0.233512    -0.106274    1.000000
0.583671    -0.709033    -1.000000
-0.487023    0.625140    -1.000000
-0.448939    0.176725    1.000000
0.155907    -0.166371    1.000000
0.334204    0.381237    -1.000000
0.081536    -0.106212    1.000000
0.227222    0.527437    -1.000000
0.759290    0.330720    -1.000000
0.204177    -0.023516    1.000000
0.577939    0.403784    -1.000000
-0.568534    0.442948    -1.000000
-0.011520    0.021165    1.000000
0.875720    0.422476    -1.000000
0.297885    -0.632874    -1.000000
-0.015821    0.031226    1.000000
0.541359    -0.205969    -1.000000
-0.689946    -0.508674    -1.000000
-0.343049    0.841653    -1.000000
0.523902    -0.436156    -1.000000
0.249281    -0.711840    -1.000000
0.193449    0.574598    -1.000000
-0.257542    -0.753885    -1.000000
-0.021605    0.158080    1.000000
0.601559    -0.727041    -1.000000
-0.791603    0.095651    -1.000000
-0.908298    -0.053376    -1.000000
0.122020    0.850966    -1.000000
-0.725568    -0.292022    -1.000000

测试数据

3.542485    1.977398    -1
3.018896    2.556416    -1
7.551510    -1.580030    1
2.114999    -0.004466    -1
8.127113    1.274372    1
7.108772    -0.986906    1
8.610639    2.046708    1
2.326297    0.265213    -1
3.634009    1.730537    -1
0.341367    -0.894998    -1
3.125951    0.293251    -1
2.123252    -0.783563    -1
0.887835    -2.797792    -1
7.139979    -2.329896    1
1.696414    -1.212496    -1
8.117032    0.623493    1
8.497162    -0.266649    1
4.658191    3.507396    -1
8.197181    1.545132    1
1.208047    0.213100    -1
1.928486    -0.321870    -1
2.175808    -0.014527    -1
7.886608    0.461755    1
3.223038    -0.552392    -1
3.628502    2.190585    -1
7.407860    -0.121961    1
7.286357    0.251077    1
2.301095    -0.533988    -1
-0.232542    -0.547690    -1
3.457096    -0.082216    -1
3.023938    -0.057392    -1
8.015003    0.885325    1
8.991748    0.923154    1
7.916831    -1.781735    1
7.616862    -0.217958    1
2.450939    0.744967    -1
7.270337    -2.507834    1
1.749721    -0.961902    -1
1.803111    -0.176349    -1
8.804461    3.044301    1
1.231257    -0.568573    -1
2.074915    1.410550    -1
-0.743036    -1.736103    -1
3.536555    3.964960    -1
8.410143    0.025606    1
7.382988    -0.478764    1
6.960661    -0.245353    1
8.234460    0.701868    1
8.168618    -0.903835    1
1.534187    -0.622492    -1
9.229518    2.066088    1
7.886242    0.191813    1
2.893743    -1.643468    -1
1.870457    -1.040420    -1
5.286862    -2.358286    1
6.080573    0.418886    1
2.544314    1.714165    -1
6.016004    -3.753712    1
0.926310    -0.564359    -1
0.870296    -0.109952    -1
2.369345    1.375695    -1
1.363782    -0.254082    -1
7.279460    -0.189572    1
1.896005    0.515080    -1
8.102154    -0.603875    1
2.529893    0.662657    -1
1.963874    -0.365233    -1
8.132048    0.785914    1
8.245938    0.372366    1
6.543888    0.433164    1
-0.236713    -5.766721    -1
8.112593    0.295839    1
9.803425    1.495167    1
1.497407    -0.552916    -1
1.336267    -1.632889    -1
9.205805    -0.586480    1
1.966279    -1.840439    -1
8.398012    1.584918    1
7.239953    -1.764292    1
7.556201    0.241185    1
9.015509    0.345019    1
8.266085    -0.230977    1
8.545620    2.788799    1
9.295969    1.346332    1
2.404234    0.570278    -1
2.037772    0.021919    -1
1.727631    -0.453143    -1
1.979395    -0.050773    -1
8.092288    -1.372433    1
1.667645    0.239204    -1
9.854303    1.365116    1
7.921057    -1.327587    1
8.500757    1.492372    1
1.339746    -0.291183    -1
3.107511    0.758367    -1
2.609525    0.902979    -1
3.263585    1.367898    -1
2.912122    -0.202359    -1
1.731786    0.589096    -1
2.387003    1.573131    -1

原始测试数据
# -*- coding: utf-8 -*-
"""
Created on Tue Sep  4 16:58:16 2018
支持向量机代码实现
SMO(Sequential Minimal Optimization)最小序列优化
@author: weixw
"""
import numpy as np
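# Kernel transform: maps samples from the input feature space into another
# (usually higher-dimensional) feature space.
# A problem that is not linearly separable in the low-dimensional space can
# become linearly separable in the high-dimensional one.
# Linear kernel: data matrix (m*n, e.g. 100*2) times the transposed sample row
# (n*1, e.g. 2*1) => m*1 (e.g. 100*1).
# RBF (non-linear) kernel: k(x, y) = exp(-||x - y||**2 / (2 * sigma**2)),
# where sigma is kTup[1].
# 1. subtract the sample row from every row of the data matrix,
# 2. take the squared length of each difference,
# 3. divide by -2 * sigma**2 and apply exp.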
def kernelTrans(dataMat, rowDataMat, kTup):
    """Evaluate the kernel between every row of ``dataMat`` and one sample.

    Args:
        dataMat: m x n matrix (or array-like) with one sample per row.
        rowDataMat: 1 x n matrix (or array-like) holding the single sample.
        kTup: Kernel description. ``kTup[0]`` is the kernel name, ``'lin'``
            or ``'rbf'``; for ``'rbf'``, ``kTup[1]`` is the width sigma.

    Returns:
        An m x 1 ``np.matrix`` whose j-th entry is
        ``k(dataMat[j], rowDataMat)``.

    Raises:
        NameError: If the kernel name is not recognised (the exception type
            is kept unchanged so existing callers keep working).
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMat = np.asmatrix(dataMat)
    rowDataMat = np.asmatrix(rowDataMat)
    kernelName = kTup[0]
    if kernelName == 'lin':
        # (m x n) * (n x 1) => m x 1 inner products.
        return dataMat * rowDataMat.T
    if kernelName == 'rbf':
        # Squared Euclidean distance of every row to the sample, computed for
        # all rows at once through broadcasting.
        delta = np.asarray(dataMat) - np.asarray(rowDataMat)
        sqDist = np.asmatrix(np.sum(delta * delta, axis=1)).T
        return np.exp(sqDist / (-2 * kTup[1] ** 2))
    raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
        
#定义数据结构体,用于缓存,提高运行速度
class optStruct:
    """Mutable working state shared by the SMO routines.

    Holds the training data, the current multipliers and bias, an error
    cache, and the kernel matrix computed once up front so that the inner
    loop can look kernel values up instead of recomputing them.
    """

    def __init__(self, dataSet, labelSet, C, toler, kTup):
        """Build the state and fill the m x m kernel matrix.

        Args:
            dataSet: Sequence of m feature rows.
            labelSet: Sequence of m labels.
            C: Upper bound applied to every alpha.
            toler: Tolerance used when checking the KKT conditions.
            kTup: Kernel description forwarded to ``kernelTrans``.
        """
        # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
        self.dataMat = np.asmatrix(dataSet)  # m x n feature matrix
        self.labelMat = np.asmatrix(labelSet).T  # m x 1 label column
        self.C = C  # every alpha is kept inside [0, C]
        self.toler = toler
        self.m = np.shape(self.dataMat)[0]  # number of samples
        self.alphas = np.asmatrix(np.zeros((self.m, 1)))  # m x 1 multipliers
        self.b = 0  # bias term
        # Error cache: column 0 is a "valid" flag, column 1 the cached error.
        self.eCache = np.asmatrix(np.zeros((self.m, 2)))
        # Kernel matrix: column i holds k(x_j, x_i) for every sample j.
        self.K = np.asmatrix(np.zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.dataMat, self.dataMat[i, :], kTup)
            
#计算原始数据第k项对应的预测误差  1*m m*1 =>1*1  
#oS:结构数据
#k: 原始数据行索引           
def calEk(oS, k):
    """Return the prediction error ``E_k = f(x_k) - y_k`` for sample ``k``.

    Args:
        oS: State object providing ``alphas``, ``labelMat``, ``K`` and ``b``.
        k: Row index of the sample.

    Returns:
        The error as a Python ``float``.
    """
    # f(x_k) = sum_i(alpha_i * y_i * K[i, k]) + b, obtained as a 1 x 1 matrix.
    fXk = np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b
    # Index the single element explicitly: implicit conversion of a size-1
    # array with ndim > 0 to a scalar is deprecated since NumPy 1.25.
    return float(fXk[0, 0]) - float(oS.labelMat[k, 0])

#在alpha有改变都要更新缓存
def updateEk(oS, k):
    """Recompute the error of sample ``k`` and store it as valid in the cache.

    Meant to be called after an alpha has changed, so the cached error of
    the affected sample reflects the new model.
    """
    # Column 0 = valid flag, column 1 = freshly computed error.
    oS.eCache[k] = [1, calEk(oS, k)]
    

#第一次通过selectJrand()随机选取j,之后选取与i对应预测误差最大的j(步长最大)
def selectJ(i, oS, Ei):
    """Choose the second multiplier index ``j`` for the pair ``(i, j)``.

    Among the samples whose error-cache entry is flagged valid, the one
    maximising ``|Ei - Ek|`` is chosen. If no such sample exists, ``j`` is
    drawn at random.

    Args:
        i: Index of the first multiplier.
        oS: Shared SMO state.
        Ei: Prediction error of sample ``i``.

    Returns:
        Tuple ``(j, Ej)`` with ``j != i`` and ``Ej`` the error of sample ``j``.
    """
    # Flag the entry of i as valid and remember its error.
    oS.eCache[i] = [1, Ei]
    # Row indices whose valid flag (column 0) is non-zero.
    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]
    if len(validEcacheList) > 1:
        maxK = -1
        maxDeltaE = 0
        Ej = 0
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        # Only trust the search when it actually found a candidate. If every
        # candidate had deltaE == 0, maxK is still -1; returning it would
        # silently address the last sample with a bogus Ej of 0.
        if maxK >= 0:
            return maxK, Ej
    # No usable cached candidate: pick any index different from i.
    j = selectJrand(i, oS.m)
    return j, calEk(oS, j)

#随机选取一个不等于i的索引          
def selectJrand(i, m):
    """Return a random index in ``[0, m)`` that differs from ``i``.

    Args:
        i: Index to avoid.
        m: Number of samples.

    Returns:
        A Python ``int`` different from ``i``.

    Raises:
        ValueError: If no index other than ``i`` can be drawn (``m`` is 0 or
            1 and ``i`` is 0); the original code looped forever in that case.
    """
    if m in (0, 1) and i == 0:
        raise ValueError("selectJrand needs at least two samples to pick j != i")
    # np.random.uniform may return the upper bound itself because of
    # floating-point rounding, so the draw is clamped to the last valid index.
    lastIndex = max(int(m) - 1, 0)
    j = i
    while j == i:
        j = min(int(np.random.uniform(0, m)), lastIndex)
    return j

#alpha范围剪辑
def clipAlpha(aj, L, H):
    """Clamp ``aj`` to the interval bounded by ``L`` and ``H``.

    The upper bound is applied first and the lower bound second, so when
    ``L > H`` the result is ``L``.
    """
    cappedAbove = H if aj > H else aj
    return L if cappedAbove < L else cappedAbove

#从文件获取特征数据,标签数据
def loadDataSet(fileName):
    """Read a three-column text file into feature rows and labels.

    Each non-blank line must hold three numbers separated by tabs or any
    other whitespace: two feature values followed by the label.

    Args:
        fileName: Path of the file to read.

    Returns:
        Tuple ``(dataSet, labelSet)``: a list of ``[x0, x1]`` float pairs and
        the list of float labels, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be parsed as a float.
    """
    dataSet = []
    labelSet = []
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(fileName) as fr:
        for line in fr:
            # split() without arguments accepts tabs as well as runs of spaces.
            lineArr = line.split()
            if not lineArr:
                # Skip blank lines (e.g. a trailing newline at the end of the file).
                continue
            dataSet.append([float(lineArr[0]), float(lineArr[1])])
            labelSet.append(float(lineArr[2]))
    return dataSet, labelSet

#计算 w 权重系数
def calWs(alphas, dataSet, labelSet):
    """Compute the weight vector ``w = sum_i(alpha_i * y_i * x_i)``.

    Args:
        alphas: m multipliers (m x 1 matrix or any array-like of m values),
            one per row of ``dataSet``.
        dataSet: Sequence of m feature rows with n features each.
        labelSet: Sequence of m labels.

    Returns:
        An ``n x 1`` float ``np.ndarray`` holding ``w``.
    """
    # Plain arrays are used throughout; the np.mat alias the original relied
    # on was removed in NumPy 2.0.
    dataArr = np.atleast_2d(np.asarray(dataSet, dtype=float))  # m x n
    labelCol = np.asarray(labelSet, dtype=float).reshape(-1, 1)  # m x 1
    alphaCol = np.asarray(alphas, dtype=float).reshape(-1, 1)  # m x 1
    # (n x m) . (m x 1): one product instead of a Python loop over the rows.
    return np.dot(dataArr.T, alphaCol * labelCol)
#计算原始数据每一行alpha,b,保存到数据结构中,有变化及时更新       
def innerL(i, oS):
    """Try to optimise the multiplier pair (alpha_i, alpha_j) for sample ``i``.

    If alpha_i violates the KKT conditions by more than ``oS.toler``, a
    partner ``j`` is chosen with ``selectJ``, both multipliers are updated,
    and ``oS.b`` and the error cache are refreshed.

    Args:
        i: Index of the first multiplier.
        oS: Shared SMO state; ``alphas``, ``b`` and ``eCache`` are modified
            in place.

    Returns:
        1 if the pair was updated, 0 otherwise.
    """
    # Prediction error of sample i under the current model.
    Ei = calEk(oS, i)
    # Proceed only if alpha_i violates the KKT conditions:
    # y_i*E_i < -toler while alpha_i can still grow (alpha_i < C), or
    # y_i*E_i >  toler while alpha_i can still shrink (alpha_i > 0).
    if ((oS.labelMat[i] * Ei < -oS.toler) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.toler) and (oS.alphas[i] > 0)):
        # Choose the partner j (largest |Ei - Ej| among cached errors, else random).
        j, Ej = selectJ(i, oS, Ei)
        # Copy the old values: oS.alphas[i] / oS.alphas[j] are modified below.
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        # Bounds L, H for the new alpha_j, derived from
        #   alpha_i*y_i + alpha_j*y_j = constant  and  0 <= alpha_i, alpha_j <= C.
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        # Degenerate interval: alpha_j cannot move.
        if L == H : 
            print ("L == H")
            return 0
        # eta = K_ii + K_jj - 2*K_ij, the denominator of the alpha_j step.
        eta = oS.K[i, i] + oS.K[j, j] - 2.0*oS.K[i, j]
        if eta <= 0:
            print ("eta <= 0")
            return 0
        # Unconstrained step for alpha_j.
        oS.alphas[j] += oS.labelMat[j]*(Ei - Ej)/eta
        # Clip alpha_j into [L, H].
        oS.alphas[j] = clipAlpha(oS.alphas[j], L, H)
        # alpha_j changed, so refresh its cached error.
        updateEk(oS, j)
        # Give up on this pair if alpha_j barely moved.
        # NOTE(review): on this path alpha_j keeps its (slightly) changed value
        # while alpha_i is not adjusted to compensate.
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print ("j not moving enough, abandon it.")
            return 0
        # Update alpha_i so that alpha_i*y_i + alpha_j*y_j stays constant:
        #   ai_new*yi + aj_new*yj = constant
        #   ai_old*yi + aj_old*yj = constant
        #   => ai_new = ai_old + yi*yj*(aj_old - aj_new)
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        # alpha_i changed, so refresh its cached error.
        updateEk(oS, i)
        # Candidate biases bi and bj, as computed by the two lines below:
        #   bi = b_old - Ei - yi*(ai_new - ai_old)*K_ii - yj*(aj_new - aj_old)*K_ij
        #   bj = b_old - Ej - yi*(ai_new - ai_old)*K_ij - yj*(aj_new - aj_old)*K_jj
        bi = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[i,j]
        bj = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[j,j]
        # Use bi when alpha_i lies strictly inside (0, C), else bj when alpha_j
        # does, else the average of the two.
        if (0 < oS.alphas[i]) and (oS.alphas[i] < oS.C):
            oS.b = bi
        elif (0 < oS.alphas[j]) and (oS.alphas[j] < oS.C):
            oS.b = bj
        else:
            oS.b = (bi + bj)/2.0
        return 1
    else:
        # alpha_i satisfies the KKT conditions within tolerance: nothing to do.
        return 0
    
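# Full SMO driver; supports the linear and the RBF kernel and returns b and alphas.
# dataSet   feature data
# labelSet  label data
# C         penalty parameter of the convex quadratic programme
# toler     tolerance
# maxInter  maximum number of outer-loop passes
# kTup      kernel specification
# Program logic:
# The first pass sweeps over all samples. If it changed no alpha pair, the outer
# loop stops; otherwise the following passes visit only the non-bound samples
# (0 < alpha < C, i.e. support-vector candidates).
# Non-bound passes repeat while they keep changing alpha pairs; once one changes
# nothing, a full sweep is run again. The loop also stops after maxInter passes.
# This is faster than the simplified SMO algorithm because:
# 1. not every pass sweeps the whole data set: after a full sweep that changed
#    something, only the non-bound samples are visited until nothing changes,
#    then a full sweep follows, and if that changes nothing the loop ends;
# 2. the outer loop can stop before maxInter passes have been made.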
def smoP(dataSet, labelSet, C, toler, maxInter, kTup = ('lin', 0)):
    """Train an SVM with the full SMO outer loop.

    Passes alternate between a sweep over every sample and sweeps over the
    non-bound samples only (``0 < alpha < C``). Training stops after
    ``maxInter`` passes, or when a full sweep changes no alpha pair.

    Args:
        dataSet: Sequence of feature rows.
        labelSet: Sequence of labels.
        C: Upper bound for every alpha.
        toler: KKT tolerance.
        maxInter: Maximum number of passes.
        kTup: Kernel description forwarded to ``optStruct``.

    Returns:
        Tuple ``(b, alphas)`` taken from the final state.
    """
    oS = optStruct(dataSet, labelSet, C, toler, kTup)
    passCount = 0
    entireSet = True
    alphaPairsChanged = 0
    while passCount < maxInter and (entireSet or alphaPairsChanged > 0):
        alphaPairsChanged = 0
        # Decide which samples this pass visits and how progress is reported.
        if entireSet:
            candidates = range(oS.m)
            progressFmt = "fullSet, iter: %d i:%d, pairs changed %d"
        else:
            insideBox = (oS.alphas.A > 0) * (oS.alphas.A < C)
            candidates = np.nonzero(insideBox)[0]
            progressFmt = "non-bound, iter: %d i:%d, pairs changed %d"
        for i in candidates:
            alphaPairsChanged += innerL(i, oS)
            print (progressFmt % (passCount, i, alphaPairsChanged))
        passCount += 1
        # A full sweep is always followed by a non-bound sweep; a non-bound
        # sweep that changed nothing hands control back to a full sweep.
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print ("iteation number: %d"% passCount)
        print ("entireSet :%s"% entireSet)
        print ("alphaPairsChanged :%d"% alphaPairsChanged)
    return oS.b, oS.alphas

#绘制支持向量
def drawDataMap(dataArr,labelArr,b,alphas):
    """Plot the samples, the line ``w.x + b = 0`` and the support vectors.

    Args:
        dataArr: Sequence of two-feature rows ``[x0, x1]``.
        labelArr: Sequence of labels; ``1.0`` is plotted as class '+1',
            anything else as class '-1'.
        b: Bias, either a plain number or a 1 x 1 matrix.
        alphas: Multipliers, one per row of ``dataArr``.

    NOTE(review): the line is built from ``calWs``, i.e. the linear-kernel
    weight vector; for a model trained with the RBF kernel it is presumably
    not the actual decision boundary.
    """
    import matplotlib.pyplot as plt
    # Rows with alpha > 0 are the support vectors.
    svInd = np.nonzero(np.asarray(alphas) > 0)[0]
    # Group the points by class.
    classified_pts = {'+1': [], '-1': []}
    for point, label in zip(dataArr, labelArr):
        key = '+1' if label == 1.0 else '-1'
        classified_pts[key].append(point)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for label, pts in classified_pts.items():
        if not pts:
            # An empty class has nothing to plot (and cannot be sliced as 2-D).
            continue
        pts = np.array(pts)
        ax.scatter(pts[:, 0], pts[:, 1], label=label)
    # Separating line through the smallest and largest x0 of the data.
    w = calWs(alphas, dataArr, labelArr)
    a1, a2 = np.asarray(w, dtype=float).ravel()
    # b may be a plain number or a 1 x 1 matrix; reduce it to one scalar so the
    # line can be computed either way.
    bVal = np.asarray(b, dtype=float).ravel()[0]
    x1 = max(point[0] for point in dataArr)
    x2 = min(point[0] for point in dataArr)
    y1 = (-bVal - a1 * x1) / a2
    y2 = (-bVal - a1 * x2) / a2
    ax.plot([x1, x2], [y1, y2])

    # Circle the support vectors.
    for i in svInd:
        x, y = dataArr[i]
        ax.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='#AB3319')
    plt.show()

    # One support vector per index with alpha > 0.
    print ("there are %d Support Vectors.\n" % len(svInd))
    
#训练结果    
def getTrainingDataResult(dataSet, labelSet, b, alphas, k1=1.3):
    """Print and return the error rate of the model on its training set.

    The support vectors are the rows of ``dataSet`` whose alpha is positive.
    Predictions always use the RBF kernel with width ``k1``.

    Args:
        dataSet: Training feature rows (the data the model was fitted on).
        labelSet: Training labels.
        b: Bias returned by ``smoP``.
        alphas: m x 1 matrix of multipliers returned by ``smoP``.
        k1: RBF kernel width.

    Returns:
        The fraction of samples whose predicted sign differs from the label.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    datMat = np.asmatrix(dataSet)
    labelMat = np.asmatrix(labelSet).T
    # Only samples with alpha > 0 contribute to the decision function.
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    # alpha_i * y_i of every support vector; identical for all predictions,
    # so it is computed once outside the loop.
    svWeights = np.multiply(labelMat[svInd], alphas[svInd])
    m = np.shape(datMat)[0]
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        # f(x) = sum_sv(alpha * y * k(sv, x)) + b, a 1 x 1 matrix.
        predict = kernelEval.T * svWeights + b
        if np.sign(predict[0, 0]) != np.sign(labelSet[i]):
            errorCount += 1
    errorRate = float(errorCount) / m
    print ("the training error rate is: %f" % errorRate)
    return errorRate
    
def getTestDataResult(dataSet, labelSet, b, alphas, k1=1.3, trainDataSet=None, trainLabelSet=None):
    """Print and return the error rate of the model on a test set.

    Predictions always use the RBF kernel with width ``k1``.

    Args:
        dataSet: Test feature rows.
        labelSet: Test labels.
        b: Bias returned by ``smoP``.
        alphas: Multipliers returned by ``smoP`` (one per TRAINING sample).
        k1: RBF kernel width.
        trainDataSet: Training feature rows the model was fitted on. The
            support vectors are taken from here.
        trainLabelSet: Training labels matching ``trainDataSet``.

    Returns:
        The fraction of test samples whose predicted sign differs from the
        label.

    NOTE: ``alphas`` indexes training samples, so the support vectors must
    come from the training set. When ``trainDataSet`` / ``trainLabelSet``
    are omitted, the original behaviour is kept for backward compatibility:
    rows and labels of the TEST set at the support-vector indices are used,
    which does not evaluate the trained model correctly. A warning is
    issued in that case.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    datMat = np.asmatrix(dataSet)
    if trainDataSet is None or trainLabelSet is None:
        import warnings
        warnings.warn(
            "getTestDataResult called without training data: support vectors "
            "are taken from the test set, so the reported error rate is not "
            "a valid evaluation of the trained model",
            RuntimeWarning,
        )
        svSource = datMat
        svLabels = np.asmatrix(labelSet).T
    else:
        svSource = np.asmatrix(trainDataSet)
        svLabels = np.asmatrix(trainLabelSet).T
    # Only samples with alpha > 0 contribute to the decision function.
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = svSource[svInd]
    # alpha_i * y_i of every support vector, computed once outside the loop.
    svWeights = np.multiply(svLabels[svInd], alphas[svInd])
    m = np.shape(datMat)[0]
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        # f(x) = sum_sv(alpha * y * k(sv, x)) + b, a 1 x 1 matrix.
        predict = kernelEval.T * svWeights + b
        if np.sign(predict[0, 0]) != np.sign(labelSet[i]):
            errorCount += 1
    errorRate = float(errorCount) / m
    print ("the test error rate is: %f" % errorRate)
    return errorRate
    
    

SMO算法实现
# -*- coding: utf-8 -*-
"""
Created on Wed Sep  5 15:22:26 2018

@author: weixw
"""


import mySVMMLiA as sm

def main():
    """Train on trainingData.txt, plot the result and report both error rates."""
    # Fit b and alphas on the training data (RBF kernel, width 0.10).
    dataArr,labelArr = sm.loadDataSet('trainingData.txt')
    b, alphas = sm.smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', 0.10))
    sm.drawDataMap(dataArr,labelArr,b,alphas)
    sm.getTrainingDataResult(dataArr, labelArr, b, alphas, 0.10)
    # Evaluate on the test data.
    dataArr1,labelArr1 = sm.loadDataSet('testData.txt')
    sm.getTestDataResult(dataArr1, labelArr1, b, alphas, 0.10)


# Run only when executed as a script, so importing this file has no side effects.
if __name__ == "__main__":
    main()

测试代码

 

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!