# Import the necessary packages
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
Building a KNN classifier
In [6]:
def classify0_1(train, test, k):  # train: training set; test: test set; k: number of neighbors
    n = train.shape[1] - 1
    m = test.shape[0]
    result = []
    for i in range(m):
        # Broadcast to get the distance from this test row to every training row,
        # as a Series converted to a list
        dist = list(((train.iloc[:, :n] - test.iloc[i, :n]) ** 2).sum(1))
        # Put the distances next to the label column in a DataFrame
        dist_l = pd.DataFrame({'dist': dist, 'labels': (train.iloc[:, n])})
        # Sort by distance (ascending by default) and keep the first k rows
        dr = dist_l.sort_values(by='dist')[:k]
        # Count the labels among those k neighbors
        re = dr.loc[:, 'labels'].value_counts()
        # The most frequent label is the prediction
        result.append(re.index[0])
    result = pd.Series(result)
    # Attach the predictions to the test set as a new column
    test['predict'] = result
    return test
In [2]:
# Check that the classifier works
def createDataSet():  # build a small toy data set
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
In [7]:
group, labels = createDataSet()
# vstack concatenates arrays vertically; hstack would concatenate horizontally
train = np.vstack([group, [0, 0]])
labels.append('B')
In [8]:
train
Out[8]:
In [10]:
labels
Out[10]:
In [11]:
# Turn the arrays into a DataFrame
train = pd.DataFrame({'x1': train[:, 0], 'x2': train[:, 1], 'labels': labels})
# The column order built from a dict is not guaranteed, so fix it with reindex
train = train.reindex(['x1', 'x2', 'labels'], axis=1)
In [12]:
# Build the test set
p1 = [1, 2]
p2 = [0, 1]
test = pd.DataFrame({'x1': p1, 'x2': p2})
In [16]:
test  # note: when building a DataFrame, each value list is treated as a column vector
Out[16]:
In [18]:
result = classify0_1(train, test, 3)  # the classifier runs correctly
result
Out[18]:
Let's refine the model further
Visualization
In [19]:
# Rename the columns so they match the training set
result.columns = ['x1', 'x2', 'labels']
result
Out[19]:
In [20]:
# Merge the training set with the classified test set
input = pd.concat([train, result], ignore_index=True)  # ignore_index=True rebuilds the index from scratch
input
Out[20]:
In [21]:
# Add two columns for plotting: Ind1 encodes the label, Ind2 separates test rows from training rows
input['Ind1'] = 1
for i in range(input.shape[0]):
    if input.iloc[i, 2] == 'B':
        input.iloc[i, 3] = 0
input['Ind2'] = [1, 1, 1, 1, 1, 0.5, 0.5]
input
Out[21]:
In [22]:
# Note: scatter can style each point individually -- s controls size, c controls color
plt.scatter(input.iloc[:, 0], input.iloc[:, 1], s=200 * input.iloc[:, 4], c=input.iloc[:, 3])
Out[22]:
Running the algorithm on the iris data
In [40]:
iris = pd.read_csv("iris (1).txt", header=None)  # header=None: the first row is data, not column names
iris.columns = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm', 'class']
iris.head()
Out[40]:
In [41]:
iris.shape
Out[41]:
Splitting training and test sets by hand
In [42]:
# Split the data into training and test sets by hand
import random
def randSplit(dataSet, rate):  # dataSet: the data; rate: the fraction drawn for training
    # Shuffle the index
    l = list(dataSet.index)
    random.shuffle(l)
    # Assign the shuffled index back to the data
    dataSet.index = l
    n = dataSet.shape[0]
    m = int(n * rate)
    # Draw the training and test portions
    train = dataSet.loc[range(m), :]
    test = dataSet.loc[range(m, n), :]
    # Restore the indexes
    dataSet.index = range(dataSet.shape[0])
    test.index = range(test.shape[0])
    return train, test
In [43]:
train, test = randSplit(iris, 0.5)
In [45]:
train.shape
Out[45]:
In [46]:
test.shape
Out[46]:
In [48]:
classify0_1(train, test, 3)  # the iris data is well-behaved, so the results are nearly perfect
| | sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class | predict |
|---|---|---|---|---|---|---|
| 0 | 6.4 | 3.2 | 5.3 | 2.3 | Iris-virginica | |
| 1 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor | Iris-versicolor |
| 2 | 5.1 | 3.8 | 1.9 | 0.4 | Iris-setosa | Iris-setosa |
| 3 | 5.7 | 3.0 | 4.2 | 1.2 | Iris-versicolor | Iris-versicolor |
| 4 | 7.7 | 3.8 | 6.7 | 2.2 | Iris-virginica | Iris-virginica |
| 5 | 4.6 | 3.2 | 1.4 | 0.2 | Iris-setosa | Iris-setosa |
| 6 | 5.4 | 3.0 | 4.5 | 1.5 | Iris-versicolor | Iris-versicolor |
| 7 | 5.0 | 3.5 | 1.6 | 0.6 | Iris-setosa | Iris-setosa |
| 8 | 6.7 | 3.0 | 5.0 | 1.7 | Iris-versicolor | Iris-virginica |
| 9 | 5.2 | 4.1 | 1.5 | 0.1 | Iris-setosa | Iris-setosa |
| 10 | 5.2 | 3.5 | 1.5 | 0.2 | Iris-setosa | Iris-setosa |
| 11 | 5.1 | 3.8 | 1.6 | 0.2 | Iris-setosa | Iris-setosa |
| 12 | 7.6 | 3.0 | 6.6 | 2.1 | Iris-virginica | Iris-virginica |
| 13 | 4.4 | 3.0 | 1.3 | 0.2 | Iris-setosa | Iris-setosa |
| 14 | 4.4 | 3.2 | 1.3 | 0.2 | Iris-setosa | Iris-setosa |
| 15 | 7.3 | 2.9 | 6.3 | 1.8 | Iris-virginica | Iris-virginica |
| ... | | | | | | |
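To put a number on "nearly perfect" rather than eyeballing the table, a one-line check suffices (a minimal sketch; `result_iris` is an assumed name, and only columns already present in the returned frame are used):

result_iris = classify0_1(train, test, 3)
print((result_iris['class'] == result_iris['predict']).mean())  # fraction of test rows predicted correctly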
Testing on a less well-behaved data set
In [90]:
date = pd.read_table("datingTestSet.txt", header=None)
date.head()
Out[90]:
The features are on very different scales, so the data needs rescaling; let's first write the normalization functions.
In [52]:
# Min-max (0-1) scaling
def MaxMinNormalization(dataSet):
    maxDf = dataSet.max()
    minDf = dataSet.min()
    normSet = (dataSet - minDf) / (maxDf - minDf)
    return normSet
In [54]:
# Z-score scaling
def Z_ScoreNormalization(dataSet):
    stdDf = dataSet.std()
    meanDf = dataSet.mean()
    normSet = (dataSet - meanDf) / stdDf
    return normSet
# The difference between the two: min-max scaling is easily distorted by extreme values,
# which may need to be removed first; Z-score scaling is less sensitive to outliers
# but slightly more expensive to compute.
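To see that difference concretely, here is a minimal sketch on an assumed toy Series containing one extreme value:

s = pd.Series([1, 2, 3, 4, 100])  # 100 is an outlier
print(MaxMinNormalization(s))     # 1..4 are squeezed into [0, ~0.03]: the outlier sets the whole scale
print(Z_ScoreNormalization(s))    # 1..4 spread over ~0.07 standard deviations: still compressed, but less so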
In [67]:
# Sigmoid squashing
def sigmoidNormalization(dataSet):
    normSet = 1 / (1 + np.exp(-dataSet))
    return normSet
In [150]:
# Normalize the first three columns, then re-attach the label column
date = pd.concat([MaxMinNormalization(date.iloc[:, :3]),
                  date.iloc[:, 3]], axis=1)
date.head()
Out[150]:
In [65]:
# Split into training and test sets at an 8:2 ratio
date_train, date_test = randSplit(date, 0.8)
In [88]:
# Classify with K = 2
result = classify0_1(date_train, date_test, 2)
result.head()
Out[88]:
Now let's assess the model's performance
In [83]:
# First wrap up an accuracy calculation
def accuracyCalculation(dataSet):
    m = dataSet.shape[0]
    # Compare the prediction column with the true-label column and count the True rows
    res = (dataSet.iloc[:, -1] == dataSet.iloc[:, -2]).value_counts()
    acc = res.loc[True] / m
    print("Model accuracy is: {}".format(acc))
    return acc
In [84]:
accuracyCalculation(result)  # accuracy is 96.5%
Out[84]:
A confusion matrix for the binary classification problem
In [101]:
# dataSet: the data; pos: the label treated as positive; neg: the label treated as negative.
# pos and neg must be specified by the caller.
def confusionMatrix(dataSet, pos, neg):
    TP = dataSet.loc[(dataSet.iloc[:, -1] == pos) & (dataSet.iloc[:, -2] == pos), ].shape[0]
    FN = dataSet.loc[(dataSet.iloc[:, -1] == neg) & (dataSet.iloc[:, -2] == pos), ].shape[0]
    TN = dataSet.loc[(dataSet.iloc[:, -1] == neg) & (dataSet.iloc[:, -2] == neg), ].shape[0]
    FP = dataSet.loc[(dataSet.iloc[:, -1] == pos) & (dataSet.iloc[:, -2] == neg), ].shape[0]
    dataSet_ac = (TP + TN) / (TP + TN + FP + FN)
    dataSet_pr = TP / (TP + FP)
    dataSet_re = TP / (TP + FN)
    dataSet_sp = TN / (TN + FP)
    dataSet_F = 2 * dataSet_pr * dataSet_re / (dataSet_pr + dataSet_re)
    print("Model accuracy: {}".format(dataSet_ac))
    print("Model precision: {}".format(dataSet_pr))
    print("Model recall: {}".format(dataSet_re))
    print("Model specificity: {}".format(dataSet_sp))
    print("Model F1 score: {}".format(dataSet_F))
    return [dataSet_ac, dataSet_pr, dataSet_re, dataSet_sp, dataSet_F]
In [97]:
# To exercise the confusion matrix, pull out the rows belonging to just two classes
cla = (date.iloc[:, 3] == 'smallDoses') | (date.iloc[:, 3] == 'didntLike')
dating_part = date.loc[cla, ]
dating_part.iloc[:, -1].value_counts()
dating_part.index = range(dating_part.shape[0])
dating_part.head()
Out[97]:
In [98]:
date_train, date_test = randSplit(dating_part, 0.8)
In [99]:
result = classify0_1(date_train, date_test, 2)
result.head()
Out[99]:
In [102]:
confusionMatrix(result, "smallDoses", "didntLike")
Out[102]:
After computing distances, the KNN model so far gives every neighbor exactly one vote. Next we add weights so that closer points count for more, using the penalty factor w = 1 / d(x', x)².
In [105]:
def classify0_2(train, test, k):
    n = train.shape[1] - 1
    m = test.shape[0]
    result = []
    for i in range(m):
        dist = list(((train.iloc[:, :n] - test.iloc[i, :n]) ** 2).sum(1))
        dist_l = pd.DataFrame({'dist': dist, 'labels': (train.iloc[:, n])})
        dr = dist_l.sort_values(by='dist')[:k]
        # dist already holds squared distances, so 1/dist is the weight w = 1/d**2
        dr['re'] = 1 / dr.iloc[:, 0]
        # Sum the weights per label and take the label with the largest total
        re = dr.groupby('labels').sum()
        re = re.sort_values(by='re', ascending=False)  # sort_values returns a copy, so reassign it
        result.append(re.index[0])
    result = pd.Series(result)
    test['predict'] = result
    return test
In [115]:
date_train, date_test = randSplit(date, 0.8)
In [119]:
result1 = classify0_1(date_train, date_test, 3)
result2 = classify0_2(date_train, date_test, 3)
In [117]:
accuracyCalculation(result1)
Out[117]:
In [118]:
accuracyCalculation(result2)
Out[118]:
Building a learning curve over K
In [122]:
def kLearningCurve(classify, train, test, k):
    """
    Parameters:
    classify: the classifier to use, classify0_1 or classify0_2
    train: the training set
    test: the test set
    k: K values from 1 to k are tried, both ends inclusive
    accuracyCalculation: the accuracy function defined earlier
    """
    yAc = []
    for i in range(k):
        yAc.append(accuracyCalculation(classify(train, test, i + 1)))
    plt.plot(range(1, k + 1), yAc, '-o', color='black')
    return yAc
In [124]:
date_train, date_test = randSplit(date, 0.8)
In [125]:
kLearningCurve(classify0_1, date_train, date_test, 10)
Out[125]:
In [126]:
kLearningCurve(classify0_2, date_train, date_test, 10)
Out[126]:
With the penalty factor added, the model behaves noticeably more stably. Next, let's add cross-validation and see the effect.
In [129]:
# First split the data into n random, roughly equal parts by shuffling the index
def randSplit_1(dataSet, n):
    """
    Parameters:
    dataSet: the data set
    n: the number of parts to split into
    """
    l = list(dataSet.index)
    random.shuffle(l)
    dataSet.index = l
    m = dataSet.shape[0]
    splitSet = []
    k = m / n
    for i in range(n):
        if i < (n - 1):
            splitSet.append(dataSet.loc[range(i * int(k), (i + 1) * int(k)), :])
        else:
            splitSet.append(dataSet.loc[range(i * int(k), m), :])
    dataSet.index = range(dataSet.shape[0])
    return splitSet
In [130]:
# Try it out
sp = randSplit_1(date, 10)
In [132]:
sp[2].shape
Out[132]:
In [135]:
sp[0].head()
Out[135]:
On top of this, build a complete cross-validation function
In [137]:
def crossVali(dataSet, randSplit, classify, n, k):
    """
    Cross-validation parameters:
    dataSet: the data set to classify
    randSplit: the random splitting function defined above
    classify: the custom KNN classifier
    n: the number of equal parts (folds)
    k: the number of nearest neighbors for KNN
    accuracyCalculation: the accuracy function defined earlier
    """
    sp = randSplit(dataSet, n)  # split the data into n folds
    result = np.array([])
    for i in range(n):
        test = sp[0]  # take the first fold as the test set
        del sp[0]  # remove it from the list
        train = pd.concat(sp)  # concatenate the remaining folds into the training set
        train.index = range(train.shape[0])  # reset the indexes
        test.index = range(test.shape[0])
        test = classify(train, test, k)  # classify
        result = np.append(result, accuracyCalculation(test))  # record the accuracy
        test = pd.DataFrame(test.drop(['predict'], axis=1))  # drop the prediction column
        sp.append(test)  # once used, put the fold back at the end of the list
    return result, result.mean(), result.var()
In [139]:
crossVali(date, randSplit_1, classify0_1, 10, 3)
Out[139]:
This shows that how the data is split into training and test sets does affect the model
Next we nest the cross-validation function inside the K learning curve, using the cross-validated mean to correct the curve's accuracy estimates when choosing K
In [141]:
def kLearningCurve_1(dataSet, classify, n, k):
    """
    K learning curve parameters:
    dataSet: the data set
    classify: the classification function to use
    n: the number of folds for cross-validation
    k: K values from 1 to k are tried, both ends inclusive
    """
    yAc_mean = []
    yAc_up = []
    yAc_down = []
    for i in range(k):
        result_cv, result_mean, result_var = crossVali(dataSet, randSplit_1, classify, n, i + 1)
        yAc_mean.append(result_mean)
        yAc_up.append(result_mean + result_var)
        yAc_down.append(result_mean - result_var)
    plt.plot(range(1, k + 1), yAc_mean, '-o', color='black')
    plt.plot(range(1, k + 1), yAc_up, '--o', color='red')
    plt.plot(range(1, k + 1), yAc_down, '--o', color='red')
    return yAc_mean, yAc_up, yAc_down
# The mean reflects the central tendency and the variance the spread;
# when the variance is large, the mean carries less weight.
In [142]:
kLearningCurve_1(date, classify0_1, 10, 10)
Out[142]:
When choosing K, pick a point at the elbow of the curve whose variance is also small.
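One hedged way to turn that rule of thumb into code (the selection heuristic below is an assumption, not part of the original notebook): among the K whose cross-validated mean accuracy is close to the best, take the one with the smallest variance.

yAc_mean, yAc_up, yAc_down = kLearningCurve_1(date, classify0_1, 10, 10)
variances = [up - mean for up, mean in zip(yAc_up, yAc_mean)]  # yAc_up = mean + var, so this recovers the variances
best = max(yAc_mean)
candidates = [i for i, m in enumerate(yAc_mean) if best - m < 0.01]  # within 1% of the best mean (assumed tolerance)
k_best = min(candidates, key=lambda i: variances[i]) + 1  # +1 because K runs from 1
print(k_best)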
In [151]:
import time
%time kLearningCurve_1(date, classify0_2, 10, 10)
Out[151]:
In [149]:
date.iloc[:, 3].value_counts()
Out[149]:
KNN with sklearn
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
group, labels = createDataSet()  # rebuild the toy data: labels gained an extra 'B' earlier, so its length no longer matched group
k = 3
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(group, labels)
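Once fitted, predictions come from clf.predict (the query point below is made up for illustration). Note that sklearn's weights='distance' option plays the same role as the penalty factor we built into classify0_2.

clf.predict(np.array([[1.0, 2.1]]))  # predicted label for a new point
clf_w = KNeighborsClassifier(n_neighbors=k, weights='distance')  # distance-weighted voting, like classify0_2
clf_w.fit(group, labels)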