K近邻算法(KNeighborsClassifier/KNN):原理为 欧几里得距离+最近邻+投票(权重)+概率
根据距离的远近进行分类
欧几里得距离:多维空间中各点之间的距离
缺点:时间复杂度和空间复杂度较大
注意:当训练样本数据少的时候,样本比例一定要相同
KNN算法分类电影
import numpy
import pandas  # used here to load the Excel workbook
from sklearn.neighbors import KNeighborsClassifier  # classical ML library (no deep-learning algorithms)

# Load the first worksheet of the movie data set into a DataFrame.
movie = pandas.read_excel(
    r"D:\Python\代码\Machine-Learn\1-KNN\data\movie.xlsx",
    sheet_name=0,
)
movie  # notebook cell output: display the loaded table
 | 电影名称 | 武打镜头 | 接吻镜头 | 分类情况 |
---|---|---|---|---|
0 | 大话西游 | 36 | 1 | 动作片 |
1 | 杀破狼 | 43 | 2 | 动作片 |
2 | 前任3 | 0 | 10 | 爱情片 |
3 | 战狼2 | 59 | 1 | 动作片 |
4 | 泰坦尼克号 | 1 | 15 | 爱情片 |
5 | 新余心愿 | 2 | 19 | 爱情片 |
# Reload the movie table so this cell can run on its own.
movie = pandas.read_excel(
    r"D:\Python\代码\Machine-Learn\1-KNN\data\movie.xlsx",
    sheet_name=0,
)

# Features: the two scene counts.  Target: the genre label.
x = movie[["武打镜头", "接吻镜头"]]
y = movie["分类情况"]

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(x, y)

# Two query samples; the first one (50 fight scenes, 2 kiss scenes) is the
# movie "飞车" mentioned in the original note.
x_text = pandas.DataFrame(
    {
        "武打镜头": [50, 3],
        "接吻镜头": [2, 50],
    }
)
x_text  # notebook cell output: display the query samples
 | 武打镜头 | 接吻镜头 |
---|---|---|
0 | 50 | 2 |
1 | 3 | 50 |
# Per-class vote share for every query row, then the winning label per row.
get_proba = knn.predict_proba(x_text)
get_result = knn.predict(x_text)

print("概率:{}".format(get_proba))
print("分类结果:{}".format(get_result))
概率:[[0.6 0.4] [0.4 0.6]] 分类结果:['动作片' '爱情片']
电影分类运行原理
# Manual walk-through of the classifier for the query (50 fight scenes, 2 kiss scenes).
fight_delta = movie["武打镜头"] - 50
kiss_delta = movie["接吻镜头"] - 2

# Euclidean distance from every training movie to the query point.
s = (fight_delta ** 2 + kiss_delta ** 2) ** 0.5

# Row labels ordered from the nearest movie to the farthest.
index = s.sort_values().index

# Genres of the five nearest movies (these are the votes).
fljg = movie["分类情况"][index[:5]]

print("\n{}".format(index), "\n分类:\n{}".format(fljg))
Int64Index([1, 3, 0, 2, 4, 5], dtype='int64') 分类: 1 动作片 3 动作片 0 动作片 2 爱情片 4 爱情片 Name: 分类情况, dtype: object
识别梵文
import numpy
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Load one sample image from the test set and preview it in grayscale.
img = plt.imread(
    r"D:\Python\代码\Machine-Learn\1-KNN\data\手写字母测试与训练\梵文识别学习\Test\character_1_ka\1339.png"
)
plt.imshow(img, cmap=plt.cm.gray)
<matplotlib.image.AxesImage at 0x1af31dcc048>
def img_read(dir_name, data):
    """Read every image file in ``dir_name`` and append it to ``data``.

    ``data`` is mutated in place; nothing is returned.
    """
    for filename in os.listdir(dir_name):
        img = plt.imread(os.path.join(dir_name, filename))
        data.append(img)


def readTain():
    """Return the training images as a list of arrays.

    Only the class folders from position 36 onwards are loaded.
    NOTE(review): presumably these are the ten digit folders that follow the
    36 character folders -- confirm against the data set layout.
    """
    data = []
    dir_path = r"D:\Python\代码\Machine-Learn\1-KNN\data\手写字母测试与训练\梵文识别学习\Train"
    # os.listdir() returns names in arbitrary order; sort them so the slice
    # always selects the same folders and the images arrive grouped in a
    # stable class order (the label arrays built later rely on that order).
    for dir_name in sorted(os.listdir(dir_path))[36:]:
        img_read(os.path.join(dir_path, dir_name), data)
    return data


def readTest():
    """Return the test images as a list of arrays (same folder slice as readTain)."""
    data_test = []
    dir_test_path = r"D:\Python\代码\Machine-Learn\1-KNN\data\手写字母测试与训练\梵文识别学习\Test"
    # Sorted for the same reason as in readTain().
    for dir_name in sorted(os.listdir(dir_test_path))[36:]:
        img_read(os.path.join(dir_test_path, dir_name), data_test)
    return data_test


x = readTain()  # training data
train_x = numpy.array(x)

# Summarise the training array.  The original read ``test_x`` here, which is
# not defined until a later cell and describes the wrong data set.
# NOTE(review): axis 1 is labelled width and axis 2 height, as in the
# original; confirm the axis order if the images are ever non-square.
train_x_shape = {
    "图片数量": train_x.shape[0],
    "宽度": str(train_x.shape[1]) + "px",
    "高度": str(train_x.shape[2]) + "px",
}
train_x_shape  # notebook cell output
{'图片数量': 3000, '宽度': '32px', '高度': '32px'}
x = readTest()  # test data
ndarray_x = numpy.array(x)

# Randomly draw 1000 test images.  The upper bound is the number of images
# actually loaded rather than a hard-coded 3000.  randint samples with
# replacement, so the same image may be picked more than once.
index = numpy.random.randint(0, ndarray_x.shape[0], size=1000)
test_x = ndarray_x[index]

# Summarise the sampled array.
test_x_shape = {
    "图片数量": test_x.shape[0],
    "宽度": str(test_x.shape[1]) + "px",
    "高度": str(test_x.shape[2]) + "px",
}
test_x_shape  # notebook cell output
{'图片数量': 1000, '宽度': '32px', '高度': '32px'}
# Labels for the full test set: 300 consecutive entries for each digit 0-9,
# built directly in class order instead of sorting a repeated Python list.
test_y = numpy.repeat(numpy.arange(10), 300)

# Keep only the labels of the randomly sampled images (same ``index`` that
# was used to pick ``test_x``).
test_y = test_y[index]
test_y  # notebook cell output
array([0, 9, 8, 0, 3, 3, 0, 6, 6, 2, 1, 0, 2, 9, 0, 5, 5, 1, 7, 3, 1, 9, 7, 3, 0, 8, 8, 4, 0, 5, 7, 7, 4, 3, 3, 1, 8, 2, 6, 1, 5, 0, 8, 6, 0, 2, 7, 4, 3, 1, 9, 8, 9, 4, 2, 7, 5, 3, 0, 5, 9, 4, 1, 8, 5, 7, 6, 5, 0, 9, 9, 1, 4, 9, 9, 5, 2, 6, 4, 6, 2, 2, 2, 6, 7, 7, 4, 3, 8, 7, 2, 5, 4, 2, 6, 0, 9, 9, 5, 8, 4, 3, 7, 5, 0, 1, 5, 7, 1, 3, 3, 9, 5, 8, 6, 6, 7, 5, 6, 5, 1, 6, 0, 3, 6, 3, 5, 3, 4, 5, 8, 9, 7, 2, 3, 9, 5, 6, 6, 0, 3, 2, 3, 5, 8, 8, 8, 2, 3, 0, 7, 9, 6, 0, 9, 8, 8, 6, 6, 6, 9, 2, 8, 6, 6, 7, 4, 6, 1, 7, 2, 4, 2, 6, 6, 7, 9, 4, 9, 0, 7, 6, 6, 7, 9, 9, 5, 3, 1, 1, 8, 1, 0, 6, 6, 3, 5, 4, 7, 3, 3, 5, 0, 3, 1, 9, 2, 9, 7, 0, 6, 1, 2, 6, 4, 2, 3, 0, 4, 3, 4, 9, 2, 6, 8, 4, 2, 1, 5, 1, 0, 7, 9, 2, 4, 8, 4, 4, 5, 0, 4, 1, 1, 5, 0, 4, 4, 7, 4, 1, 2, 1, 0, 1, 2, 5, 6, 6, 1, 7, 6, 7, 6, 5, 0, 2, 4, 8, 7, 7, 9, 8, 1, 7, 9, 8, 5, 0, 2, 9, 7, 8, 2, 0, 5, 4, 3, 3, 6, 1, 4, 5, 9, 9, 5, 4, 0, 9, 9, 4, 3, 9, 8, 2, 3, 5, 6, 4, 8, 5, 0, 2, 6, 5, 5, 7, 2, 1, 8, 6, 4, 7, 9, 7, 2, 6, 4, 4, 3, 9, 5, 4, 4, 0, 5, 1, 5, 8, 9, 6, 5, 3, 2, 3, 4, 1, 6, 0, 0, 8, 1, 3, 0, 4, 0, 6, 5, 9, 0, 8, 7, 5, 4, 2, 0, 3, 8, 4, 3, 2, 0, 5, 0, 8, 3, 1, 2, 5, 6, 3, 6, 0, 5, 9, 9, 8, 2, 3, 2, 1, 4, 6, 1, 7, 9, 2, 1, 5, 4, 1, 3, 3, 9, 5, 1, 4, 0, 1, 7, 2, 4, 3, 4, 0, 0, 0, 3, 5, 0, 4, 3, 5, 3, 0, 7, 7, 5, 1, 7, 2, 5, 8, 0, 0, 5, 1, 9, 5, 8, 8, 5, 4, 9, 7, 4, 2, 9, 2, 9, 5, 8, 8, 4, 9, 7, 1, 5, 1, 1, 0, 6, 9, 1, 6, 3, 3, 7, 1, 6, 0, 7, 8, 7, 3, 6, 7, 9, 1, 1, 1, 8, 8, 8, 9, 1, 4, 5, 1, 0, 7, 3, 2, 9, 3, 7, 7, 1, 7, 6, 8, 3, 8, 3, 0, 4, 3, 1, 0, 3, 3, 2, 5, 6, 6, 6, 2, 9, 4, 6, 3, 7, 6, 1, 8, 8, 4, 2, 6, 3, 7, 8, 0, 6, 4, 4, 9, 9, 2, 3, 5, 9, 2, 1, 4, 3, 9, 5, 8, 9, 5, 5, 2, 2, 7, 4, 5, 4, 6, 4, 0, 5, 9, 6, 4, 6, 9, 2, 0, 4, 6, 6, 7, 5, 8, 8, 8, 5, 8, 9, 0, 0, 3, 2, 7, 7, 3, 3, 4, 5, 2, 3, 3, 1, 0, 9, 1, 1, 8, 1, 3, 9, 8, 7, 1, 6, 9, 1, 7, 8, 4, 9, 5, 6, 4, 2, 3, 8, 3, 4, 8, 3, 8, 7, 5, 5, 0, 6, 2, 9, 8, 6, 6, 6, 5, 2, 9, 0, 1, 8, 1, 2, 6, 6, 6, 5, 4, 3, 2, 0, 6, 6, 3, 4, 5, 3, 8, 6, 4, 5, 4, 7, 6, 5, 4, 
5, 8, 9, 4, 5, 5, 2, 5, 1, 5, 6, 4, 4, 1, 4, 1, 9, 8, 8, 7, 1, 9, 3, 5, 2, 5, 8, 0, 2, 7, 2, 2, 7, 5, 8, 0, 6, 0, 7, 0, 4, 2, 8, 6, 3, 3, 3, 8, 3, 6, 7, 5, 3, 9, 3, 8, 5, 8, 6, 2, 2, 0, 1, 9, 2, 6, 2, 8, 6, 0, 7, 0, 3, 4, 9, 4, 1, 2, 2, 3, 5, 5, 7, 9, 9, 7, 0, 6, 5, 8, 3, 1, 6, 8, 4, 1, 6, 7, 3, 9, 5, 1, 4, 5, 7, 1, 0, 5, 9, 4, 9, 5, 3, 6, 2, 2, 2, 3, 9, 0, 0, 2, 3, 2, 9, 9, 1, 8, 4, 7, 1, 1, 2, 4, 3, 4, 9, 7, 4, 7, 8, 6, 0, 4, 8, 7, 0, 6, 0, 5, 4, 0, 9, 7, 2, 9, 4, 0, 3, 0, 8, 4, 3, 5, 4, 5, 2, 2, 2, 7, 9, 0, 7, 2, 1, 5, 3, 6, 5, 3, 3, 1, 3, 4, 6, 4, 1, 5, 7, 7, 0, 7, 0, 3, 1, 2, 2, 3, 6, 1, 8, 3, 9, 5, 9, 7, 7, 8, 4, 3, 0, 1, 5, 1, 7, 5, 8, 5, 8, 5, 1, 7, 4, 8, 0, 2, 8, 8, 3, 2, 8, 6, 2, 1, 0, 2, 7, 3, 4, 2, 6, 3, 3, 9, 9, 1, 8, 9, 7, 4, 9, 8, 4, 4, 7, 0, 7, 0, 2, 0, 0, 2, 8, 7, 3, 6, 6, 2, 4, 2, 0, 4, 9, 0, 4, 3, 7, 5, 7, 7, 2, 6, 9, 3, 1, 0, 4, 1, 7, 8, 4, 5, 1, 4, 1, 0, 9, 3, 9, 3, 7, 1, 9, 2, 0, 2, 5, 2, 9, 1, 6, 0, 2, 1, 8, 5, 0, 1, 8, 2, 0, 0, 8, 3, 1, 1, 9, 5, 9, 7, 5, 6, 5, 7, 1, 1])
# Labels for the training set: 1700 consecutive entries for each digit 0-9,
# built directly in class order instead of sorting a repeated Python list.
train_y = numpy.repeat(numpy.arange(10), 1700)
train_y  # notebook cell output
array([0, 0, 0, ..., 9, 9, 9])
# fit() does not accept data with more than two dimensions, so each image is
# flattened into one row.  reshape() returns a new array and leaves train_x /
# test_x unchanged; the results are not stored here, which is why later cells
# call reshape() again at the point of use.
train_x.reshape((17000, 1024))
test_x.reshape((1000, 1024))  # notebook cell output: preview of the flattened test data
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
# %%time
# Train a 5-nearest-neighbour classifier on the flattened training images.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=train_x.reshape((17000, -1)), y=train_y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')
# %%time
# Predict a label for every sampled test image.
y_result = knn.predict(test_x.reshape((1000, 1024)))

# Show the same 200-sample window of predictions and ground truth for comparison.
predicted_part = "预测结果:\n{}".format(y_result[500:700])
actual_part = "\n实际结果:\n{}".format(test_y[500:700])
print(predicted_part, actual_part)
预测结果: [6 3 2 8 5 0 2 8 4 3 7 2 7 7 9 1 5 3 4 0 9 5 2 0 5 2 0 0 0 8 0 9 0 4 9 9 4 1 3 6 0 8 6 4 6 8 2 0 7 3 2 5 6 1 4 7 7 4 5 9 7 9 0 7 0 2 1 8 7 5 4 9 2 4 7 9 8 2 6 7 3 1 6 9 6 8 7 0 1 0 2 2 0 3 3 0 5 9 5 2 2 8 2 9 7 9 8 3 9 8 9 0 7 4 2 4 9 0 3 4 3 8 6 2 2 9 5 3 1 8 2 5 1 3 7 2 7 3 2 8 1 3 5 2 1 7 9 4 4 6 9 2 9 8 9 4 5 2 2 9 1 4 9 1 9 4 1 7 2 1 2 0 3 1 8 3 5 9 0 8 3 6 6 8 1 6 1 2 0 0 0 2 1 0 5 7 9 2 7 9] 实际结果: [6 3 2 8 5 0 2 8 4 3 7 2 7 7 9 1 5 3 4 6 9 5 2 0 3 2 0 0 0 8 0 9 0 4 9 9 4 1 3 6 0 8 6 4 6 8 2 0 7 3 2 5 6 1 4 7 7 4 5 9 7 9 0 7 0 2 1 8 7 5 4 9 2 4 7 9 8 2 6 7 3 1 6 9 6 8 7 0 1 0 2 2 0 3 3 0 5 9 5 2 2 8 2 9 7 9 8 3 9 8 9 0 7 4 2 4 9 0 3 4 3 8 6 2 2 9 3 3 1 8 2 5 1 3 7 2 7 3 2 8 1 3 5 2 1 7 9 4 4 6 9 2 9 8 9 4 5 5 2 9 1 4 9 1 9 4 1 7 2 1 2 0 3 1 8 3 5 9 0 8 3 6 6 8 1 6 1 2 0 0 0 2 1 0 5 7 9 2 7 9]
# Accuracy: the fraction of samples whose prediction equals the true label.
is_correct = test_y == y_result
acc = is_correct.mean()
print("准确率为:{}".format(acc))
准确率为:0.984
提高准确率
# Try a larger neighbourhood (k=10).
# NOTE(review): more neighbours do not guarantee better accuracy; the run
# recorded in this document scored 0.974 here versus 0.984 with k=5.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X=train_x.reshape((17000, -1)), y=train_y)

# score() predicts on the given samples and returns the accuracy in one call.
knn.score(X=test_x.reshape((1000, 1024)), y=test_y)
0.974
# Weight each neighbour's vote by distance: closer neighbours count more,
# farther ones less.  This sometimes improves accuracy and sometimes does not.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn.fit(X=train_x.reshape((17000, -1)), y=train_y)

# score() predicts on the given samples and returns the accuracy in one call.
knn.score(X=test_x.reshape((1000, 1024)), y=test_y)
0.981
# n_jobs=-1 starts as many parallel workers as the CPU has cores, to speed
# up the run.
# NOTE(review): the original note says p=1 (Manhattan distance) is used, but
# ``p`` is not passed here, so the estimator keeps its default metric (the
# estimator repr printed earlier in this document shows p=2).  Pass p=1
# explicitly to actually switch to Manhattan distance.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance", n_jobs=-1)
knn.fit(X=train_x.reshape((17000, -1)), y=train_y)

# score() predicts on the given samples and returns the accuracy in one call.
knn.score(X=test_x.reshape((1000, 1024)), y=test_y)
0.981
来源:https://www.cnblogs.com/lq13035130506/p/12535569.html