scikit-learn KNN模型选择

。_饼干妹妹 提交于 2020-01-17 07:56:09
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# With an integer cv (e.g. cv=6) the data is cut into 6 parts; how the parts
# are produced is what the splitter classes imported below are for.
from sklearn.model_selection import cross_val_score,GridSearchCV
# KFold and StratifiedKFold both split the data into a number of folds;
# StratifiedKFold does the split proportionally to the class labels.
# (The original import line had the Chinese annotations fused into the
# imported names, which is a syntax error.)
from sklearn.model_selection import KFold,StratifiedKFold

# Toy data set: 8 samples, 2 random integer features in [0, 10), binary labels.
data = np.random.randint(0,10,size = (8,2))
target = np.array([0,0,1,0,1,1,1,0])
# NOTE(review): display() is not imported here; presumably this runs in an
# IPython/Jupyter session where it is available by default.
display(data,target)
#array([[8, 9],
#       [8, 5],
#       [0, 1],
#       [1, 9],
#       [8, 2],
#       [0, 1],
#       [7, 1],
#       [0, 6]])
#array([0, 0, 1, 0, 1, 1, 1, 0])

# Random split of features and labels; the result is the list
# [train features, test features, train labels, test labels]. In the sample
# output below, 2 of the 8 rows end up in the test part.
train_test_split(data,target)
#[array([[0, 1],
#        [8, 5],
#        [7, 1],
#        [8, 2],
#        [0, 1],
#        [0, 6]]), array([[8, 9],
#        [1, 9]]), array([1, 0, 1, 1, 1, 0]), array([0, 0])]

# Cut the 8 samples into 4 folds without looking at the class labels.
kFold = KFold(n_splits=4)
# split() is a generator that yields (train, test) index arrays; the indices
# are all that is needed to pull the matching entries out of data / target.
for train, test in kFold.split(data, target):
    train_labels = target[train]  # labels of the training part
    test_labels = target[test]    # labels of the held-out part
    print(train_labels, test_labels)
#[1 0 1 1 1 0] [0 0] #测试数据两个0,没有按比例分,数据量少的时候,数据预测不太好
#[0 0 1 1 1 0] [1 0]
#[0 0 1 0 1 0] [1 1]
#[0 0 1 0 1 1] [1 0]

# Split into 4 folds such that every fold keeps the same class proportions
# as the full label vector.
sKFold = StratifiedKFold(n_splits=4)
for train, test in sKFold.split(data, target):
    train_labels, test_labels = target[train], target[test]
    print(train_labels, test_labels)
#[0 0 1 1 1 0] [0 1]
#[0 1 0 1 1 0] [0 1]
#[0 0 1 1 1 0] [0 1]
#[0 0 1 0 1 1] [1 0]

# train_test_split splits at random; KFold and StratifiedKFold serve the same
# purpose of splitting the data.
# Load the salary table from a CSV-formatted text file (this rebinds `data`).
data = pd.read_csv('./salary.txt')
data.head()

在这里插入图片描述
最后一列是 salary,取值为 '<=50K' 或 '>50K'。

# Inspect the column names of the loaded table.
data.columns
#Index(['age', 'workclass', 'final_weight', 'education', 'education_num','marital_status', 'occupation', 'relationship', 'race', 'sex','capital_gain', 'capital_loss', 'hours_per_week', 'native_country','salary'],dtype='object')

# Remove four columns in place; 11 of the 15 columns remain.
data.drop(labels=['final_weight','education','capital_gain','capital_loss'],axis = 1,inplace=True)
data.shape
#(32561, 11)
data.head()

在这里插入图片描述

# Features are every column except the last one; the label is the salary column.
# .copy() makes X an independent frame, so the column assignments done on X
# later modify X itself and cannot trigger pandas' SettingWithCopyWarning.
X = data.iloc[:,0:-1].copy()
y = data['salary']

knn = KNeighborsClassifier()
# Fitting on the raw table fails: KNN cannot work with strings because no
# distance can be computed between them. The error is caught and reported so
# the demonstration does not abort the rest of the script.
try:
    knn.fit(X,y)
except ValueError as err:
    print('knn.fit failed on string features:', err)

# Strings have to be turned into int/float values before the algorithm can
# compute with them; Series.map (or apply / transform) can do the conversion.
u = X['workclass'].unique()
u
#array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov','Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],dtype=object)
# The position of a category inside u serves as its numeric code.
np.flatnonzero(u == 'Local-gov')[0]
#4

def convert(x):
    """Return the position of the first entry of the global array u equal to x."""
    matches = np.flatnonzero(u == x)
    return matches[0]
X['workclass'] = X['workclass'].map(convert)
X.head()

在这里插入图片描述

# Remaining string-valued feature columns to encode.
cols = [ 'marital_status', 'occupation', 'relationship', 'race', 'sex','native_country']
# pd.factorize gives every distinct value an integer code in order of first
# appearance, i.e. the same number as its position in Series.unique(). It does
# so in one vectorised pass, instead of redefining a helper inside the loop
# and running a linear search for every row.
# NOTE(review): missing values (NaN) would be coded as -1 rather than raising;
# the columns here are assumed to contain no NaN - confirm against the data.
for col in cols:
    X[col] = pd.factorize(X[col])[0]
X.head()

在这里插入图片描述

# Manual k-fold cross-validation of a KNN classifier with default settings.
n_splits = 10
kFold = KFold(n_splits)
knn = KNeighborsClassifier()
accuracy = 0
for train,test in kFold.split(X,y):
    # split() yields positional indices, so rows are selected with .iloc;
    # label-based .loc / y[...] only works while the index is 0..n-1.
    knn.fit(X.iloc[train],y.iloc[train])
    acc = knn.score(X.iloc[test],y.iloc[test])
    # Accumulate the mean accuracy over all folds.
    accuracy += acc/n_splits
print(accuracy)
#0.7973345728987424
X.head()

在这里插入图片描述

# Distinct values of the label column.
y.unique()
#array(['<=50K', '>50K'], dtype=object)
# Next step: look in sklearn.preprocessing for other ways to convert str values to int/float.

为 KNN 做特征编码:将 string 类型的数据转换为数字类型

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# Reload the raw table and drop, in place, the columns not used in the
# encoding demos.
salary = pd.read_csv('./salary.txt')
unused_columns = ['final_weight','education_num','capital_gain','capital_loss']
salary.drop(labels=unused_columns, axis=1, inplace=True)

# OrdinalEncoder is fitted on the whole frame in one call; the result is a
# NumPy array, so it is wrapped back into a DataFrame with the original
# column names for display.
ordinalEncoder = OrdinalEncoder()
data = ordinalEncoder.fit_transform(salary)
salary_ordinal = DataFrame(data=data, columns=salary.columns)
salary_ordinal.head()

在这里插入图片描述

# LabelEncoder (applied to a Series) and OrdinalEncoder (applied to a
# DataFrame) perform a similar encoding.
labelEncode = LabelEncoder()
salary_label = labelEncode.fit_transform(salary['salary'])
# Encode every column into a copy instead of overwriting `salary`: later cells
# read salary[['education']] and their recorded output lists the original
# string categories, so the raw values must survive this step.
salary_encoded = salary.copy()
for col in salary.columns:
    salary_encoded[col] = labelEncode.fit_transform(salary[col])
salary_encoded.head()

在这里插入图片描述

# OneHotEncoder needs 2-D input, so the column is selected with a list to get
# a one-column DataFrame. This has to run before edu is first used (the
# original order referenced edu before defining it).
edu = salary[['education']]
edu

# Number of distinct education values.
edu.drop_duplicates().count()
#education    16
#dtype: int64

onehotEncoder = OneHotEncoder()
onehot = onehotEncoder.fit_transform(edu)
onehot
#<32561x16 sparse matrix of type '<class 'numpy.float64'>'with 32561 stored elements in Compressed Sparse Row format>

edu

在这里插入图片描述

# Sorted list of the distinct education categories.
np.sort(edu.education.unique())
#array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th','Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad','Masters', 'Preschool', 'Prof-school', 'Some-college'],dtype=object)

# One-hot encoding: the encoded values are not affected by magnitude.
# Slice the sparse matrix first so only the 10 previewed rows are converted
# to a dense array, not the whole matrix.
nd1 = onehot[:10].toarray()
nd1

在这里插入图片描述

# Per-row position of the largest entry; for one-hot rows this is the column
# that holds the 1, i.e. the category index of each previewed sample.
nd1.argmax(axis = 1)
#array([ 9,  9, 11,  1,  9, 12,  6, 11, 12,  9], dtype=int64)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!