import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# cv as an int, e.g. 6, splits the data into 6 folds; how the folds are made is handled by the KFold classes imported below
from sklearn.model_selection import cross_val_score,GridSearchCV #用到下面的Kfold
# KFold and StratifiedKFold control how many folds the data is split into;
# KFold is the plain split, StratifiedKFold splits proportionally by class.
# (The annotation text that had been fused onto the imported names is moved
# into this comment so the import resolves.)
from sklearn.model_selection import KFold, StratifiedKFold
# Toy data set: 8 samples with 2 integer features drawn from [0, 10),
# plus a binary target with four samples per class.
data = np.random.randint(low=0, high=10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
display(data, target)
# Example output (the features are random, so values differ per run):
#array([[8, 9],
# [8, 5],
# [0, 1],
# [1, 9],
# [8, 2],
# [0, 1],
# [7, 1],
# [0, 6]])
#array([0, 0, 1, 0, 1, 1, 1, 0])
# Random split into train/test parts; the result is only displayed here.
train_test_split(data, target)
#[array([[0, 1],
# [8, 5],
# [7, 1],
# [8, 2],
# [0, 1],
# [0, 6]]), array([[8, 9],
# [1, 9]]), array([1, 0, 1, 1, 1, 0]), array([0, 0])]
# Split the 8 samples into 4 folds.
kFold = KFold(n_splits=4)
# train and test are index arrays; the indices are enough to fetch the data.
for train, test in kFold.split(data, target):  # split() returns a generator
    # Target values of the training part and of the test part for this fold.
    print(target[train], target[test])
#[1 0 1 1 1 0] [0 0] #测试数据两个0,没有按比例分,数据量少的时候,数据预测不太好
#[0 0 1 1 1 0] [1 0]
#[0 0 1 0 1 0] [1 1]
#[0 0 1 0 1 1] [1 0]
# Split into 4 folds; each fold keeps the same class proportions as the
# original target.
sKFold = StratifiedKFold(n_splits=4)
for train, test in sKFold.split(data, target):
    # Target values of the training part and of the test part for this fold.
    print(target[train], target[test])
#[0 0 1 1 1 0] [0 1]
#[0 1 0 1 1 0] [0 1]
#[0 0 1 1 1 0] [0 1]
#[0 0 1 0 1 1] [1 0]
# train_test_split随机拆,KFold,StratifiedKFold作用都是将数据拆分
# Load the salary data set and build the feature matrix X and target y.
data = pd.read_csv('./salary.txt')
data.head()
# The last column is salary (e.g. '<=50K').
data.columns
#Index(['age', 'workclass', 'final_weight', 'education', 'education_num','marital_status', 'occupation', 'relationship', 'race', 'sex','capital_gain', 'capital_loss', 'hours_per_week', 'native_country','salary'],dtype='object')
data.drop(labels=['final_weight','education','capital_gain','capital_loss'],axis = 1,inplace=True)
data.shape
#(32561, 11)
data.head()
# Copy the slice: the feature columns are overwritten with integer codes
# later, and assigning into a slice of `data` triggers pandas'
# SettingWithCopyWarning and may or may not write through to `data`.
X = data.iloc[:, 0:-1].copy()
y = data['salary']
knn = KNeighborsClassifier()
# Fitting is expected to fail here: KNN cannot compute distances on string
# columns. Catch the error so the rest of the script can still run.
try:
    knn.fit(X, y)
except ValueError as err:
    print('KNN cannot be fitted on string features:', err)
# 方法将数据中str转换int,float从而算法可以计算
# map方法,apply,transform
# Distinct workclass values, in order of first appearance.
u = pd.unique(X['workclass'])
u
#array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov','Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],dtype=object)
# Position of one category inside u; that position is used as its integer code.
np.argwhere(u == 'Local-gov')[0][0]
#4
def convert(x):
    """Return the position of *x* in the module-level array ``u``.

    ``u`` is looked up when the function is called, so rebinding ``u``
    changes which categories are encoded. Raises IndexError when *x* is
    not present in ``u``.
    """
    return np.argwhere(u == x)[0, 0]
# Replace every string column of X with integer codes. A value's code is its
# position in that column's unique() array (order of first appearance).
X['workclass'] = X['workclass'].map(convert)
X.head()
cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    # u stays a module-level name, so convert() keeps resolving against the
    # most recently processed column, as it did before.
    u = X[col].unique()
    # One dict lookup per row instead of scanning u with np.argwhere per row.
    codes = {value: index for index, value in enumerate(u)}
    X[col] = X[col].map(codes)
X.head()
# Manual 10-fold cross-validation of a default KNN classifier.
n_splits = 10
kFold = KFold(n_splits=n_splits)
knn = KNeighborsClassifier()
scores = []
for train, test in kFold.split(X, y):
    # split() yields positional indices, so select rows with .iloc; .loc and
    # y[...] are label-based and only work while the index is 0..n-1.
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    scores.append(acc)
# Mean accuracy over the folds actually produced, not a hard-coded 10.
accuracy = sum(scores) / len(scores)
print(accuracy)
#0.7973345728987424
X.head()
y.unique()
#array(['<=50K', '>50K'], dtype=object)
# Look in sklearn.preprocessing for other ways to convert str columns to int/float.
# Encoding for KNN: convert string columns to numeric types
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
# Reload the raw data and drop the columns that are not used as features.
salary = pd.read_csv('./salary.txt')
salary = salary.drop(
    columns=['final_weight', 'education_num', 'capital_gain', 'capital_loss']
)
# Ordinal-encode every column; the transform result is a NumPy array.
ordinalEncoder = OrdinalEncoder()
ordinalEncoder.fit(salary)
data = ordinalEncoder.transform(salary)
# Wrap the array in a DataFrame that reuses the original column names.
salary_ordinal = DataFrame(data, columns=salary.columns)
salary_ordinal.head()
# LabelEncoder (takes a Series) is similar to OrdinalEncoder (takes a DataFrame).
labelEncode = LabelEncoder()
salary_label = labelEncode.fit_transform(salary['salary'])
# Encode into a copy so `salary` keeps its original string columns; the
# one-hot section below reads salary[['education']] and its recorded output
# shows the string categories.
salary_encoded = salary.copy()
for col in salary.columns:
    salary_encoded[col] = labelEncode.fit_transform(salary[col])
salary_encoded.head()
# Define edu before its first use. OneHotEncoder needs two-dimensional
# input, hence the double brackets (a one-column DataFrame, not a Series).
edu = salary[['education']]
edu
# Number of distinct education values.
edu.drop_duplicates().count()
#education 16
#dtype: int64
onehotEncoder = OneHotEncoder()
onehot = onehotEncoder.fit_transform(edu)
onehot
#<32561x16 sparse matrix of type '<class 'numpy.float64'>'with 32561 stored elements in Compressed Sparse Row format>
edu
# Sorted distinct education values.
np.sort(edu['education'].unique())
#array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th','Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad','Masters', 'Preschool', 'Prof-school', 'Some-college'],dtype=object)
# One-hot encoding: not affected by the magnitude of the codes.
# Dense view of the first 10 encoded rows.
nd1 = onehot[:10].toarray()
nd1
# Column index of the 1 in each row.
np.argmax(nd1, axis=1)
#array([ 9, 9, 11, 1, 9, 12, 6, 11, 12, 9], dtype=int64)
# Source: CSDN
# Author: W流沙W
# Link: https://blog.csdn.net/shunjianxaioshi/article/details/103914378