# -*- coding: utf-8 -*-
"""scikit-learn introduction.

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1quaJafg43SN7S6cNwKFr0_WYn2ELt4Ph

Official scikit-learn website: https://scikit-learn.org/stable/

The script walks through four topics in order: classification (SVM on the
iris dataset), regression (linear regression on the diabetes dataset),
clustering (k-means on synthetic blobs) and model evaluation
(cross-validation on the iris dataset).
"""

import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, svm
from sklearn.cluster import KMeans
# make_blobs must come from sklearn.datasets: the former
# sklearn.datasets.samples_generator module was deprecated in scikit-learn
# 0.22 and removed in 0.24.
from sklearn.datasets import make_blobs
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)

# ---------------------------------------------------------------------------
# Classification
# - SVM (support vector machine): svm.SVC()
#
# iris dataset
# - iris features: sepal length, sepal width, petal length, petal width
# - iris labels: setosa, versicolor, virginica
# ---------------------------------------------------------------------------
iris = datasets.load_iris()
print('iris feature\n', iris.data[0:5])
print('iris label\n', iris.target[0:5])

# Build the model. It is fitted and then scored on the very same samples, so
# the metrics printed below are training-set scores.
clf = svm.SVC()
irisX = iris.data
irisY = iris.target
clf.fit(irisX, irisY)
irisPred = clf.predict(irisX)
# Prediction for the first sample shown above. Printed explicitly because a
# bare expression is only displayed inside a notebook, not in a script.
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))

# Evaluation metrics: accuracy, precision, recall, F1.
print('acc is ', accuracy_score(irisY, irisPred))
print('precision is ', precision_score(irisY, irisPred, average='macro'))
print('recall is ', recall_score(irisY, irisPred, average='macro'))
print('F1 is ', f1_score(irisY, irisPred, average='macro'))

# ---------------------------------------------------------------------------
# Regression
# - linear regression: linear_model.LinearRegression()
#
# diabetes dataset
# ---------------------------------------------------------------------------
diabetes = datasets.load_diabetes()
# Keep only the first feature column; indexing with [0] (a list) preserves
# the 2-D (n_samples, 1) layout that fit/predict expect.
diabetesX = diabetes.data[:, [0]]
diabetesY = diabetes.target
print('feature\n', diabetesX[:5])
print('label\n', diabetesY[:5])

# Build the model.
regr = linear_model.LinearRegression()
regr.fit(diabetesX, diabetesY)
diabetesPred = regr.predict(diabetesX)
# Prediction for the first value of the original data (printed for the same
# reason as the classifier prediction above).
print(regr.predict([[0.03807591]]))

# Each plot gets its own figure; without this, every plotting call in the
# script would draw onto the same implicit axes.
plt.figure()
plt.scatter(diabetesX, diabetesY)  # scatter plot of the original data
plt.plot(diabetesX, diabetesPred)  # fitted regression line

# Evaluation metric: mean squared error (MSE).
print('mean squared error is ', mean_squared_error(diabetesY, diabetesPred))

# ---------------------------------------------------------------------------
# Clustering
# - k-means
#
# Create the dataset.
# ---------------------------------------------------------------------------
clusterX, clusterY = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=[[-1, -1], [0, 0], [1, 1], [2, 2]],
    cluster_std=[0.4, 0.2, 0.2, 0.2],
    random_state=0,
)
plt.figure()
plt.scatter(clusterX[:, 0], clusterX[:, 1])

# Build the model.
# NOTE(review): the blobs are generated around 4 centers but k-means is asked
# for 2 clusters -- presumably deliberate for the demo; confirm.
clu = KMeans(n_clusters=2, random_state=9)
clusterPredict = clu.fit_predict(clusterX)
plt.figure()
plt.scatter(clusterX[:, 0], clusterX[:, 1], c=clusterPredict)
plt.show()

# ---------------------------------------------------------------------------
# Model evaluation
# - cross validation, using the iris dataset as the example
# ---------------------------------------------------------------------------
# NOTE(review): this silences every warning from here on; presumably added to
# hide library deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

clf = svm.SVC()
scores = cross_val_score(clf, irisX, irisY, cv=10, scoring='accuracy')
print('十折交叉验证分别的accuracy ', scores)
# mean() keeps the average correct even if the number of folds changes.
print('平均的accuracy ', scores.mean())

# Run 10-fold cross validation ten times, varying the shuffle seed each time.
accEachTime = []
for i in range(10):
    clf = svm.SVC()
    scores = cross_val_score(
        clf,
        irisX,
        irisY,
        cv=KFold(n_splits=10, random_state=i, shuffle=True),
        scoring='accuracy',
    )
    print(scores)
    accEachTime.append(scores.mean())
print('每一次的accuracy值 ', accEachTime)
print('十次十折交叉验证的平均accuracy值 ', sum(accEachTime) / len(accEachTime))
# Source: https://www.cnblogs.com/hannahzhao/p/11959326.html