sklearn模型的保存和加载API

# joblib is a standalone package; the `sklearn.externals.joblib` alias was
# deprecated in scikit-learn 0.21 and removed in 0.23.
import joblib

- 保存：joblib.dump(estimator, 'test.pkl')
- 加载：estimator = joblib.load('test.pkl')

将训练好的模型保存下来，下次有新的需要预测的数据传进来时，直接加载模型进行预测，就不用每次都重新训练了。

案例：癌症(逻辑回归)分类预测–良/恶性

import pandas as pd
import numpy as np

'''# 1、读取数据'''
# Location of the breast-cancer-wisconsin data file in the UCI repository.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels are supplied by hand: the first entry is the sample id, the
# last one ('Class') is the target, everything in between is a feature.
column_name = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Passing `names` makes pandas treat every row of the file as data;
# `header=None` only spells that out explicitly.
data = pd.read_csv(path, header=None, names=column_name)


'''2 数据预处理'''
# 2. Missing-value handling.
# 1) Turn the "?" placeholders into NaN so pandas recognises them as missing.
# 2) Drop every sample (row) that contains a missing value.
data = data.replace(to_replace="?", value=np.nan).dropna()
# A column that contained "?" was parsed as strings and keeps object dtype
# after the rows are dropped; convert every column to a numeric dtype so the
# features are real numbers instead of numeric-looking strings.
data = data.apply(pd.to_numeric)


'''# 3、划分数据集'''
from sklearn.model_selection import train_test_split

# Features: every column except the first (sample id) and the last (label).
x = data.iloc[:, 1:len(data.columns) - 1]
# Target: the "Class" column.
y = data.loc[:, "Class"]

# Split with train_test_split's default settings (no explicit test size or
# random seed is passed, so the partition differs from run to run).
splits = train_test_split(x, y)
x_train, x_test, y_train, y_test = splits


'''# 4、特征工程---标准化'''
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test split is scaled with training statistics.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)


'''# 5、预估器流程'''
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default hyper-parameters and
# train it on the standardized training split (fit() returns the estimator).
estimator = LogisticRegression().fit(x_train, y_train)

'''模型保存'''
# joblib is a standalone package; the `sklearn.externals.joblib` alias was
# deprecated in scikit-learn 0.21 and removed in 0.23.
import joblib

# Persist the trained model to LogisticRegression.pkl in the current working
# directory so it can be reloaded later without retraining.
joblib.dump(estimator, 'LogisticRegression.pkl')

# Show the fitted parameters of the logistic-regression model:
# the per-feature coefficients and the intercept (bias) term.
coefficients = estimator.coef_
intercept = estimator.intercept_
print('回归系数:', coefficients)
print("误差(偏置)：", intercept)


'''# 6、模型评估'''
from sklearn.metrics import classification_report

# Approach 1: predict labels for the test split so they can be compared
# with the true labels.
y_predict = estimator.predict(x_test)
print("y_predict(预测值):\n", y_predict)

# Approach 2: accuracy on the test split.
score = estimator.score(x_test, y_test)
print("准确率为：", score)

# Precision, recall and F1-score per class; label 2 is reported under the
# first display name and label 4 under the second.
report = classification_report(
    y_true=y_test,
    y_pred=y_predict,
    labels=[2, 4],
    target_names=["良性", "恶性"],
)
print("查看精确率、召回率、F1-score：\n", report)


# ROC curve / AUC metric.
from sklearn.metrics import roc_auc_score

# roc_auc_score expects binary ground truth: map class 4 to 1 (positive) and
# class 2 to 0 (negative).
y_true = np.where(y_test > 3, 1, 0)

# AUC measures how well the model *ranks* samples, so it must be fed
# continuous scores. Passing hard predicted labels collapses the ROC curve to
# a single operating point. Use the predicted probability of the positive
# class (label 4); its column is looked up through `classes_` rather than
# assumed.
positive_column = list(estimator.classes_).index(4)
y_score = estimator.predict_proba(x_test)[:, positive_column]

AUC = roc_auc_score(y_true, y_score)  # the closer to 1, the better
print("AUC:", AUC)

回归系数: [[1.47945227 0.07579265 0.59505721 0.69195463 0.33274168 1.16446335
  1.16645995 0.92205206 0.72380317]]
误差(偏置)： [-0.93015988]
y_predict(预测值):
 [4 4 2 4 2 2 4 4 4 2 2 4 2 2 4 2 4 4 2 2 2 2 4 2 2 4 2 2 4 2 4 4 2 4 2 4 2
 4 2 2 2 4 4 2 4 2 2 4 2 2 2 4 4 2 2 2 4 2 2 2 4 2 2 2 4 2 4 2 4 4 4 2 4 2
 2 4 2 2 2 4 2 4 2 2 2 4 4 4 2 4 2 2 4 2 4 4 2 2 2 2 2 2 2 4 2 2 2 4 2 2 2
 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 4
 2 2 2 2 2 2 2 4 2 4 2 2 2 4 4 4 2 2 2 2 4 2 2]
准确率为： 0.9766081871345029
查看精确率、召回率、F1-score：
              precision    recall  f1-score   support

         良性       0.98      0.98      0.98       114
         恶性       0.96      0.96      0.96        57

avg / total       0.98      0.98      0.98       171

AUC: 0.9736842105263157

'''加载模型并进行预测'''

# Features to predict on. For demonstration, 50 random rows of the data set
# loaded earlier stand in for new samples (id and label columns are dropped).
x = data.iloc[:, 1:-1].sample(50)

# Standardize with the scaler that was fitted on the TRAINING data
# (`transfer`, created in the feature-engineering step). Fitting a fresh
# StandardScaler on the prediction batch would scale the features with
# different statistics than the model was trained on and skew predictions.
# NOTE: when predicting in a separate process, persist the fitted scaler
# alongside the model (e.g. with joblib.dump) and load both.
x_s = transfer.transform(x)

# joblib is a standalone package; the `sklearn.externals.joblib` alias was
# deprecated in scikit-learn 0.21 and removed in 0.23.
import joblib

# Load the persisted model and predict. Only load files you trust: joblib
# files are pickle-based and can execute code when loaded.
estimator = joblib.load('LogisticRegression.pkl')
y_predict = estimator.predict(x_s)
print("y_predict(预测值):\n", y_predict)

# Present the result as a table: the sampled features plus the prediction.
# A separate name is used so the full data set in `data` is not overwritten,
# which keeps this section re-runnable.
result = x.assign(y_predict=y_predict)
display(result.head())

y_predict(预测值):
 [2 2 4 2 2 4 4 4 2 2 2 2 2 4 4 4 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2
 2 2 4 2 4 4 4 2 4 2 2 2 2]

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	y_predict
429	2	1	1	1	2	1	2	1	1	2
219	6	1	3	1	2	1	3	1	1	2
320	7	6	3	2	5	10	7	4	6	4
423	5	1	3	1	2	1	2	1	1	2
678	1	1	1	1	2	1	1	1	1	2

来源：CSDN

作者：Jalen data analysis

链接：https://blog.csdn.net/weixin_41685388/article/details/104515820

标签

预测模型

test

data

estimator

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	y_predict
429	2	1	1	1	2	1	2	1	1	2
219	6	1	3	1	2	1	3	1	1	2
320	7	6	3	2	5	10	7	4	6	4
423	5	1	3	1	2	1	2	1	1	2
678	1	1	1	1	2	1	1	1	1	2

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	y_predict
429	2	1	1	1	2	1	2	1	1	2
219	6	1	3	1	2	1	3	1	1	2
320	7	6	3	2	5	10	7	4	6	4
423	5	1	3	1	2	1	2	1	1	2
678	1	1	1	1	2	1	1	1	1	2

sklearn模型的保存和加载API--案例癌症(逻辑回归)分类预测

sklearn模型的保存和加载API

案例：癌症(逻辑回归)分类预测–良/恶性

	Clump Thickness	Uniformity of Cell Size	Uniformity of Cell Shape	Marginal Adhesion	Single Epithelial Cell Size	Bare Nuclei	Bland Chromatin	Normal Nucleoli	Mitoses	y_predict
429	2	1	1	1	2	1	2	1	1	2
219	6	1	3	1	2	1	3	1	1	2
320	7	6	3	2	5	10	7	4	6	4
423	5	1	3	1	2	1	2	1	1	2
678	1	1	1	1	2	1	1	1	1	2