原理请参考以下链接:
https://blog.csdn.net/suzyu12345/article/details/81461667
https://blog.csdn.net/whiterbear/article/details/53120004
增量学习有几点需要注意:
1.不是所有模型都支持增量学习,参考:https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
2. fit 与 partial_fit 函数的区别需要注意:可以先 fit,再 partial_fit;如果没有先调用 fit,而是直接调用 partial_fit,则第一次调用时必须通过 classes 参数指定全部类别标签,例如:GaussianNB2.partial_fit(train[:,:3], train[:,3], classes=np.unique(train[:,3]))
代码示例:
"""Demo of incremental (online) learning with scikit-learn.

Workflow of the demo:
1. Read the training spreadsheets and the test spreadsheet.
2. Feed the training data several times to the estimators that support
   ``partial_fit`` (the first round uses ``fit``, later rounds ``partial_fit``).
3. Train the estimators that do not support incremental training with a
   plain ``fit`` on the training data.
4. Predict the test data with every model, combine the predictions by
   voting and count how many voted labels differ from the true labels.
"""
import logging
from random import shuffle  # kept from the original script (unused here)

import matplotlib.pyplot as plt  # kept from the original script (unused here)
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
# NOTE(review): the original also imported ``liblinear`` and ``libsvm`` from
# sklearn.svm. They are unused in this script and, to my knowledge, are not
# importable from the public namespace in recent scikit-learn releases, so
# they are left out here -- confirm against the installed version.
from sklearn.svm import SVC, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM, SVR, l1_min_c
from sklearn.tree import DecisionTreeClassifier

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)

# Spreadsheet locations used by the demo.
TRAIN_FILES = ('../data/model1.xls', '../data/model0.xls')
TEST_FILE = '../data/model2.xls'

# Column layout used throughout the script: the first N_FEATURES columns are
# passed to the estimators as features, the column right after them as label.
N_FEATURES = 3

# A sample is voted 1 when the summed predictions of all models reach this
# value, otherwise 0.
VOTE_THRESHOLD = 3


def _split_xy(data):
    """Return ``(features, labels)`` sliced from a 2-D array."""
    return data[:, :N_FEATURES], data[:, N_FEATURES]


def get_train_data(datafiles=TRAIN_FILES):
    """Load the training spreadsheets and return their rows shuffled.

    Args:
        datafiles: Paths of the Excel files to read; their rows are
            concatenated in the given order before shuffling.

    Returns:
        A 2-D numpy array with the rows in random order.
    """
    frames = [pd.read_excel(path) for path in datafiles]
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    data = pd.concat(frames)
    # copy=True guarantees a writable array for the in-place shuffle below.
    train_data = data.to_numpy(copy=True)
    np.random.shuffle(train_data)
    return train_data


def get_test_data(datafile=TEST_FILE):
    """Load the test spreadsheet and return its rows shuffled.

    Args:
        datafile: Path of the Excel file to read.

    Returns:
        A 2-D numpy array with the rows in random order.
    """
    test_data = pd.read_excel(datafile).to_numpy(copy=True)
    np.random.shuffle(test_data)
    return test_data


class partialFitClassifiers:
    """Bundle of incrementally trainable and offline-only classifiers.

    Attributes:
        cls_status: Per-classifier settings. The flag
            ``check_partial_fit_first_call`` is False until the classifier
            has been fitted once.
        partial_fit_classifiers: Estimators trained via ``fit`` on the first
            call and ``partial_fit`` on later calls.
        off_line_classifiers: Estimators trained with ``fit`` only.
    """

    def __init__(self):
        """Create all estimators.

        Raises:
            Exception: Whatever the estimator constructors raise; the error
                is logged and re-raised so that no half-initialised object
                is handed to the caller.
        """
        try:
            self.cls_status = {
                'sgdClassfier': {"max_iter": 900, "check_partial_fit_first_call": False},
                'Perceptron': {"check_partial_fit_first_call": False},
                'GaussianNB': {"check_partial_fit_first_call": False},
                'Passive_Aggressive': {"check_partial_fit_first_call": False},
            }
            self.partial_fit_classifiers = {
                'sgdClassfier': SGDClassifier(
                    max_iter=self.cls_status.get('sgdClassfier').get('max_iter', 900)
                ),
                'Perceptron': Perceptron(),
                'GaussianNB': GaussianNB(),
                'Passive_Aggressive': PassiveAggressiveClassifier(),
            }
            self.off_line_classifiers = {
                "decisionTree": DecisionTreeClassifier(),
                "gbdt": GradientBoostingClassifier(n_estimators=100),
            }
            logging.info(" successed initial partialFitClassifiers object ")
        except Exception as e:
            logging.error("ERROR: initial partialFitClassifiers object failed , %s" % e)
            raise

    def train_partial_models(self, train_data):
        """Train every incremental classifier on one batch.

        The first batch a classifier sees is passed to ``fit``; every later
        batch is passed to ``partial_fit``.

        Args:
            train_data: 2-D array; features and label are taken by column
                position (see ``N_FEATURES``).
        """
        logging.info("try to train models by online-learning…")
        features, labels = _split_xy(train_data)
        for cls_name, cls in self.partial_fit_classifiers.items():
            status = self.cls_status[cls_name]
            if status.get("check_partial_fit_first_call"):
                cls.partial_fit(features, labels)
            else:
                logging.info("first time to fit classifier:%s" % cls_name)
                cls.fit(features, labels)
                status["check_partial_fit_first_call"] = True

    def predict_test_data(self, test_data):
        """Predict with every incremental classifier.

        Args:
            test_data: 2-D array; only the feature columns are used.

        Returns:
            Dict mapping classifier name to its array of predictions.
        """
        features, _ = _split_xy(test_data)
        return {
            cls_name: cls.predict(features)
            for cls_name, cls in self.partial_fit_classifiers.items()
        }

    def train_off_line_models(self, train_data):
        """Fit every offline classifier on the given training data.

        Args:
            train_data: 2-D array; features and label are taken by column
                position (see ``N_FEATURES``).
        """
        features, labels = _split_xy(train_data)
        for cls in self.off_line_classifiers.values():
            cls.fit(features, labels)

    def predict_test_data_by_off_line(self, test_data):
        """Predict with every offline classifier.

        Args:
            test_data: 2-D array; only the feature columns are used.

        Returns:
            Dict mapping classifier name to its array of predictions.
        """
        features, _ = _split_xy(test_data)
        return {
            cls_name: cls.predict(features)
            for cls_name, cls in self.off_line_classifiers.items()
        }

    # Original (misspelled) method name, kept so existing callers still work.
    predic_test_data_by_off_line = predict_test_data_by_off_line

    # TODO: save_models(self) -- persist the trained models locally.


def mainFunction():
    """Run the demo: train online and offline models, vote, report mismatches.

    Returns:
        None. All results are printed.
    """
    try:
        partial_fit_cls = partialFitClassifiers()
        logging.info("partial_fit_cls_test:%s" % partial_fit_cls)
    except Exception as e:
        logging.error("get partial_fit_cls object filed in mainFunction :%s" % e)
        return None

    train_data = get_train_data()
    test_data = get_test_data()

    # The same training data is fed three times: the first round goes through
    # fit(), the following rounds through partial_fit().
    for round_label in ("第一次", "第二次", "第三次"):
        print("online:%s训练数据输入…" % round_label)
        partial_fit_cls.train_partial_models(train_data)

    print("三次训练后的测试:")
    actual = test_data[:, N_FEATURES]
    result = np.zeros(len(actual))
    for prediction in partial_fit_cls.predict_test_data(test_data).values():
        result += prediction

    print("离线学习模型训练、测试:")
    partial_fit_cls.train_off_line_models(train_data)
    for prediction in partial_fit_cls.predict_test_data_by_off_line(test_data).values():
        result += prediction
    print("result:", result)

    # Vote: summed predictions at or above the threshold count as label 1.
    # NOTE(review): this presumes the labels are 0/1 -- confirm with the data.
    deal_result = [1 if votes >= VOTE_THRESHOLD else 0 for votes in result]
    print("deal_result:", deal_result)

    test_data_real_list = list(actual)
    print(test_data_real_list)
    print("teseal:", actual, type(actual))

    # j is the number of samples whose voted label differs from the true one.
    j = sum(
        1
        for predicted, real in zip(deal_result, test_data_real_list)
        if predicted != real
    )
    print("j:", j, len(actual))


if __name__ == '__main__':
    mainFunction()
来源:CSDN
作者:小金子的夏天
链接:https://blog.csdn.net/WangYouJin321/article/details/103969798