VotingClassifier: Different Feature Sets

后端 未结 2 835

I have two different feature sets (with the same number of rows and the same labels), in my case DataFrames:

df1:

| A          


        
2条回答
  •  礼貌的吻别
    2021-02-02 16:51

    To use as many sklearn tools as possible, I find the following approach more appealing.

    from sklearn.base import TransformerMixin, BaseEstimator
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import VotingClassifier
    
    ######################
    # custom transformer for sklearn pipeline
    class ColumnExtractor(TransformerMixin, BaseEstimator):
        """Pipeline step that keeps only the selected columns of ``X``.

        Parameters
        ----------
        cols : iterable of int
            Positional indices of the columns to keep, in output order.
            Negative indices count from the last column.
        """

        def __init__(self, cols):
            # Stored unmodified so that the value passed in is the value
            # read back from the instance attribute of the same name.
            self.cols = cols

        def transform(self, X):
            """Return the columns listed in ``self.cols`` as a 2-D NumPy array.

            Parameters
            ----------
            X : array-like or pandas DataFrame, shape (n_samples, n_features)
                Input data. DataFrames are indexed by position, not by label.

            Returns
            -------
            numpy.ndarray, shape (n_samples, len(cols))

            Raises
            ------
            IndexError
                If an index in ``cols`` is out of range for ``X``.
            """
            # Materialise once: ``cols`` may be a range or another iterable.
            idx = list(self.cols)
            # Integer-array indexing instead of per-column ``c:c+1`` slices:
            # a slice such as ``-1:0`` is empty, so negative indices used to
            # be dropped silently, as were out-of-range ones.
            if hasattr(X, "iloc"):
                return X.iloc[:, idx].to_numpy()
            return np.asarray(X)[:, idx]

        def fit(self, X, y=None):
            """Do nothing and return ``self``; no state is learned from ``X``."""
            return self
    
    ######################
    # load the iris data and hold out a test split
    data = load_iris()
    X, y = data.data, data.target
    # no random_state is passed here, so the split is not fixed by this script
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    ######################
    # clf1 (logistic regression) is trained on the df1 features only: columns 0 and 1
    pipe1 = Pipeline(steps=[
        ('col_extract', ColumnExtractor(cols=range(0, 2))),
        ('clf', LogisticRegression()),
    ])

    # sanity check: fit the pipeline on its own and score it on the held-out split
    pipe1.fit(X_train, y_train).score(X_test, y_test)
    # output: 0.6842105263157895
    
    ######################
    # clf2 (SVC) is trained on the df2 features only: columns 2 and 3
    pipe2 = Pipeline(steps=[
        ('col_extract', ColumnExtractor(cols=range(2, 4))),
        # probability=True so the SVC can provide class probabilities
        ('clf', SVC(probability=True)),
    ])

    # sanity check: fit the pipeline on its own and score it on the held-out split
    pipe2.fit(X_train, y_train).score(X_test, y_test)
    # output: 0.9736842105263158
    
    ######################
    # soft-voting ensemble: each pipeline selects its own feature subset, so
    # the ensemble is fitted on the full X while clf1 sees df1 and clf2 sees df2
    eclf = VotingClassifier(
        estimators=[('df1-clf1', pipe1), ('df2-clf2', pipe2)],
        voting='soft',
        weights=[1, 0.5],
    )
    eclf.fit(X_train, y_train).score(X_test, y_test)
    # output: 0.9473684210526315
    

提交回复
热议问题