How do I change - using for loops to call multiple functions - into - using a pipeline to call a class?

问题

The basic requirement is this: I receive a dictionary of models and a dictionary of their hyperparameters from the user, and produce a report. The current goal is binary classification, but this can be extended later.

This is what I am currently doing:

import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset into a DataFrame, with the label stored in a 'target' column.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']

# Keep 40% of the rows for testing; the split is stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target', axis=1),
    target,
    test_size=0.4,
    random_state=13,
    stratify=target,
)

def build_model(model_name, model_class, params=None):
    """
    Instantiate the estimator for ``model_name``, optionally wrapped in a search.

    :param model_name: display name of the model; the substrings 'Ridge',
        'Lasso' and 'Ensemble' select special constructor arguments.
    :param model_class: estimator class (not an instance) to instantiate.
    :param params: optional dict mapping model names to a hyper-parameter
        search space for that model.
    :return: a RandomizedSearchCV wrapping the estimator when a non-empty
        search space exists for ``model_name``; otherwise the bare estimator.
    """
    if 'Ridge' in model_name:
        model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        # The L1 penalty needs a solver that supports it; scikit-learn's
        # default solver only handles L2, so request liblinear explicitly.
        model = model_class(penalty='l1', solver='liblinear')
    elif 'Ensemble' in model_name:
        model = model_class(
            estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())],
            voting='hard',
        )
    else:
        model = model_class()

    # A model that has no (or an empty) entry in ``params`` falls back to its
    # defaults instead of raising KeyError.
    search_space = params.get(model_name) if params is not None else None
    if search_space:
        print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
        return RandomizedSearchCV(estimator=model, param_distributions=search_space,
                                  random_state=22, n_iter=10, cv=5, verbose=1, n_jobs=-1,
                                  scoring=make_scorer(f1_score), error_score=0.0)

    print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
    return model

def fit_model(model_name, model_instance, xTrain, yTrain):
    """
    Fit ``model_instance`` on the training data and return whatever ``fit`` returns.

    Only the model named exactly 'SVM' gets its features standardised first
    (with a StandardScaler fitted on ``xTrain``); all others see ``xTrain`` as-is.
    """
    needs_scaling = model_name == 'SVM'
    if not needs_scaling:
        return model_instance.fit(xTrain, yTrain)

    # NOTE: the scaler is local to this call and is not handed back to the caller.
    standardised_train = StandardScaler().fit_transform(xTrain)
    return model_instance.fit(standardised_train, yTrain)

def predict_vals(fitted_model, xTest, model_name=None):
    """
    Predict labels for ``xTest`` with an already fitted model.

    :param fitted_model: object exposing ``predict``.
    :param xTest: test features.
    :param model_name: optional display name of the model; 'SVM' triggers
        standardisation of ``xTest`` before predicting. When omitted, a
        module-level ``model_name`` variable is used if one exists, which keeps
        existing two-argument callers working without risking a NameError.
    :return: whatever ``fitted_model.predict`` returns.
    """
    if model_name is None:
        # Backward compatibility: older callers relied on a global loop
        # variable called ``model_name`` instead of passing it in.
        model_name = globals().get('model_name')

    if model_name == 'SVM':
        # NOTE(review): this scaler is fitted on the test features, not on the
        # training features used when the model was fitted - confirm whether
        # the training-time scaler should be reused here instead.
        scaler = StandardScaler()
        return fitted_model.predict(scaler.fit_transform(xTest))

    return fitted_model.predict(xTest)

def get_metrics(yTest, y_prediction):
    """
    Score predictions against the true labels.

    Returns a list in the fixed order [recall, precision, f1, roc_auc].
    """
    scorers = (recall_score, precision_score, f1_score, roc_auc_score)
    return [scorer(yTest, y_prediction) for scorer in scorers]

def model_report(list_of_metrics):
    """
    Turn rows of [model name, recall, precision, f1, roc_auc] into a DataFrame
    whose numeric values are rounded to three decimals.
    """
    headers = ['Model', 'Recall', 'Precision', 'f1', 'roc_auc']
    return pd.DataFrame(list_of_metrics, columns=headers).round(3)

# Display name -> estimator class (classes, not instances).
models = dict([
    ('Logistic Regression Ridge', LogisticRegression),
    ('Logistic Regression Lasso', LogisticRegression),
    ('Random Forest', RandomForestClassifier),
    ('SVM', SVC),
    ('GBM', GradientBoostingClassifier),
    ('EnsembleRFGBM', VotingClassifier),
])

# Hyper-parameter search space per model, keyed by the same display names as
# the model dictionary. Values are candidate lists/arrays to sample from.
# The random candidates are drawn once, when this dictionary is built.
model_parameters = {
    'SVM': {
        # 25 candidate values drawn uniformly between 1 and 50. The bounds are
        # given as (low, high): numpy expects low <= high.
        'C': np.random.uniform(1, 50, [25]),
        'class_weight': ['balanced'],
        'gamma': [0.0001, 0.001],
        'kernel': ['linear']
    },
    'Random Forest': {
        'n_estimators': [5, 10, 50, 100, 200],
        'max_depth': [3, 5, 10, 20, 40],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        # A single randomly chosen leaf size in [1, 10).
        'min_samples_leaf': [np.random.randint(1, 10)]
    },
    'Logistic Regression Ridge': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'Logistic Regression Lasso': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'GBM': {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [np.random.randint(1, 10)]
    },
    'EnsembleRFGBM': {
        # The 'rf__' / 'gbm__' prefixes address the named sub-estimators of the ensemble.
        'rf__n_estimators': [5, 10, 50, 100, 200],
        'rf__max_depth': [3, 5, 10, 20, 40],
        'rf__min_samples_leaf': [np.random.randint(1, 10)],
        'gbm__n_estimators': [10, 50, 100, 200, 500],
        'gbm__max_depth': [3, 5, 10, None],
        'gbm__min_samples_leaf': [np.random.randint(1, 10)]
    }
}

Without parameters I get the following report.

# without parameters
# Build, fit and score every entry of `models` with its default settings,
# collecting one [name, recall, precision, f1, roc_auc] row per model.
lst = []
# NOTE: the loop variable must stay named `model_name`: predict_vals reads a
# module-level variable of that name to decide whether to scale the test data.
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    lst.append([model_name] + metrics)

# Last expression of the cell: evaluates to the report DataFrame.
model_report(lst)

With parameters given as input

# with parameters
# Same flow as the default run, but `model_parameters` is passed to build_model
# so that each model is wrapped in a randomized hyper-parameter search.
lst = []
# NOTE: the loop variable must stay named `model_name`: predict_vals reads a
# module-level variable of that name to decide whether to scale the test data.
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class, model_parameters)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    lst.append([model_name] + metrics)

# Last expression of the cell: evaluates to the report DataFrame.
model_report(lst)

The task given to me right now is as follows.

Take from user, a dictionary of models, and their parameters. If parameters are not provided, then use defaults of the models.
Give as output the report (as seen in images)

I was told that I should change the functions to classes. And avoid for loops if possible.

My challenges:

How do I change all the functions into a class and methods? Basically my senior wants something like

report.getReport # gives the dataFrame of the report

But the above sounds to me like it can be done in a function as follows (I don't understand why/how a class would be beneficial)

customReport(whatever inputs I'd like to give) # gives df of report

How do I avoid for loops to get through the user inputs for various models? What I thought was that I could use sklearn pipeline, since according to my understanding, pipeline is a series of steps, so from user take the params and models, and execute them as a series of steps. This avoids the for loops.

Something like this

customPipeline = Pipeline([ ('rf', RandomForestClassifier(with relevant params from params dict),
                             'SVC', SVC(with relevant params from params dict)) ] )

Similar solution I found is here but I would like to avoid for loops as such.

Another related solution here is using a class which can switch between different models. But here I would require that the user be able to give option whether he wants to do Gridsearch/RandomizedSearch/CV/None. My thinking is that I use this class, then inherit this to another class which the user can give input to choose Gridsearch/RandomizedSearch/CV/None etc. I'm not sure if I'm thinking in the right direction.

NOTE A full working solution is desirable (would love it) but not mandatory. It is ok if your answer has a skeleton which can give me a direction how to proceed. I am ok with exploring and learning from it.

回答1:

You can consider using map(), details here: https://www.geeksforgeeks.org/python-map-function/

Some programmers have the habit of avoiding raw loops - "A raw loop is any loop inside a function where the function serves purpose larger than the algorithm implemented by the loop". More details here: https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf

I think that's the reason you are asked to remove for loop.

回答2:

I have implemented a working solution. I should have worded my question better. I initially misunderstood how GridSearchCV and RandomizedSearchCV work internally: cv_results_ contains the results for every candidate in the grid, whereas I had thought only the best estimator was available to us.

Using this, for each type of model, I took the max rank_test_score, and got the parameters making up the model. In this example, it is 4 models. Now I ran each of those models, i.e. the best combination of parameters for each model, with my test data, and predicted the required scores. I think this solution can be extended to RandomizedSearchCV and a lot more other options.

NOTE: This is only a basic solution. A lot of modifications are still necessary, such as scaling the data for specific models. It is meant as a starting point that can be adapted to the user's needs.

Credits to this answer for the ClfSwitcher() class.

Following is the implementation of the class (suggestions to improve are welcomed).

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')

# Breast-cancer data as a DataFrame; the label lives in the 'target' column.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']

# 60/40 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target', axis=1),
    target,
    test_size=0.4,
    random_state=13,
    stratify=target,
)

class ClfSwitcher(BaseEstimator):
    """
    Estimator wrapper that delegates to an interchangeable classifier.

    Because the wrapped classifier is an ordinary constructor parameter
    (``model``), a parameter grid can swap the classifier itself and tune its
    hyper-parameters through the ``model__<name>`` syntax.
    """

    # NOTE(review): the default classifier instance is created once, when the
    # class is defined, so every ClfSwitcher built without an explicit ``model``
    # shares that same object - confirm this is acceptable for your use.
    def __init__(self, model=RandomForestClassifier()):
        """
        :param model: sklearn object - the classifier to delegate to.
        """
        self.model = model

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return ``self``.

        Extra keyword arguments are accepted but not forwarded.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for ``X`` (``y`` is ignored)."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on ``(X, y)``."""
        # The classifier is stored as ``self.model``; there is no
        # ``self.estimator`` attribute on this class.
        return self.model.score(X, y)

class report(ClfSwitcher):
    """
    Grid-search several classifier families and summarise the best of each.

    Typical use: ``griddy`` (configure the search), ``fit_grid`` (run it), then
    ``make_grid_report`` and/or ``make_concise_report``. Results are stored on
    the instance as ``full_report`` and ``concise_report`` DataFrames.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.grid = None            # GridSearchCV instance, set by griddy()
        self.full_report = None     # DataFrame of all cv_results_, set by make_grid_report()
        self.concise_report = None  # one row per model family, set by make_concise_report()
        # Column name -> metric function called as metric(y_true, y_pred).
        self.scoring_metrics = {
            'precision': precision_score,
            'recall': recall_score,
            'f1': f1_score,
            'roc_auc': roc_auc_score
        }

    def griddy(self, pipeLine, parameters, **kwargs):
        """
        Create the grid search and store it on ``self.grid``.

        :param pipeLine: estimator/pipeline to search over.
        :param parameters: parameter grid (dict or list of dicts).
        :param kwargs: extra GridSearchCV options; they override the defaults
            ``scoring='accuracy'`` and ``n_jobs=-1``.
        """
        search_options = {'scoring': 'accuracy', 'n_jobs': -1}
        # Honour caller-supplied options (e.g. scoring='f1') instead of dropping them.
        search_options.update(kwargs)
        self.grid = GridSearchCV(pipeLine, parameters, **search_options)

    def fit_grid(self, X_train, y_train=None, **kwargs):
        """Run the configured grid search on the training data (call ``griddy`` first)."""
        self.grid.fit(X_train, y_train)

    def make_grid_report(self):
        """Store every candidate's cross-validation results in ``self.full_report``."""
        self.full_report = pd.DataFrame(self.grid.cv_results_)

    @staticmethod
    def get_names(col):
        """Return the class name of ``col`` (e.g. the estimator's type name)."""
        return col.__class__.__name__

    @staticmethod
    def calc_score(col, metric):
        """
        Fit estimator ``col`` on the module-level training split and score its
        predictions on the module-level test split, rounded to 4 decimals.
        """
        return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)

    @staticmethod
    def _configured_model(params, key='cst__model'):
        """
        Build a fresh, unfitted classifier from one candidate's ``params`` dict.

        ``params[key]`` is the classifier template; every ``<key>__<name>`` entry
        is applied to a clone of it, so the template shared by all candidates of
        the same family is left untouched.
        """
        from sklearn.base import clone

        prefix = key + '__'
        overrides = {
            name[len(prefix):]: value
            for name, value in params.items()
            if name.startswith(prefix)
        }
        return clone(params[key]).set_params(**overrides)

    def make_concise_report(self):
        """
        Build ``self.concise_report``: one row per classifier family, holding the
        best-ranked candidate and its scores on the module-level test split.

        Columns: ``model_names``, one column per entry of
        ``self.scoring_metrics``, and ``param_cst__model`` (the refitted model).
        """
        results = pd.DataFrame(self.grid.cv_results_)
        results['model_names'] = results['param_cst__model'].apply(self.get_names)

        # rank_test_score == 1 marks the best candidate, so sort ranks ascending
        # before keeping the first row of each family.
        best = (
            results.sort_values(['model_names', 'rank_test_score'], ascending=[True, True])
            .groupby('model_names')
            .head(1)
            .reset_index(drop=True)
        )

        concise = best[['model_names']].copy()
        # Rebuild each winner from its full parameter set: the object stored in
        # 'param_cst__model' is only the template listed in the grid and is not
        # a reliable record of the candidate's hyper-parameters.
        concise['param_cst__model'] = best['params'].apply(self._configured_model)

        # Fit and predict once per model so that all metrics describe the same fit.
        predictions = [
            model.fit(X_train, y_train).predict(X_test)
            for model in concise['param_cst__model']
        ]
        for metric_name, metric_func in self.scoring_metrics.items():
            concise[metric_name] = [
                round(metric_func(y_test, y_pred), 4) for y_pred in predictions
            ]

        self.concise_report = concise[['model_names', *self.scoring_metrics, 'param_cst__model']]

# One-step pipeline; the step is named 'cst' and holds a default ClfSwitcher.
pipeline = Pipeline(steps=[('cst', ClfSwitcher())])

# One sub-grid per classifier family. 'cst__model' swaps the classifier held by
# the 'cst' pipeline step; 'cst__model__<name>' tunes that classifier.
parameters = [
    {
        'cst__model': [RandomForestClassifier()],
        'cst__model__n_estimators': [10, 20],
        'cst__model__max_depth': [5, 10],
        'cst__model__criterion': ['gini', 'entropy']
    },
    {
        'cst__model': [SVC()],
        'cst__model__C': [10, 20],
        'cst__model__kernel': ['linear'],
        'cst__model__gamma': [0.0001, 0.001]
    },
    {
        'cst__model': [LogisticRegression()],
        'cst__model__C': [13, 17],
        'cst__model__penalty': ['l1', 'l2'],
        # liblinear supports both penalties searched here; the default solver
        # rejects 'l1', which would make every 'l1' candidate fail to fit.
        'cst__model__solver': ['liblinear']
    },
    {
        'cst__model': [GradientBoostingClassifier()],
        'cst__model__n_estimators': [10, 50],
        'cst__model__max_depth': [3, 5],
        'cst__model__min_samples_leaf': [1, 2]
    }
]

# Configure the search, run it on the training split, then build the summary.
my_report = report()
# Passes scoring='f1' as an extra keyword argument to griddy.
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
# Last expression of the cell: evaluates to the concise report DataFrame.
my_report.concise_report

Output Report as desired.

来源：https://stackoverflow.com/questions/55468376/how-do-i-change-using-for-loops-to-call-multiple-functions-into-using-a-pi

标签

python

for-loop

scikit-learn

pipeline