问题
I updated Neuraxle to the latest version (3.4).
I noticed that the whole auto_ml.py module was redone. I checked the documentation, but there is nothing about it. On git, it seems the RandomSearch() method was replaced a long time ago by the AutoML() method. However, the parameters are different.
Does somebody know how I can plug the Boston Housing example pipeline into the automatic parameter search of the latest Neuraxle version (3.4)?
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA, FastICA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from neuraxle.hyperparams.distributions import RandInt, LogUniform, Boolean
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.metaopt.auto_ml import RandomSearch
from neuraxle.metaopt.random import KFoldCrossValidationWrapper
from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import NumpyTranspose
from neuraxle.steps.sklearn import SKLearnWrapper
from neuraxle.union import AddFeatures, ModelStacking
def main():
    """Run the Boston Housing example: random hyperparameter search, then R2 evaluation.

    Loads the dataset, shuffles it with a fixed seed, and holds out 25% of the
    samples as a test set. A feature-augmentation + model-stacking pipeline is
    then meta-fitted with ``RandomSearch`` (10 iterations, 10-fold
    cross-validation scored with ``r2_score``), and the R2 score of the
    resulting pipeline is printed for both the train and the test split.
    """
    # NOTE(review): load_boston was removed from recent scikit-learn releases —
    # confirm the installed version still provides it.
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but it could be already set
    # within the classes at their definition if using custom classes, or also it could be defined after declaring the
    # pipeline using a flat dict or a nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600), "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })
                ),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})
                ),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

    print("Meta-fitting on train:")
    p = p.meta_fit(X_train, y_train, metastep=RandomSearch(
        n_iter=10,
        higher_score_is_better=True,
        validation_technique=KFoldCrossValidationWrapper(scoring_function=r2_score, k_fold=10)
    ))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidationWrapper(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")
    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")
    print("Evaluating transformed train:")
    # r2_score takes (y_true, y_pred); the metric is not symmetric, so the
    # ground truth must come first.
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)


if __name__ == "__main__":
    main()
回答1:
Here is a solution to your problem. It is a new example that isn't yet published on the documentation site:
- https://drive.google.com/drive/u/0/folders/12uzcNKU7n0EUyFzgitSt1wSaSvV4qJbs (go see the solution to the 2nd coding Kata from there)
Sample pipeline code from the link above:
from neuraxle.base import Identity
from neuraxle.steps.flow import TrainOnlyWrapper, ChooseOneStepOf
from neuraxle.steps.numpy import NumpyConcatenateInnerFeatures, NumpyShapePrinter, NumpyFlattenDatum
from neuraxle.union import FeatureUnion
# Debug printer that reports the raw input shape; wrapped so it only runs at train time.
input_shape_printer = TrainOnlyWrapper(
    NumpyShapePrinter(custom_message="Input shape before feature union")
)

# Frequency-domain branch: FFT magnitude, then two feature views concatenated together.
fft_features = Pipeline([
    NumpyFFT(),
    NumpyAbs(),
    FeatureUnion(
        [
            # Flatten everything but the batch dimension (3D -> 2D).
            NumpyFlattenDatum(),
            # Derive 2D features from the 3D FFT bins.
            FFTPeakBinWithValue(),
        ],
        joiner=NumpyConcatenateInnerFeatures(),
    ),
])

# All feature branches side by side, joined on the inner (feature) axis.
feature_extractor = FeatureUnion(
    [
        fft_features,
        NumpyMean(),
        NumpyMedian(),
        NumpyMin(),
        NumpyMax(),
    ],
    joiner=NumpyConcatenateInnerFeatures(),
)

pipeline = Pipeline([
    input_shape_printer,
    feature_extractor,
    # TODO, optional: Add some feature selection right here for the motivated ones:
    #     https://scikit-learn.org/stable/modules/feature_selection.html
    # TODO, optional: Add normalization right here (if using other classifiers)
    #     https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
    TrainOnlyWrapper(
        NumpyShapePrinter(custom_message="Shape after feature union, before classification")
    ),
    # At this point the data is shaped [batch_size, remade_features].
    ChooseOneStepOf([
        decision_tree_classifier,
        # extra_tree_classifier,  # TODO
        # ridge_classifier,  # TODO
        logistic_regression,
        # random_forest_classifier  # TODO
    ]),
    TrainOnlyWrapper(
        NumpyShapePrinter(custom_message="Shape at output after classification")
    ),
    # After classification the data is shaped [batch_size].
    Identity(),
])
Then do AutoML:
from neuraxle.metaopt.auto_ml import AutoML, InMemoryHyperparamsRepository, validation_splitter, \
RandomSearchHyperparameterSelectionStrategy
from neuraxle.metaopt.callbacks import ScoringCallback
from sklearn.metrics import accuracy_score
# Random-search AutoML loop over `pipeline`: 7 trials of 1 epoch each, validated
# on a 20% hold-out split, with trial results kept by an in-memory repository.
# NOTE(review): `cache_folder` is not defined in this snippet — it must be set
# before this call.
auto_ml = AutoML(
    pipeline=pipeline,
    hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
    validation_split_function=validation_splitter(test_size=0.20),
    # Accuracy is a "higher is better" metric; flagging it as lower-is-better
    # would rank the least accurate trial as the best one.
    scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
    n_trials=7,
    epochs=1,
    hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=cache_folder),
    refit_trial=True,
)
This example is also studied within the Clean Machine Learning training of Neuraxio:
- https://www.neuraxio.com/products/clean-machine-learning-training
来源:https://stackoverflow.com/questions/60742991/neuraxles-randomsearch-successor