问题
I have a data set with categorical and numerical features on which I want to apply some transformations followed by XGBClassifier.
Link to data set: https://www.kaggle.com/blastchar/telco-customer-churn
As the transformations are different for the numerical and categorical features, I used sklearn_pandas and its DataFrameMapper.
To perform one-hot encoding on the categorical features, I want to use DictVectorizer. But to use DictVectorizer, I first need to convert the dataframe into a dict, which I try to do with a custom transformer Dictifier.
When I run the Pipeline I get the error 'builtin_function_or_method' object is not iterable. Does anyone know what might be causing this error?
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn_pandas import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import xgboost as xgb
# Importing the data
# A cell holding a single space is read as a missing value (na_values=' ').
df = pd.read_csv('../input/WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=' ')
# Features: every column except the first and the last; target: the last column.
# (The first column is presumably a customer identifier - TODO confirm.)
X, y = df.iloc[:,1:-1], df.iloc[:,-1]
# Label encoding of the target classes
le = LabelEncoder()
# The reshape flattens the target values to a 1-D array of length len(y).
y = le.fit_transform(y.values.reshape(y.shape[0], ))
# Defining the num and cat column names
# Columns with object dtype are treated as categorical, all others as numerical.
cat_cols = X.columns[X.dtypes == object].tolist()
num_cols = X.columns[X.dtypes != object].tolist()
# DataFrameMappers for num and cat columns
# Numerical columns (each selected as a one-element list): median imputation, then scaling.
num_transf_mapper = DataFrameMapper([([num_col], [Imputer(strategy="median"), StandardScaler()]) for num_col in num_cols],
input_df=True,
df_out=True)
# Categorical columns (each selected by its bare name): CategoricalImputer only.
cat_transf_mapper = DataFrameMapper([(cat_col , [CategoricalImputer()]) for cat_col in cat_cols],
input_df=True,
df_out=True)
# FeatureUnion of num and cat columns
# NOTE(review): FeatureUnion concatenates the outputs of both mappers; verify
# that the DataFrame requested via df_out=True survives this step.
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
("cat_mapper", cat_transf_mapper)])
# Custom transformer to convert Pandas DataFrame into Dict (needed for DictVectorizer)
class Dictifier(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a DataFrame into a list of row dicts.

    Each row becomes one ``{column: value}`` mapping.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a list with one dictionary per row.

        ``X`` must provide ``to_dict`` (i.e. be a pandas DataFrame).
        """
        row_dicts = X.to_dict(orient='records')
        return row_dicts
# Pipeline
# Steps, in order: column-wise preprocessing (FeatureUnion of the two mappers),
# conversion of rows to dicts, one-hot encoding of the dicts, XGBoost classifier.
pipeline = Pipeline([("featureunion", num_cat_union),
("dictifier", Dictifier()),
("vectorizer", DictVectorizer(sort=False)),
("clf", xgb.XGBClassifier(max_depth=3))])
# Perform cross-validation
# 3-fold cross-validation scored with ROC AUC. Note that cross_val_score here
# is the function imported from sklearn_pandas at the top of the script.
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
Error trace
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py:542: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-187-96272018fb87> in <module>()
53
54 # Perform cross-validation
---> 55 cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/cross_validation.py in cross_val_score(model, X, *args, **kwargs)
19 warnings.warn(DEPRECATION_MSG, DeprecationWarning)
20 X = DataWrapper(X)
---> 21 return sk_cross_val_score(model, X, *args, **kwargs)
22
23
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
320
321 def __call__(self, *args, **kwargs):
--> 322 return self.func(*args, **kwargs)
323
324 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
790 delayed(_fit_transform_one)(trans, X, y, weight,
791 **fit_params)
--> 792 for name, trans, weight in self._iter())
793
794 if not result:
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
460 else:
461 # fit method of arity 2 (supervised transformation)
--> 462 return self.fit(X, y, **fit_params).transform(X)
463
464
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/dataframe_mapper.py in transform(self, X)
342 stacked,
343 columns=self.transformed_names_,
--> 344 index=index)
345 # preserve types
346 for col, dtype in zip(self.transformed_names_, dtypes):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
377 else:
378 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 379 copy=copy)
380 elif isinstance(data, (list, types.GeneratorType)):
381 if isinstance(data, types.GeneratorType):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
525 raise_with_traceback(e)
526
--> 527 index, columns = _get_axes(*values.shape)
528 values = values.T
529
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _get_axes(N, K, index, columns)
482 index = com._default_index(N)
483 else:
--> 484 index = _ensure_index(index)
485
486 if columns is None:
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _ensure_index(index_like, copy)
4972 index_like = copy(index_like)
4973
-> 4974 return Index(index_like)
4975
4976
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
449 data, names=name or kwargs.get('names'))
450 # other iterable of some kind
--> 451 subarr = com._asarray_tuplesafe(data, dtype=object)
452 return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
453
/opt/conda/lib/python3.6/site-packages/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
303
304 if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
--> 305 values = list(values)
306 elif isinstance(values, Index):
307 return values.values
TypeError: 'builtin_function_or_method' object is not iterable
回答1:
This seems like a bug in sklearn_pandas.cross_val_score.
sklearn_pandas wraps the DataFrame you supply in a DataWrapper object, as seen in the source code here:
def cross_val_score(model, X, *args, **kwargs):
    """Deprecated wrapper around scikit-learn's ``cross_val_score``.

    Emits a ``DeprecationWarning``, wraps ``X`` in a ``DataWrapper`` and
    forwards ``model``, the wrapped ``X`` and all remaining positional and
    keyword arguments to ``sk_cross_val_score``, returning its result.
    """
    warnings.warn(DEPRECATION_MSG, DeprecationWarning)
    # From here on X is a DataWrapper, no longer the caller's DataFrame.
    X = DataWrapper(X)
    return sk_cross_val_score(model, X, *args, **kwargs)
This is apparently done to support older versions of sklearn.cross_validation.cross_val_score, which did not handle pandas DataFrames well. DataWrapper returns a list instance when it is divided into train and test sets.
That list, however, is not handled correctly during transform() of DataFrameMapper, as shown in the source code here:
if self.df_out:
# if no rows were dropped preserve the original index,
# otherwise use a new integer one
no_rows_dropped = len(X) == len(stacked)
if no_rows_dropped:
index = X.index # <== This here is the source of error
else:
index = None
Here, X is not a DataFrame but a list object, so X.index is not the pandas index as intended; it is the built-in list.index method, hence the error you got.
But since the newer sklearn cross_val_score handles DataFrames correctly, you don't have to use the sklearn_pandas import.
Change it from:
from sklearn_pandas import cross_val_score
to this:
from sklearn.model_selection import cross_val_score
Now you won't get that error anymore.
However, further down the code you will get another error:
AttributeError: 'numpy.ndarray' object has no attribute 'to_dict'
This is because you wrap both of your DataFrameMapper objects in a FeatureUnion by doing this:
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
("cat_mapper", cat_transf_mapper)])
and then do this:
pipeline = Pipeline([("featureunion", num_cat_union),
("dictifier", Dictifier()),
("vectorizer", DictVectorizer(sort=False)),
("clf", xgb.XGBClassifier(max_depth=3))])
Your Dictifier expects a DataFrame to be passed to it, so that it can call to_dict() on it, but the previous step in the pipeline, FeatureUnion, does not preserve the DataFrame; it converts it into a numpy array.
Generally, DataFrameMapper and FeatureUnion don't work well together. I would advise you to remove the FeatureUnion altogether and instead combine both of your DataFrameMapper objects into a single object. This will effectively work the way you wanted FeatureUnion to work.
Something like this:
# One mapper for everything: numeric column specs first, then categorical ones
transformers = [
    ([num_col], [Imputer(strategy="median"), StandardScaler()])
    for num_col in num_cols
] + [
    (cat_col, [CategoricalImputer()])
    for cat_col in cat_cols
]

# A single DataFrameMapper takes a DataFrame in and hands a DataFrame back
num_cat_union = DataFrameMapper(
    transformers,
    input_df=True,
    df_out=True,
)
# Your other code
...
...
回答2:
Let me show just part of the code, the way I do it:
class MultiColumn(BaseEstimator, TransformerMixin):
    """Select a subset of DataFrame columns as a pipeline step.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to keep.  ``None`` (the default) keeps every column.
    """

    def __init__(self, columns=None):
        # Stored unmodified so that get_params()/clone() keep working.
        self.columns = columns  # array of column names to select

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns``.

        With the default ``columns=None`` the input is returned unchanged;
        indexing with ``X[None]`` would fail instead of selecting anything.
        """
        if self.columns is None:
            return X
        return X[self.columns]
# Module-level column subsets of the full DataFrame; both names are referenced
# by the Imputation and Cat transformers defined below.
# ('var1' ... 'var4' are presumably placeholder column names - TODO confirm.)
NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]
class Imputation(BaseEstimator, TransformerMixin):
    """Fill missing values with per-column medians learned during ``fit``.

    Learning the medians from the data passed to ``fit`` keeps statistics of
    held-out rows out of the training step (e.g. during cross-validation).
    """

    def fit(self, X, y=None, **fit_params):
        """Store the column medians of ``X`` in ``self.medians_``."""
        self.medians_ = X.median()
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` with missing values replaced by the stored medians."""
        medians = getattr(self, "medians_", None)
        if medians is None:
            # Backward compatibility: an instance that was never fitted falls
            # back to the medians of the module-level NUMERIC frame.
            medians = NUMERIC.median()
        return X.fillna(medians)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return the imputed ``X``."""
        return self.fit(X, y, **fit_params).transform(X)
class Cat(BaseEstimator, TransformerMixin):
    """One-hot encode DataFrame columns with a ``DictVectorizer``.

    The vectorizer is learned once in ``fit`` and reused by ``transform``
    instead of being rebuilt from a global frame on every call.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn the feature vocabulary from the rows of ``X``."""
        self.vectorizer_ = DictVectorizer(sparse=False)
        self.vectorizer_.fit(X.to_dict("records"))
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the dense encoded array for the rows of ``X``."""
        vectorizer = getattr(self, "vectorizer_", None)
        if vectorizer is None:
            # Backward compatibility: an instance that was never fitted falls
            # back to a vocabulary built from the module-level CATEGORICAL frame.
            vectorizer = DictVectorizer(sparse=False)
            vectorizer.fit(CATEGORICAL.to_dict("records"))
        # to_dict("records") yields one dict per row; going through
        # X.T.to_dict() would merge rows that share an index label.
        enc_data = vectorizer.transform(X.to_dict("records"))
        # NOTE(review): NaN entries are overwritten with 1, as in the original
        # code - confirm that this is the intended fill value.
        enc_data[np.isnan(enc_data)] = 1
        return enc_data

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its encoded form."""
        return self.fit(X, y, **fit_params).transform(X)
And Pipeline
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric: select columns, impute, scale (without centering)
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                # with_mean expects a boolean; False keeps the original
                # "no centering" behaviour of with_mean=0.
                ('scaling', preprocessing.StandardScaler(with_mean=False)),
            ])),
            # categorical: select columns, one-hot encode, impute
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('one_hot', Cat()),
                # Every Pipeline step must be a (name, transformer) tuple.
                # NOTE(review): this imputer runs after the one-hot step, as in
                # the original ordering - confirm that this is intended.
                ('cat_imp', CategoricalImputer()),
            ])),
        ])),
    ('model_fitting', xgb.XGBClassifier(max_depth=3)),
])
回答3:
All together in a pipeline. I hope this might help.
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
# Report how many missing values each feature column contains
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Boolean mask: True for object-dtype (categorical) columns
categorical_feature_mask = X.dtypes == object

# Names of the categorical columns ...
categorical_columns = X.columns[categorical_feature_mask].tolist()

# ... and of all remaining (non-categorical) columns
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Median-impute each non-categorical column (selected as a one-element list)
numeric_imputation_mapper = DataFrameMapper(
    [
        ([numeric_feature], Imputer(strategy="median"))
        for numeric_feature in non_categorical_columns
    ],
    input_df=True,
    df_out=True,
)

# Impute each categorical column (selected by its bare name)
categorical_imputation_mapper = DataFrameMapper(
    [
        (category_feature, CategoricalImputer())
        for category_feature in categorical_columns
    ],
    input_df=True,
    df_out=True,
)
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion
# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper),
])

# Create full pipeline
# NOTE(review): Dictifier is not defined in this snippet; presumably it is the
# DataFrame-to-dict transformer from the question - confirm that it accepts
# the output of the FeatureUnion step.
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier()),
])

# Perform cross-validation
# Pass X, the feature frame the mappers above were built from; the name
# `kidney_data` is not defined anywhere in this snippet.
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))
来源:https://stackoverflow.com/questions/52055658/sklearn-pandas-in-a-pipeline-returns-typeerror-builtin-function-or-method-obj