ValueError in pipeline - FeatureHasher not working?

Submitted by 只愿长相守 on 2019-12-11 16:47:02

Question


I think I'm having issues getting my vectorizer working within a gridsearch pipeline:

Data as a pandas DataFrame x_train:

        bathrooms  bedrooms  price  building_id                       manager_id
10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
100013        1.0         4   3350  0                                 98e13ad4b495b9613cef886d79a6291f

numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']

MinMaxScaler fit & transform:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
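
For reference, a minimal usage sketch of this transformer (assuming x_train is the sample DataFrame above; this snippet is illustrative, not part of the original pipeline):

scaler = MyScaler(cols=numeric_predictors)
scaled = scaler.fit(x_train).transform(x_train)
print(scaled.shape)  # (n_rows, 3): one MinMax-scaled column per numeric predictor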

My categorical feature hashing vectorizer:

import pandas as pd  # needed for pd.DataFrame in transform()
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing: 
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):

        data = X[self.cols]

        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features = self.hashing)

        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):

        # Vectorize Input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns = self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )
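
For illustration, a minimal sketch of what this vectorizer returns on the sample data (again assuming x_train as above). Note that the FeatureHasher path can already produce negative values, which is relevant to the error below:

vec = MyVectorizer(cols=categorical_predictors, hashing=5)
hashed = vec.fit(x_train).transform(x_train)
print(hashed.shape)               # (n_rows, 5): one column per hash bucket
print((hashed.values < 0).any())  # may print True: hashed features can be negative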

GridSearch hyperparameters:

search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10]
}

Pipeline:

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])
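
As a sanity check, the preprocessing step can be run on its own; a hedged sketch (assuming x_train as above) showing that the FeatureUnion horizontally concatenates the scaled numeric block with the hashed categorical block:

preprocessed = pipeline.named_steps['preprocess'].fit_transform(x_train)
print(preprocessed.shape)  # (n_rows, 3 + 5) with the default hashing=5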

And last, calling this with the GridSearchCV classifier:

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)

I get a ValueError: Input X must be non-negative. I checked, and my numeric_predictor columns' data are all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.

    ValueError                                Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
      1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
      3 grid_search.best_params_

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
    636                                   error_score=self.error_score)
    637           for parameters, (train, test) in product(candidate_params,
--> 638                                                    cv.split(X, y, groups)))
    639 
    640         # if one choose to see train score, "out" will contain train score info

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
    438 
    439     except Exception as e:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    257         Xt, fit_params = self._fit(X, y, **fit_params)
    258         if self._final_estimator is not None:
--> 259             self._final_estimator.fit(Xt, y, **fit_params)
    260         return self
    261 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
    602         self.feature_count_ = np.zeros((n_effective_classes, n_features),
    603                                        dtype=np.float64)
--> 604         self._count(X, Y)
    605         alpha = self._check_alpha()
    606         self._update_feature_log_prob(alpha)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
    706         """Count and smooth feature occurrences."""
    707         if np.any((X.data if issparse(X) else X) < 0):
--> 708             raise ValueError("Input X must be non-negative")
    709         self.feature_count_ += safe_sparse_dot(Y.T, X)
    710         self.class_count_ += Y.sum(axis=0)

ValueError: Input X must be non-negative


Answer 1:


Yes, when hashing is not None, FeatureHasher() is chosen, which can output negative values.

But you can convert those negative values to positive by using the non_negative parameter of FeatureHasher, as given in the documentation:

non_negative : boolean, optional, default False

When True, an absolute value is applied to the features matrix prior to returning it. When used in conjunction with alternate_sign=True, this significantly reduces the inner product preservation property.

So change this line in MyVectorizer:

self.myvec = FeatureHasher(n_features = self.hashing)

to this:

self.myvec = FeatureHasher(n_features = self.hashing, non_negative=True)
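
A minimal standalone sketch of the fix (for scikit-learn versions before 0.21, where non_negative still exists; the dict values here are made-up placeholders, not from the question's data):

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=5, non_negative=True)
out = h.transform([{'building_id': 'abc', 'manager_id': 'xyz'}]).toarray()
print((out >= 0).all())  # True: an absolute value is applied before returning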

Note:

  • This parameter has been deprecated since version 0.19 and will be removed in 0.21; a forward-compatible alternative is sketched after this list.
  • You need to study how this parameter will affect your classification problem.
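
Since non_negative is deprecated, a forward-compatible sketch for scikit-learn >= 0.19 is to pass alternate_sign=False instead, which disables the random sign flip so hashed counts of non-negative inputs stay non-negative (again with placeholder dict values):

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=5, alternate_sign=False)
out = h.transform([{'building_id': 'abc', 'manager_id': 'xyz'}]).toarray()
print((out >= 0).all())  # True: no sign alternation, so no negative entries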


Source: https://stackoverflow.com/questions/45723699/valueerror-in-pipeline-featurehasher-not-working
