Label encoding across multiple columns in scikit-learn

后端 未结 22 1915
礼貌的吻别
礼貌的吻别 2020-11-22 09:02

I'm trying to use scikit-learn's LabelEncoder to encode a pandas DataFrame of string labels. As the dataframe has many (50+) columns, I want to a…

相关标签:
22条回答
  • 2020-11-22 09:48

    This is a year and a half after the fact, but I, too, needed to be able to .transform() multiple pandas dataframe columns at once (and to .inverse_transform() them as well). This expands upon the excellent suggestion of @PriceHardman above:

    class MultiColumnLabelEncoder(LabelEncoder):
        """
        Wraps sklearn LabelEncoder functionality for use on multiple columns of a
        pandas dataframe.

        One independent ``LabelEncoder`` is fitted per column. After fitting,
        the per-column results are available by position (same order as
        ``self.columns``):

        * ``self.all_classes_``  -- ``(column, classes)`` tuples
        * ``self.all_encoders_`` -- the fitted ``LabelEncoder`` objects
        * ``self.all_labels_``   -- encoded labels (set by `fit_transform` only)

        NOTE: `fit_transform`, `transform` and `inverse_transform` overwrite
        the selected columns of the dataframe passed in, in addition to
        returning the converted values. Pass ``dframe.copy()`` to keep the
        original data.

        Parameters
        ----------
        columns : sequence of column labels or pandas Index, optional
            Columns to encode. When ``None``, every column of the dataframe
            seen first is encoded and ``self.columns`` is set accordingly.
        """

        def __init__(self, columns=None):
            self.columns = columns

        def _resolve_columns(self, dframe):
            """Return the columns to encode, defaulting to all of `dframe`'s."""
            if self.columns is None:
                # no columns specified; assume all are to be encoded
                self.columns = dframe.columns
            return self.columns

        def _fit_columns(self, dframe, encode):
            """
            Fit one LabelEncoder per column and record classes and encoders.

            When `encode` is true, each column of `dframe` is also replaced
            by its encoded labels, which are stored in ``self.all_labels_``.
            """
            columns = self._resolve_columns(dframe)
            # `len()` works for lists as well as pandas Index objects, and the
            # containers are allocated whether or not `columns` was given.
            n_columns = len(columns)
            self.all_classes_ = np.empty(n_columns, dtype=object)
            self.all_encoders_ = np.empty(n_columns, dtype=object)
            if encode:
                self.all_labels_ = np.empty(n_columns, dtype=object)

            for idx, column in enumerate(columns):
                le = LabelEncoder()
                values = dframe.loc[:, column].values
                if encode:
                    labels = le.fit_transform(values)
                    # Replace the whole column (rather than `.loc[:, column] =`)
                    # so it takes the integer dtype of the labels instead of
                    # staying an object column.
                    dframe[column] = labels
                    self.all_labels_[idx] = labels
                else:
                    le.fit(values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                   dtype=object))
                self.all_encoders_[idx] = le

        def _apply_encoders(self, dframe, method_name):
            """
            Run `method_name` of each fitted encoder on its column of `dframe`.

            The columns are overwritten in `dframe`; the converted columns are
            also returned as a 2-D array.
            """
            columns = self._resolve_columns(dframe)
            for idx, column in enumerate(columns):
                convert = getattr(self.all_encoders_[idx], method_name)
                # Whole-column replacement lets the dtype follow the result
                # (integers after `transform`, labels after the inverse).
                dframe[column] = convert(dframe.loc[:, column].values)
            return dframe.loc[:, columns].values

        def fit(self, dframe):
            """
            Fit label encoder to pandas columns.

            Access individual column classes via indexing `self.all_classes_`

            Access individual column encoders via indexing
            `self.all_encoders_`

            Returns
            -------
            self
            """
            self._fit_columns(dframe, encode=False)
            return self

        def fit_transform(self, dframe):
            """
            Fit label encoder and return encoded labels.

            The selected columns of `dframe` are overwritten with their
            encoded labels.

            Access individual column classes via indexing
            `self.all_classes_`

            Access individual column encoders via indexing
            `self.all_encoders_`

            Access individual column encoded labels via indexing
            `self.all_labels_`

            Returns
            -------
            numpy.ndarray
                Encoded values of the selected columns, one column each.
            """
            self._fit_columns(dframe, encode=True)
            return dframe.loc[:, self.columns].values

        def transform(self, dframe):
            """
            Transform labels to normalized encoding.

            The selected columns of `dframe` are overwritten with their
            encoded labels, which are also returned as a 2-D array. Requires
            a previous call to `fit` or `fit_transform`.
            """
            return self._apply_encoders(dframe, 'transform')

        def inverse_transform(self, dframe):
            """
            Transform labels back to original encoding.

            The selected columns of `dframe` are overwritten with the original
            labels, which are also returned as a 2-D array. Requires a
            previous call to `fit` or `fit_transform`.
            """
            return self._apply_encoders(dframe, 'inverse_transform')
    

    Example:

    If df and df_copy are mixed-type pandas dataframes, you can apply the MultiColumnLabelEncoder to their dtype=object columns in the following way:

    # get `object` columns
    df_object_columns = df.select_dtypes(include=['object']).columns
    df_copy_object_columns = df_copy.select_dtypes(include=['object']).columns

    # instantiate `MultiColumnLabelEncoder` on the `object` columns of `df`
    # (the original passed the undefined name `object_columns`)
    mcle = MultiColumnLabelEncoder(columns=df_object_columns)

    # fit to `df` data
    mcle.fit(df)

    # transform the `df` data
    mcle.transform(df)
    
    # returns output like below
    array([[1, 0, 0, ..., 1, 1, 0],
           [0, 5, 1, ..., 1, 1, 2],
           [1, 1, 1, ..., 1, 1, 2],
           ..., 
           [3, 5, 1, ..., 1, 1, 2]])
    
    # transform `df_copy` data
    mcle.transform(df_copy)
    
    # returns output like below (assuming the respective columns 
    # of `df_copy` contain the same unique values as that particular 
    # column in `df`)
    array([[1, 0, 0, ..., 1, 1, 0],
           [0, 5, 1, ..., 1, 1, 2],
           [1, 1, 1, ..., 1, 1, 2],
           ..., 
           [3, 5, 1, ..., 1, 1, 2]])
    
    # inverse `df` data
    mcle.inverse_transform(df)
    
    # outputs data like below
    array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
           ..., 
           ['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
    
    # inverse `df_copy` data
    mcle.inverse_transform(df_copy)
    
    # outputs data like below
    array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
           ..., 
           ['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
    

    You can access individual column classes, column labels, and column encoders used to fit each column via indexing:

    mcle.all_classes_
    mcle.all_encoders_
    mcle.all_labels_

    0 讨论(0)
  • 2020-11-22 09:48
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    
    train=pd.read_csv('.../train.csv')
    
    #X=train.loc[:,['waterpoint_type_group','status','waterpoint_type','source_class']].values
    # Create a label encoder object 
    def MultiLabelEncoder(columnlist,dataframe):
        """
        Label-encode the given columns of `dataframe` in place.

        A separate ``LabelEncoder`` is fitted for every column, so codes are
        not comparable across columns.

        Parameters
        ----------
        columnlist : iterable of column labels
            Columns of `dataframe` to encode.
        dataframe : pandas.DataFrame
            Frame whose columns are overwritten with their integer codes.

        Returns
        -------
        dict
            Maps each column label to its fitted ``LabelEncoder``, so the
            encoding can be reversed with ``inverse_transform`` or reused
            on other data with ``transform``.
        """
        encoders = {}
        for column in columnlist:
            encoder = LabelEncoder()
            dataframe[column] = encoder.fit_transform(dataframe[column])
            # Keep the fitted encoder; discarding it makes the codes one-way.
            encoders[column] = encoder
        return encoders
    # Columns of `train` to label-encode; the call overwrites them in place.
    columnlist = [
        'waterpoint_type_group',
        'status',
        'waterpoint_type',
        'source_class',
        'source_type',
    ]
    MultiLabelEncoder(columnlist, train)
    

    Here I read a CSV file from disk and pass the function the list of columns I want to label-encode, together with the dataframe to apply the encoding to.

    0 讨论(0)
  • 2020-11-22 09:49

    After lots of search and experimentation with some answers here and elsewhere, I think your answer is here:

    pd.DataFrame(columns=df.columns, data=LabelEncoder().fit_transform(df.values.flatten()).reshape(df.shape))

    This will preserve category names across columns:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    
    # Toy frame whose cells all draw from one label vocabulary; empty strings
    # pad the shorter rows.
    column_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']
    rows = [
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'H'],
        ['A', 'E', 'H', 'F', 'G', 'I', 'K', '', '', ''],
        ['A', 'C', 'I', 'F', 'H', 'G', '', '', '', ''],
    ]
    df = pd.DataFrame(rows, columns=column_names)

    # Fit a single encoder on every cell at once, then fold the codes back
    # into the frame's shape so a label gets the same code in every column.
    flat_codes = LabelEncoder().fit_transform(df.values.flatten())
    pd.DataFrame(columns=df.columns, data=flat_codes.reshape(df.shape))
    
        A1  A2  A3  A4  A5  A6  A7  A8  A9  A10
    0   1   2   3   4   5   6   7   9   10  8
    1   1   5   8   6   7   9   10  0   0   0
    2   1   3   9   6   8   7   0   0   0   0
    
    0 讨论(0)
  • 2020-11-22 09:49

    Following up on the comments raised on the solution of @PriceHardman I would propose the following version of the class:

    class LabelEncodingColoumns(BaseEstimator, TransformerMixin):
        """
        Label-encode selected columns of a DataFrame, one encoder per column.

        The encoders are fitted in `fit` (typically on the training set) and
        reused unchanged in `transform`, so train and test data share one
        mapping.

        Parameters
        ----------
        cols : iterable of column labels, optional
            Columns to encode. Validated by ``pdu._is_cols_input_valid``
            (NOTE(review): that helper is defined outside this snippet;
            whether it accepts ``None`` should be confirmed).
        """

        def __init__(self, cols=None):
            pdu._is_cols_input_valid(cols)
            self.cols = cols
            # Guard the default: iterating over ``None`` would raise TypeError.
            if cols is None:
                self.les = {}
            else:
                self.les = {col: LabelEncoder() for col in cols}
            self._is_fitted = False

        def transform(self, df, **transform_params):
            """
            Label-encode ``cols`` of ``df`` using the fitted encoders.

            Parameters
            ----------
            df : DataFrame
                DataFrame to be preprocessed

            Returns
            -------
            DataFrame
                A copy of ``df`` with ``cols`` replaced by their integer
                codes; the input frame is left untouched.

            Raises
            ------
            NotFittedError
                If `fit` has not been called yet.
            """
            if not self._is_fitted:
                raise NotFittedError("Fitting was not preformed")
            pdu._is_cols_subset_of_df_cols(self.cols, df)

            df = df.copy()
            for col in self.cols:
                # Assigning the encoded array positionally keeps the original
                # index, so no NaNs can appear through index misalignment.
                df[col] = self.les[col].transform(df[col])
            return df

        def fit(self, df, y=None, **fit_params):
            """
            Fitting the preprocessing

            Parameters
            ----------
            df : DataFrame
                Data to use for fitting.
                In many cases, should be ``X_train``.

            Returns
            -------
            self
            """
            pdu._is_cols_subset_of_df_cols(self.cols, df)
            for col in self.cols:
                self.les[col].fit(df[col])
            self._is_fitted = True
            return self
    

    This class fits the encoder on the training set and uses the fitted version when transforming. Initial version of the code can be found here.

    0 讨论(0)
提交回复
热议问题