I\'m trying to use scikit-learn\'s LabelEncoder
to encode a pandas DataFrame
of string labels. As the dataframe has many (50+) columns, I want to a
This is a year-and-a-half after the fact, but I too, needed to be able to .transform()
multiple pandas dataframe columns at once (and be able to .inverse_transform()
them as well). This expands upon the excellent suggestion of @PriceHardman above:
class MultiColumnLabelEncoder(LabelEncoder):
"""
Wraps sklearn LabelEncoder functionality for use on multiple columns of a
pandas dataframe.
"""
def __init__(self, columns=None):
self.columns = columns
def fit(self, dframe):
"""
Fit label encoder to pandas columns.
Access individual column classes via indexig `self.all_classes_`
Access individual column encoders via indexing
`self.all_encoders_`
"""
# if columns are provided, iterate through and get `classes_`
if self.columns is not None:
# ndarray to hold LabelEncoder().classes_ for each
# column; should match the shape of specified `columns`
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_encoders_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
# fit LabelEncoder to get `classes_` for the column
le = LabelEncoder()
le.fit(dframe.loc[:, column].values)
# append the `classes_` to our ndarray container
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
# append this column's encoder
self.all_encoders_[idx] = le
else:
# no columns specified; assume all are to be encoded
self.columns = dframe.iloc[:, :].columns
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
le = LabelEncoder()
le.fit(dframe.loc[:, column].values)
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
return self
def fit_transform(self, dframe):
"""
Fit label encoder and return encoded labels.
Access individual column classes via indexing
`self.all_classes_`
Access individual column encoders via indexing
`self.all_encoders_`
Access individual column encoded labels via indexing
`self.all_labels_`
"""
# if columns are provided, iterate through and get `classes_`
if self.columns is not None:
# ndarray to hold LabelEncoder().classes_ for each
# column; should match the shape of specified `columns`
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_encoders_ = np.ndarray(shape=self.columns.shape,
dtype=object)
self.all_labels_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
# instantiate LabelEncoder
le = LabelEncoder()
# fit and transform labels in the column
dframe.loc[:, column] =\
le.fit_transform(dframe.loc[:, column].values)
# append the `classes_` to our ndarray container
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
self.all_labels_[idx] = le
else:
# no columns specified; assume all are to be encoded
self.columns = dframe.iloc[:, :].columns
self.all_classes_ = np.ndarray(shape=self.columns.shape,
dtype=object)
for idx, column in enumerate(self.columns):
le = LabelEncoder()
dframe.loc[:, column] = le.fit_transform(
dframe.loc[:, column].values)
self.all_classes_[idx] = (column,
np.array(le.classes_.tolist(),
dtype=object))
self.all_encoders_[idx] = le
return dframe.loc[:, self.columns].values
def transform(self, dframe):
"""
Transform labels to normalized encoding.
"""
if self.columns is not None:
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[
idx].transform(dframe.loc[:, column].values)
else:
self.columns = dframe.iloc[:, :].columns
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.transform(dframe.loc[:, column].values)
return dframe.loc[:, self.columns].values
def inverse_transform(self, dframe):
"""
Transform labels back to original encoding.
"""
if self.columns is not None:
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.inverse_transform(dframe.loc[:, column].values)
else:
self.columns = dframe.iloc[:, :].columns
for idx, column in enumerate(self.columns):
dframe.loc[:, column] = self.all_encoders_[idx]\
.inverse_transform(dframe.loc[:, column].values)
return dframe.loc[:, self.columns].values
Example:
If df
and df_copy()
are mixed-type pandas
dataframes, you can apply the MultiColumnLabelEncoder()
to the dtype=object
columns in the following way:
# get `object` columns
df_object_columns = df.iloc[:, :].select_dtypes(include=['object']).columns
df_copy_object_columns = df_copy.iloc[:, :].select_dtypes(include=['object']).columns
# instantiate `MultiColumnLabelEncoder`
mcle = MultiColumnLabelEncoder(columns=object_columns)
# fit to `df` data
mcle.fit(df)
# transform the `df` data
mcle.transform(df)
# returns output like below
array([[1, 0, 0, ..., 1, 1, 0],
[0, 5, 1, ..., 1, 1, 2],
[1, 1, 1, ..., 1, 1, 2],
...,
[3, 5, 1, ..., 1, 1, 2],
# transform `df_copy` data
mcle.transform(df_copy)
# returns output like below (assuming the respective columns
# of `df_copy` contain the same unique values as that particular
# column in `df`
array([[1, 0, 0, ..., 1, 1, 0],
[0, 5, 1, ..., 1, 1, 2],
[1, 1, 1, ..., 1, 1, 2],
...,
[3, 5, 1, ..., 1, 1, 2],
# inverse `df` data
mcle.inverse_transform(df)
# outputs data like below
array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
...,
['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
# inverse `df_copy` data
mcle.inverse_transform(df_copy)
# outputs data like below
array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
...,
['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
You can access individual column classes, column labels, and column encoders used to fit each column via indexing:
mcle.all_classes_
mcle.all_encoders_
mcle.all_labels_
import pandas as pd
from sklearn.preprocessing import LabelEncoder
train=pd.read_csv('.../train.csv')
#X=train.loc[:,['waterpoint_type_group','status','waterpoint_type','source_class']].values
# Create a label encoder object
def MultiLabelEncoder(columnlist,dataframe):
for i in columnlist:
labelencoder_X=LabelEncoder()
dataframe[i]=labelencoder_X.fit_transform(dataframe[i])
columnlist=['waterpoint_type_group','status','waterpoint_type','source_class','source_type']
MultiLabelEncoder(columnlist,train)
Here i am reading a csv from location and in function i am passing the column list i want to labelencode and the dataframe I want to apply this.
After lots of search and experimentation with some answers here and elsewhere, I think your answer is here:
pd.DataFrame(columns=df.columns, data=LabelEncoder().fit_transform(df.values.flatten()).reshape(df.shape))
This will preserve category names across columns:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.DataFrame([['A','B','C','D','E','F','G','I','K','H'],
['A','E','H','F','G','I','K','','',''],
['A','C','I','F','H','G','','','','']],
columns=['A1', 'A2', 'A3','A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10'])
pd.DataFrame(columns=df.columns, data=LabelEncoder().fit_transform(df.values.flatten()).reshape(df.shape))
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10
0 1 2 3 4 5 6 7 9 10 8
1 1 5 8 6 7 9 10 0 0 0
2 1 3 9 6 8 7 0 0 0 0
Following up on the comments raised on the solution of @PriceHardman I would propose the following version of the class:
class LabelEncodingColoumns(BaseEstimator, TransformerMixin):
def __init__(self, cols=None):
pdu._is_cols_input_valid(cols)
self.cols = cols
self.les = {col: LabelEncoder() for col in cols}
self._is_fitted = False
def transform(self, df, **transform_params):
"""
Scaling ``cols`` of ``df`` using the fitting
Parameters
----------
df : DataFrame
DataFrame to be preprocessed
"""
if not self._is_fitted:
raise NotFittedError("Fitting was not preformed")
pdu._is_cols_subset_of_df_cols(self.cols, df)
df = df.copy()
label_enc_dict = {}
for col in self.cols:
label_enc_dict[col] = self.les[col].transform(df[col])
labelenc_cols = pd.DataFrame(label_enc_dict,
# The index of the resulting DataFrame should be assigned and
# equal to the one of the original DataFrame. Otherwise, upon
# concatenation NaNs will be introduced.
index=df.index
)
for col in self.cols:
df[col] = labelenc_cols[col]
return df
def fit(self, df, y=None, **fit_params):
"""
Fitting the preprocessing
Parameters
----------
df : DataFrame
Data to use for fitting.
In many cases, should be ``X_train``.
"""
pdu._is_cols_subset_of_df_cols(self.cols, df)
for col in self.cols:
self.les[col].fit(df[col])
self._is_fitted = True
return self
This class fits the encoder on the training set and uses the fitted version when transforming. Initial version of the code can be found here.