问题
EDIT: the question remains the same but the code has changed.
I am working on the Home Credit dataset on Kaggle, specifically on installments_payments.csv. The following are my custom transformers:
class Xfrmer_replace1(BaseEstimator, TransformerMixin):
    """Replace the case-study-specific value 365243 with 0 in the whole frame.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new DataFrame in which every occurrence of 365243 (integer or
    float) has been replaced by 0.

    NOTE: replacing +/-inf and NaN is *not* performed here; only the 365243
    replacement is active.
    """

    def __init__(self):
        # Placeholder attribute; this transformer takes no configuration.
        self._features = None

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (nothing to learn)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with 365243 replaced by 0.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame with the replacement applied to every column.
        """
        # DataFrame.replace already scans every column and returns a new
        # frame, so a single call is enough; repeating it once per column
        # only redid the same work.
        X = X.replace([365243, 365243.0], 0)
        print('replaced values')
        return X
class Xfrmer_signchng1(BaseEstimator, TransformerMixin):
    """Turn negative values into positive ones and zero out the rest.

    For every column of the input, a value ``v`` is mapped to ``0`` when
    ``v >= 0`` and to ``-v`` otherwise. NaN fails the ``>= 0`` comparison and
    therefore stays NaN.
    """

    def __init__(self):
        # Placeholder attribute; this transformer takes no configuration.
        self.signchng_columns = None

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (nothing to learn)."""
        return self

    def transform(self, X, y=None):
        """Return a sign-changed copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric columns to convert; the input is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Frame with the same columns and the mapping described in the
            class docstring applied.
        """
        # Work on a copy: the frame handed to a transformer can be a slice of
        # the caller's data, and assigning into it raises pandas'
        # SettingWithCopyWarning.
        X = X.copy()
        for col in X.columns:
            print('sign change')
            values = X[col]
            # Negate everything, then overwrite the originally non-negative
            # positions with 0 (vectorised form of the element-wise rule).
            X[col] = (-values).mask(values >= 0, 0)
        return X
class Xfrmer_dif_calc1(BaseEstimator, TransformerMixin):
    """Append the difference between the first two input columns.

    ``transform`` computes ``first column - second column`` and stores it in
    a new column named ``'AMT_PMT_DIF'``. The transformer is stateless.
    """

    def __init__(self):
        # Placeholder attribute; this transformer takes no configuration.
        self.dif_columns = None

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (nothing to learn)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``'AMT_PMT_DIF'`` column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with at least two columns; the input is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The input columns followed by ``'AMT_PMT_DIF'``.
        """
        print('diff caclulator')
        print('X columns', X.columns)
        minuend = X[X.columns[0]]
        subtrahend = X[X.columns[1]]
        print(minuend)
        print(subtrahend)
        # Add the new column to a copy: the frame passed in can be a slice of
        # the caller's data, and writing to it triggers
        # SettingWithCopyWarning without reliably updating anything.
        X = X.copy()
        X['AMT_PMT_DIF'] = minuend - subtrahend
        print(X['AMT_PMT_DIF'])
        return X
class Xfrmer_rto_calc1(BaseEstimator, TransformerMixin):
    """Append the ratio of the first two input columns.

    ``transform`` computes ``first column / second column``, clips the result
    at a lower bound of 0 and stores it in a new column named
    ``'AMT_PMT_RTO'``. The transformer is stateless.
    """

    def __init__(self):
        # Placeholder attribute; this transformer takes no configuration.
        self.rto_columns = None

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (nothing to learn)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``'AMT_PMT_RTO'`` column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with at least two columns; the input is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The input columns followed by ``'AMT_PMT_RTO'``.
        """
        print('ratio caclulator')
        ratio = (X[X.columns[0]] / X[X.columns[1]]).clip(lower=0)
        # Add the new column to a copy: the frame passed in can be a slice of
        # the caller's data, and writing to it triggers
        # SettingWithCopyWarning.
        X = X.copy()
        X['AMT_PMT_RTO'] = ratio
        return X
This is how I am consuming my pipelines
# Column groups handed to the individual transformers.
lst_all_cols = dtprcs.X_train.columns.values.tolist()
lst_signchng_cols = ["DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT"]
lst_imptr_cols = ['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT']
lst_diff_cols = ['AMT_PAYMENT', "AMT_INSTALMENT"]
lst_rto_cols = ['AMT_PAYMENT', "AMT_INSTALMENT"]

print('Starting pipeline processing')
# Each entry is (name, transformer, columns): the transformer is applied to
# the listed columns only.
instpmt_preprcs_pipln = ColumnTransformer(
    transformers=[
        ('instpmt_repl_pipln', Xfrmer_replace1(), lst_all_cols),
        ('instpmt_sgnchng_pipln', Xfrmer_signchng1(), lst_signchng_cols),
        ('instpmt_imptr_piplin', SimpleImputer(strategy='median'), lst_imptr_cols),
        ('instpmt_dif_pipln', Xfrmer_dif_calc1(), lst_diff_cols),
        ('instpmt_rto_pipln', Xfrmer_rto_calc1(), lst_rto_cols),
    ],
    remainder='passthrough',
)

print('Pipeline fitting start...')
instpmt_preprcs_pipln.fit(dtprcs.X_train, dtprcs.y_train)
print('Pipeline fitting over...')

print('Pipeline transforming x_test...')
# Transform with the pipeline that was fitted above (the previously used name
# ``instpmt_partial_piplin`` is not defined in this script). The transformed
# data, including the derived columns, is the *return value*; the frames
# printed below keep their original columns.
y_pred = instpmt_preprcs_pipln.transform(dtprcs.x_test)
print('Pipeline transforming x_test over...')

print(type(dtprcs.X_train), type(dtprcs.x_test), type(dtprcs.y_train))
print(dtprcs.X_train.columns, dtprcs.x_test.columns)
print('Pipeline preprocessing pver. Seting up other classes...')
My Questions
How can I add a new column to a DataFrame from within a ColumnTransformer? I tried assigning the column both with .loc and without it. The trace below shows that the value is actually calculated, but the DataFrame is not updated with it.
The debug values are printed during the fit() but not during the transform of the test dataset.
Latest Stack Trace
Finished reading apln train/test files...
installments_payments.csv
primary name train installments_payments_train.csv
primary name test installments_payments_test.csv
Train test files ready...
finished writing train/test files.
Exiting function(0).
(16915, 8)
(4574, 8)
Processing installments_payments.csv...
Starting pipeline processing
Pipeline fitting start...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
X columns Index(['AMT_PAYMENT', 'AMT_INSTALMENT'], dtype='object')
0 6948.360
2 6948.360
3 1716.525
4 1716.525
5 3375.000
...
42390 12303.000
42391 10299.960
42392 10869.435
42402 124.155
42409 4198.950
Name: AMT_PAYMENT, Length: 16915, dtype: float64
0 6948.360
2 6948.360
3 1716.525
4 1716.525
5 3375.000
...
42390 12303.000
42391 10299.960
42392 14958.135
42402 124.155
42409 4198.950
Name: AMT_INSTALMENT, Length: 16915, dtype: float64
0 0.0
2 0.0
3 0.0
4 0.0
5 0.0
...
42390 0.0
42391 0.0
42392 -4088.7
42402 0.0
42409 0.0
Name: AMT_PMT_DIF, Length: 16915, dtype: float64
ratio caclulator
Pipeline fitting over...
Pipeline transforming x_test...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
ratio caclulator
**Pipeline transforming x_test over...**
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object') Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object')
Pipeline preprocessing pver. Seting up other classes...
Exiting main function...
E:\anaconda\envs\appliedaicourse\lib\site-packages\ipykernel_launcher.py:187: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:362: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[key] = _infer_fill_value(value)
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:562: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[item_labels[indexer[info_axis]]] = value
回答1:
As I said in the comments, I first extract the features I need to learn from (via .fit) using:
from sklearn.base import TransformerMixin
class FeatureExtractor(TransformerMixin):
    """Select a fixed set of columns from a pandas DataFrame."""

    def __init__(self, cols):
        # Echo the requested columns, then remember them for transform().
        print(cols)
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` (expected to be a DataFrame)."""
        return X.loc[:, self.cols]
Then use this class to learn from one of the columns from the data:
class SynopsisNumWords(TransformerMixin):
    """Derive a word-count column from the ``'Synopsis'`` column."""

    def __init__(self):
        # No configuration is needed.
        pass

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return a one-column DataFrame named ``'Synopsis_num_words'``.

        Each value is the number of whitespace-separated tokens in the string
        form of the corresponding ``'Synopsis'`` entry.
        """

        def count_words(text):
            return len(str(text).split())

        frame = X.copy()
        synopsis = frame.loc[:, 'Synopsis']
        word_counts = synopsis.apply(count_words)
        # Rename so the result does not share the input column's name.
        renamed = word_counts.rename('Synopsis_num_words')
        return renamed.to_frame()
Then union all the features to make a single dataframe using this:
class DFFeatureUnion(TransformerMixin):
    """FeatureUnion-like combiner that keeps pandas DataFrames.

    Every transformer in ``transformer_list`` is applied to the same input
    and the resulting frames are merged on their index.
    """

    def __init__(self, transformer_list):
        # List of (name, transformer) pairs.
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        """Fit every child transformer on ``X`` (and ``y``) and return ``self``."""
        for _, transformer in self.transformer_list:
            # Forward y as well so that children that learn from the target
            # receive it instead of silently getting None.
            transformer.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with every child and merge the outputs on the index.

        ``X`` must be a DataFrame and every child must return a DataFrame.
        """
        frames = [transformer.transform(X) for _, transformer in self.transformer_list]
        return reduce(
            lambda left, right: pd.merge(left, right, left_index=True, right_index=True),
            frames,
        )
Then combine all of it into a pipeline like the one below. This pipeline takes a DataFrame with 9 columns, learns from one column, generates another column from it, then unites all of them and returns a DataFrame with 10 columns.
from sklearn.pipeline import Pipeline
# Branch 1: pass the nine original columns through unchanged.
_all_columns_branch = Pipeline(
    steps=[
        (
            'extract_all_features',
            FeatureExtractor(['Synopsis', 'Title', 'Author', 'Edition',
                              'Reviews', 'Ratings', 'Genre', 'BookCategory', 'Price']),
        ),
    ],
    verbose=True,
)

# Branch 2: pick the 'Synopsis' column and derive its word count.
_num_words_branch = Pipeline(
    steps=[
        ('extract_Synopsis_feature', FeatureExtractor(['Synopsis'])),
        ('generate_num_words', SynopsisNumWords()),
    ],
    verbose=True,
)

# Merge both branches into a single DataFrame.
_engineered_features = DFFeatureUnion([
    ('extract_all_columns', _all_columns_branch),
    ('generate_num_words_column', _num_words_branch),
])

synopsis_feat_gen_pipeline = Pipeline(
    steps=[('engineer_data', _engineered_features)],
    verbose=True,
)
来源:https://stackoverflow.com/questions/65164203/using-loc-inside-custom-transformer-produces-copy-with-slice-error