Using .loc inside custom transformer produces copy with slice error

喜欢而已 提交于 2020-12-15 06:16:41

问题


EDIT: the question remains the same but the code has changed.

I am working on the home credit dataset on Kaggle and specifically on instalment_payment.csv. Following are my custom transformers

class Xfrmer_replace1(BaseEstimator, TransformerMixin):
    """
        this transformer does the global repplace within the dataframe
        replace 365243 spcific to this case study with 0
        replace +/-inf , nan with zero
    """
    # constructor
    def __init__(self):
        #we are not going to use this 
        self._features = None
        
    #Return self 
    def fit(self, X,y=None  ):
        return self
    
    def transform(self,X,y=None):        
        #replace high values with zero
        for col in X.columns:
            X=X.replace([365243,365243.0],0)
            print('replaced values')
        #X=X.replace([np.inf,-np.inf],np.nan)
        #X=X.replace(np.nan,0)    
    
        return X

class Xfrmer_signchng1(BaseEstimator, TransformerMixin):
    """
        this transformer does the change for positive to negative 
    """
    # constructor
    def __init__(self):
        #we are not going to use this         
        self.signchng_columns = None
        
    #Return self 
    def fit(self,X,y=None  ):
        return self
    
    def transform(self,X,y=None):        
        #change the sign of the columns
        for col in X.columns:
            print('sign change')
            X[col]= [0  if val >= 0 else (val *-1) for val in X[col] ]
    
        return X  

class Xfrmer_dif_calc1(BaseEstimator, TransformerMixin):
    """
        this transformer does the difference bewteen the two columns
        the i/p is a list of tuples
        the second item in the tuple is divided from the first item
        the third item in the tuple is the name of this new column    
    """
    # constructor
    def __init__(self):
        #we are not going to use this         
        self.dif_columns = None
        
    #Return self 
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        print('diff caclulator')
        print('X columns', X.columns)
        print(X[X.columns[0]])
        print(X[X.columns[1]])
        #print(X['AMT_PAYMENT'])
        #print(X['AMT_INSTALMENT'])
        #print(X[X.columns[0]] - X[X.columns[1]])
        #iter1.X.loc[:,'AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
        X['AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
        print(X['AMT_PMT_DIF'])
        return X   

    
class Xfrmer_rto_calc1(BaseEstimator, TransformerMixin):
    """
        this transformer calculates the ratio between two columns
        the i/p is a list of tuples
        the first item in the tuple is divided from the second item
        the third item in the tuple is the name of this new column
    """
    # constructor
    def __init__(self):
        #we are not going to use this 
        self.rto_columns = None
        
    #Return self 
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):        
        print('ratio caclulator')
        #iter1.X.loc[:,'AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)
        X['AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)
    
        return X  

This is how I am consuming my pipelines

lst_all_cols = dtprcs.X_train.columns.values.tolist()
lst_signchng_cols = ["DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]            
lst_imptr_cols=['DAYS_ENTRY_PAYMENT','AMT_PAYMENT']
lst_diff_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"]            
lst_rto_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"] 
print('Starting pipeline processing')        
#"""

instpmt_preprcs_pipln = ColumnTransformer( transformers = [
                                        ( 'instpmt_repl_pipln', Xfrmer_replace1(),lst_all_cols ),
                                        ( 'instpmt_sgnchng_pipln', Xfrmer_signchng1(),lst_signchng_cols ),
                                        ( 'instpmt_imptr_piplin',SimpleImputer(strategy = 'median'),lst_imptr_cols ),
                                        ('instpmt_dif_pipln',Xfrmer_dif_calc1(), lst_diff_cols),
                                        ('instpmt_rto_pipln',Xfrmer_rto_calc1(),lst_rto_cols)],
                                        remainder='passthrough')
print('Pipeline fitting start...')
instpmt_preprcs_pipln.fit( dtprcs.X_train, dtprcs.y_train )
print('Pipeline fitting over...')
#print(dtprcs.X_train.shape,dtprcs.x_test.shape)
#print(dtprcs.X_train.columns,dtprcs.x_test.columns)
#Can predict with it like any other pipeline
print('Pipeline transforming x_test...')

y_pred = instpmt_partial_piplin.transform( dtprcs.x_test ) 
print('Pipeline transforming x_test over...')
print(type(dtprcs.X_train),type(dtprcs.x_test),type(dtprcs.y_train))
print(dtprcs.X_train.columns,dtprcs.x_test.columns)
print('Pipeline preprocessing pver. Seting up other classes...')

My Questions

  1. How to add a new column to a data frame within a columntransformer ? I tried using .loc and without .loc. From the trace below we find that the value is actually being calculated but not getting updated into the dataframe

  2. The debug values are printed during the fit() but not during the transform of the test dataset.

Latest Stack Trace

Finished reading apln train/test files...
installments_payments.csv
primary name train installments_payments_train.csv
primary name test installments_payments_test.csv
Train test files ready...
finished writing train/test files.
Exiting function(0).
(16915, 8)
(4574, 8)
Processing installments_payments.csv...
Starting pipeline processing
Pipeline fitting start...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
X columns Index(['AMT_PAYMENT', 'AMT_INSTALMENT'], dtype='object')
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
           ...    
42390    12303.000
42391    10299.960
42392    10869.435
42402      124.155
42409     4198.950
Name: AMT_PAYMENT, Length: 16915, dtype: float64
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
           ...    
42390    12303.000
42391    10299.960
42392    14958.135
42402      124.155
42409     4198.950
Name: AMT_INSTALMENT, Length: 16915, dtype: float64
0           0.0
2           0.0
3           0.0
4           0.0
5           0.0
          ...  
42390       0.0
42391       0.0
42392   -4088.7
42402       0.0
42409       0.0
Name: AMT_PMT_DIF, Length: 16915, dtype: float64
ratio caclulator
Pipeline fitting over...
Pipeline transforming x_test...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
ratio caclulator

**Pipeline transforming x_test over...**
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object') Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object')
Pipeline preprocessing pver. Seting up other classes...
Exiting main function...
E:\anaconda\envs\appliedaicourse\lib\site-packages\ipykernel_launcher.py:187: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:362: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
E:\anaconda\envs\appliedaicourse\lib\site-packages\pandas\core\indexing.py:562: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value

回答1:


Like i said in comment, I first extract the feature I need to learn from(.fit) using:

from sklearn.base import TransformerMixin

class FeatureExtractor(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        print(self.cols)
    
    def fit(self, X, y=None):
        # stateless transformer
        return self
    
    def transform(self, X):
        # assumes X is Pandas Dataframe
        X_cols = X.loc[:, self.cols]
        return X_cols

Then use this class to learn from one of the columns from the data:

class SynopsisNumWords(TransformerMixin):
    def __init__(self):
        return None
        # self.text_array = text_array
    
    def fit(self,  X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        X = X.copy()
        # # rename the series to not have the same column name as input
        return X.loc[:,'Synopsis'].apply(lambda x: len(str(x).split())).rename('Synopsis_num_words').to_frame()

Then union all the features to make a single dataframe using this:

class DFFeatureUnion(TransformerMixin):
    # FeatureUnion but for pandas DataFrames

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        for (name, t) in self.transformer_list:
            t.fit(X)
        return self

    def transform(self, X):
        # X must be a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
        return Xunion

Then unite all of it and make a pipeline like below. This pipeline takes a dataframe of 9 columns, learns from a column, generates another column from it, then unite all of them and return the dataframe with 10 columns.

from sklearn.pipeline import Pipeline
synopsis_feat_gen_pipeline = Pipeline(steps=[('engineer_data',
                                        DFFeatureUnion([
                                                     ('extract_all_columns',
                                                      Pipeline(steps=[
                                                                      ('extract_all_features',
                                                                       FeatureExtractor(['Synopsis', 'Title', 'Author', 'Edition',
                                                                                         'Reviews', 'Ratings', 'Genre', 'BookCategory', 'Price'])
                                                                       )
                                                                      ], verbose=True
                                                               )
                                                     ),
                                                     ('generate_num_words_column',
                                                      Pipeline(steps=[
                                                                      ('extract_Synopsis_feature', FeatureExtractor(['Synopsis'])),
                                                                      ('generate_num_words', SynopsisNumWords())
                                                                      ], verbose=True
                                                               )
                                                      ),
                                                     ]))
                                     ],
                              verbose=True)


来源:https://stackoverflow.com/questions/65164203/using-loc-inside-custom-transformer-produces-copy-with-slice-error

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!