问题
I have a pandas data frame that contains information about messages sent by user. For my model, I'm interested in predicting missing recipients of a message i,e given recipients A,B,C of a message I want to predict who else should have been part of the recipients.
I'm doing multi-label classification using OneVsRestClassifier and LinearSVC. For features, I want to use the recipients of the message. subject and body.
Since recipients is a list of users, I want to transform that column using MultiLabelBinarizer. For Subject and Body, I want to use TFIDF
My input pickle file has data as follows: All values are strings except recipients which is a set()
[[message_id,sent_time,subject,body,set(recipients),message_type, is_sender]]
I'm using feature union with custom transformers in the pipeline to achieve this as follows.
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
import pickle
import pandas as pd
import numpy as np
if __name__ == "__main__":
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X, y=None, **fit_params):
return X[self.column]
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None):
return self
def transform(self, X):
mlb = MultiLabelBinarizer()
return mlb.fit_transform(X[self.column])
pipeline = Pipeline([
('features', FeatureUnion([
('subject_tfidf', Pipeline([
('selector', ColumnSelector(column='Subject')),
('tfidf', TfidfVectorizer(min_df=0.0025, ngram_range=(1, 4)))
])),
('body_tfidf', Pipeline([
('selector', ColumnSelector(column='Body')),
('tfidf', TfidfVectorizer(min_df=0.0025, ngram_range=(1, 4)))
])),
('recipients_binarizer', Pipeline([
('multi_label', MultiLabelTransformer(column='CoRecipients'))
])),
])),
('classifier', OneVsRestClassifier(LinearSVC(), n_jobs=-1))
])
top_recips = ['A', 'B', 'C, 'D]
corpus_data = pickle.load(
open("E:\\Data\\messages_items.pkl", "rb"))
df = pd.DataFrame(corpus_data, columns=[
'MessageId', 'SentTime', 'Subject', 'Body', 'Recipients', 'MessageType', 'IsSender'])
df = df.dropna()
# add co recipients and top recipients columns
df['CoRecipients'] = df['Recipients'].apply(
lambda r: [x for x in r if x not in top_recips])
df['TopRecipients'] = df['Recipients'].apply(
lambda r: [x for x in top_recips if x in r])
# drop rows where top recipients = 0
df = df.loc[df['TopRecipients'].str.len() > 0]
df_train = df.loc[df['SentTime'] <= '2017-10-15']
df_test = df.loc[(df['SentTime'] > '2017-10-15') & (df['MessageType'] == 'Meeting')]
mlb = MultiLabelBinarizer(classes=top_recips)
train_x = df_train[['Subject', 'Body', 'CoRecipients']]
train_y = mlb.fit_transform(df_train['TopRecipients'])
test_x = df_train[['Subject', 'Body', 'CoRecipients']]
test_y = mlb.fit_transform(df_train['TopRecipients'])
print "train"
pipeline.fit(train_x, train_y)
print "predict"
predictions = pipeline.predict(test_x)
print "done"
I'm not sure if I'm doing the featurization of the CoRecipients column correctly. As the results dont look right. Any clue?
UPDATE 1
Changed the code of MLB transformer as follows:
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None):
self.mlb = MultiLabelBinarizer()
self.mlb.fit(X[self.column])
return self
def transform(self, X):
return self.mlb.transform(X[self.column])
And fixed the test set to use df_test
mlb = MultiLabelBinarizer(classes=top_recips)
train_x = df_train[['Subject', 'Body', 'CoRecipients']]
train_y = mlb.fit_transform(df_train['TopRecipients'])
test_x = df_test[['Subject', 'Body', 'CoRecipients']]
test_y = mlb.transform(df_test['TopRecipients'])
Seeing the below KeyError
Traceback (most recent call last):
File "E:\Projects\NLP\FeatureUnion.py", line 99, in <module>
predictions = pipeline.predict(test_x)
File "C:\Python27\lib\site-packages\sklearn\utils\metaestimators.py", line 115, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "C:\Python27\lib\site-packages\sklearn\pipeline.py", line 306, in predict
Xt = transform.transform(Xt)
File "C:\Python27\lib\site-packages\sklearn\pipeline.py", line 768, in transform
for name, trans, weight in self._iter())
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python27\lib\site-packages\sklearn\pipeline.py", line 571, in _transform_one
res = transformer.transform(X)
File "C:\Python27\lib\site-packages\sklearn\pipeline.py", line 426, in _transform
Xt = transform.transform(Xt)
File "E:\Projects\NLP\FeatureUnion.py", line 37, in transform
return self.mlb.transform(X[self.column])
File "C:\Python27\lib\site-packages\sklearn\preprocessing\label.py", line 765, in transform
yt = self._transform(y, class_to_index)
File "C:\Python27\lib\site-packages\sklearn\preprocessing\label.py", line 789, in _transform
indices.extend(set(class_mapping[label] for label in labels))
File "C:\Python27\lib\site-packages\sklearn\preprocessing\label.py", line 789, in <genexpr>
indices.extend(set(class_mapping[label] for label in labels))
KeyError: u'cf3024@gmail.com'
> UPDATE 2
Working code
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
def __init__(self, column, classes):
self.column = column
self.classes = classes
def fit(self, X, y=None):
self.mlb = MultiLabelBinarizer(classes=self.classes)
self.mlb.fit(X[self.column])
return self
def transform(self, X):
return self.mlb.transform(X[self.column])
# drop rows where top recipients = 0
df = df.loc[df['TopRecipients'].str.len() > 0]
df_train = df.loc[df['SentTime'] <= '2017-10-15']
df_test = df.loc[(df['SentTime'] > '2017-10-15') &
(df['MessageType'] == 'Meeting')]
mlb = MultiLabelBinarizer(classes=top_recips)
train_x = df_train[['Subject', 'Body', 'CoRecipients']]
train_y = mlb.fit_transform(df_train['TopRecipients'])
test_x = df_test[['Subject', 'Body', 'CoRecipients']]
test_y = mlb.transform(df_test['TopRecipients'])
# get all unique co-recipients
co_recips = list(set([a for b in df.CoRecipients.tolist() for a in b]))
# create pipeline
pipeline = Pipeline([
('features', FeatureUnion(
# list of features
transformer_list=[
('subject_tfidf', Pipeline([
('selector', ColumnSelector(column='Subject')),
('tfidf', TfidfVectorizer(min_df=0.0025, ngram_range=(1, 4)))
])),
('body_tfidf', Pipeline([
('selector', ColumnSelector(column='Body')),
('tfidf', TfidfVectorizer(min_df=0.0025, ngram_range=(1, 4)))
])),
('recipients_binarizer', Pipeline([
('multi_label', MultiLabelTransformer(column='CoRecipients', classes=co_recips))
]))
],
# weight components in FeatureUnion
transformer_weights={
'subject_tfidf': 3.0,
'body_tfidf': 1.0,
'recipients_binarizer': 1.0,
}
)),
('classifier', OneVsRestClassifier(LinearSVC(), n_jobs=-1))
])
print "train"
pipeline.fit(train_x, train_y)
print "predict"
predictions = pipeline.predict(test_x)
回答1:
You are doing the transforming for MultiLabelBinarizer wrong. You are fitting for both training and testing data. Thats not correct way.
You should only always fit on training data and use transform on test data.
You have done this mistake two times:
- In MultiLabelTransformer, where you transform the 'Co-recipients'
- During transforming the test_y, where you transform the 'TopRecipients'
The problem is when test data have different (or new) values in 'Co-recipients' or 'TopRecipients', the returned array will have different shape than what it had during training time. That will result in wrong results.
Change your code like this:
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
#Updated
def __init__(self, column, classes):
self.column = column
self.classes = classes
def fit(self, X, y=None):
# Updated
self.mlb = MultiLabelBinarizer(classes = self.classes)
self.mlb.fit(X[self.column])
return self
def transform(self, X):
return self.mlb.transform(X[self.column])
And
test_y = mlb.transform(df_train['TopRecipients'])
And inside the pipeline:
....
....
('multi_label', MultiLabelTransformer(column='CoRecipients',
classes=set([a for b in df.CoRecipients.tolist() for a in b]))
....
....
Although the last change in test_y
will not affect the returned array because you have specified the classes using top_recips
during mlb = MultiLabelBinarizer(classes=top_recips)
, but its still better to do only transform (and never fit or fit_transform) on the test data.
来源:https://stackoverflow.com/questions/47564903/how-to-transform-multiple-features-in-a-pipeline-using-featureunion