问题
I am getting this error:
ValueError: Items of feature_columns must be a _FeatureColumn. Given (type ): Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'], dtype='object').
I am using the TensorFlow library. I want to get prediction results, but I cannot run the m.train(input_fn=get_input_fn, steps=5000) call.
I always get the same error whatever I do. I also tried the following input functions, but nothing changed.
# First attempted input function: wraps the training frame and the label
# column in constant tensors and returns them as a pair.
# NOTE(review): the trailing comma on the `x=` line makes x a one-element
# tuple rather than a tensor.
# NOTE(review): df_train still contains the LABEL column at this point, so the
# label is also part of x.
def input_fn_train():
x=tf.constant(df_train.astype(np.float64)),
y=tf.constant(df_train[LABEL].astype(np.float64))
return x, y
and
# Second attempted input function: wraps a DataFrame with
# tf.estimator.inputs.pandas_input_fn.
#   data_set:   DataFrame providing both the feature columns and the LABEL column.
#   num_epochs: forwarded unchanged to pandas_input_fn.
#   shuffle:    forwarded unchanged to pandas_input_fn.
# Returns whatever pandas_input_fn returns.
# NOTE(review): x is rebuilt from every column of data_set, so the LABEL column
# is included among the features as well as being passed as y.
def get_input_fn(data_set, num_epochs=None, shuffle=False):
return tf.estimator.inputs.pandas_input_fn(
x=pd.DataFrame({k: data_set[k].values for k in data_set.columns}),
y=pd.Series(data_set[LABEL].values), num_epochs=num_epochs,
shuffle=shuffle)
I cannot understand what I should do. What is the error about? I have been googling but have not found anything useful. How can I handle this error? The code is below. Thanks!
import pandas as pd
import tensorflow as tf
import numpy as np
import tempfile
# Column names; presumably the full header of Churn_Modelling.csv (COLS is not
# referenced again in the visible code).
COLS= ["RowNumber","CustomerId","Surname","CreditScore","Geography",
"Gender","Age","Tenure","Balance","NumOfProducts","HasCrCard",
"IsActiveMember","EstimatedSalary","Exited"]
# Columns selected as model inputs by get_input_fn below.
FEATURES = ["CreditScore","Age","Tenure","Balance","NumOfProducts",
"HasCrCard","IsActiveMember", "EstimatedSalary"]
# Name of the target column.
LABEL="Exited"
# NOTE(review): the training and test frames are both read from the same file.
df_train = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True,
header=0)
df_test = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True,
header=0)
test_label = df_test[LABEL].astype(float)
# NOTE(review): "Geography" and "Gender" are dropped from df_train but are
# left in df_test, so the two frames end up with different columns.
df_test.drop("Surname", axis = 1, inplace=True)
df_test.drop("RowNumber", axis = 1, inplace=True)
df_test.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("Surname", axis = 1, inplace=True)
df_train.drop("RowNumber", axis = 1, inplace=True)
df_train.drop("Geography", axis = 1, inplace=True)
df_train.drop("Gender", axis = 1, inplace=True)
# Builds constant tensors from df_train and returns them in a dict with the
# keys 'x' and 'y'.
# NOTE(review): an Estimator input_fn is normally expected to return a
# (features, labels) pair rather than a single dict — confirm against the
# tf.estimator documentation.
# NOTE(review): df_train.shape (rows x all remaining columns) is passed as the
# shape for both the FEATURES matrix and the single LABEL column — verify that
# these shapes actually match the values.
# NOTE(review): this definition takes no parameters, yet it is called further
# down with (df_test, num_epochs=..., shuffle=...).
def get_input_fn():
return {'x': tf.constant(df_train[FEATURES].as_matrix(), tf.float32,
df_train.shape),
'y': tf.constant(df_train[LABEL].as_matrix(), tf.float32,
df_train.shape)
}
# Keep only the non-object columns of the training frame.
df=df_train.select_dtypes(exclude=['object'])
# numeric_cols is a pandas Index of column names, not a list of feature-column
# objects; it is the Index value that the ValueError quoted above reports.
numeric_cols=df.columns
# NOTE(review): model_dir is not defined anywhere in the visible code.
m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=
[numeric_cols])
# The function object itself is passed here (it is not called).
m.train(input_fn=get_input_fn ,steps=5000)
# NOTE(review): get_input_fn as defined directly above accepts no arguments,
# but the next two statements call it with three.
results = m.evaluate(input_fn= get_input_fn(df_test, num_epochs=1,
shuffle=False),steps=None)
y = m.predict(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False))
pred = list(y)
rowNumber=0
# NOTE(review): i is an element of pred, yet it is used as an index into pred.
for i in pred:
print(str(rowNumber)+': '+str(pred[i]))
rowNumber=rowNumber+1
回答1:
Your first mistake is how you create tf.estimator.LinearClassifier. You're passing the dataframe index df.columns into feature_columns, but you should pass a list of TensorFlow feature columns. Each column should define whether it is numerical or categorical and, in the latter case, the encoding type.
Secondly, the input function can be simplified a lot, since you're reading a pandas dataframe. Just use tf.estimator.inputs.pandas_input_fn.
Your .csv is most likely different; I've made a dummy one with some values. So here's a way to read the input and fit the model correctly:
import pandas as pd
import tensorflow as tf
# Names given to the CSV columns when the file is read; "Exited" is the label.
FEATURES = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
    "Exited",
]

# Real-valued inputs, one numeric column each.
(credit_score, age, tenure, balance,
 num_of_products, estimated_salary) = (
    tf.feature_column.numeric_column(key)
    for key in ("CreditScore", "Age", "Tenure", "Balance",
                "NumOfProducts", "EstimatedSalary")
)

# Flag inputs, read as strings and mapped through a two-word vocabulary.
has_card, is_active_member = (
    tf.feature_column.categorical_column_with_vocabulary_list(
        key, ["True", "False"])
    for key in ("HasCrCard", "IsActiveMember")
)

# Order in which the columns are handed to the estimator.
feature_columns = [
    credit_score,
    age,
    tenure,
    balance,
    num_of_products,
    has_card,
    is_active_member,
    estimated_salary,
]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    """Build a pandas-backed input function for the churn data.

    Reads ``Churn_Modelling.csv`` using the names in ``FEATURES``, removes
    rows that contain NaN, and separates the ``"Exited"`` label from the
    feature columns so the label is not also fed to the model as an input.

    Args:
        num_epochs: Forwarded to ``pandas_input_fn``.
        shuffle: Forwarded to ``pandas_input_fn``.
        batch_size: Forwarded to ``pandas_input_fn``.

    Returns:
        The input function created by ``tf.estimator.inputs.pandas_input_fn``.
    """
    df = pd.read_csv(
        'Churn_Modelling.csv',
        names=FEATURES,
        # The two flag columns are matched against a string vocabulary,
        # so they are read as strings.
        dtype={'HasCrCard': str, 'IsActiveMember': str},
        skipinitialspace=True,
        header=0,
    )
    df = df.dropna(how='any', axis=0)  # remove NaN elements
    labels = df["Exited"]
    # Keep the label out of the feature frame; it is supplied through `y`.
    features = df.drop("Exited", axis=1)
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5,
    )
# Linear classifier over the feature columns defined above; no model directory
# is specified.
model = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    model_dir=None,
)

# Build the input function with its default arguments and train for 100 steps.
train_input_fn = input_fn()
model.train(input_fn=train_input_fn, steps=100)
回答2:
The following code works correctly:
import pandas as pd
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
def split_data(data, rate, label):
    """Split a DataFrame into train/test features and labels.

    Rows containing NaN are removed before splitting.

    Args:
        data: DataFrame holding the feature columns and the label column.
        rate: Passed to ``train_test_split`` as ``test_size``.
        label: Name of the label column.

    Returns:
        A tuple ``(train_data, train_label, test_data, test_label)`` where
        the two data frames no longer contain the label column.
    """
    data = data.dropna()
    train_data, test_data = train_test_split(data, test_size=rate)
    train_label = train_data[label]
    # axis is passed by keyword: DataFrame.drop no longer accepts it
    # positionally in recent pandas releases.
    train_data = train_data.drop(label, axis=1)
    test_label = test_data[label]
    test_data = test_data.drop(label, axis=1)
    return train_data, train_label, test_data, test_label
# Name of the target column.
LABEL = "Exited"

# Load the table; the first line of the file is used as the header.
data = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)

# Remove these columns, one at a time and in place, before splitting.
for _dropped in ("Surname", "RowNumber", "CustomerId", "Geography", "Gender"):
    data.drop(_dropped, axis=1, inplace=True)

# 20% of the rows go to the test split.
x_train, y_train, x_test, y_test = split_data(data, 0.20, LABEL)
def get_input_fn_train():
    """Return an unshuffled pandas input function over (x_train, y_train)."""
    return tf.estimator.inputs.pandas_input_fn(
        x=x_train, y=y_train, shuffle=False)
def get_input_fn_test():
    """Return an unshuffled pandas input function over (x_test, y_test)."""
    return tf.estimator.inputs.pandas_input_fn(
        x=x_test, y=y_test, shuffle=False)
# Infer the feature columns from the training input function. The call is
# wrapped inside the parentheses so that it stays a single statement; split
# over two bare lines, `feature_columns` would be bound to the function object
# itself and the call would never happen.
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
    get_input_fn_train())

# Checkpoints are written to a fresh temporary directory.
model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(model_dir=model_dir,
                                  feature_columns=feature_columns)

# train data
m.train(input_fn=get_input_fn_train(), steps=5000)

# you can get accuracy, accuracy_baseline, auc, auc_precision_recall,
# average_loss, global_step, label/mean, loss, prediction/mean
results = m.evaluate(input_fn=get_input_fn_test(), steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

# get prediction results
y = m.predict(input_fn=get_input_fn_test())
predictions = list(y)
pred1 = pd.DataFrame(data=predictions)
prediction = pd.DataFrame(data=pred1['class_ids'])
# Each 'class_ids' entry is a sequence; keep its first element.
pred = [row[0] for row in prediction["class_ids"]]

rowNumber = 0
for i in pred:
    print(str(rowNumber) + ': ' + str(i))
    rowNumber = rowNumber + 1
def calculate(prediction, LABEL):
    """Compute classification metrics for predictions against true labels.

    The scikit-learn metric functions take the ground truth first and the
    predictions second, so ``LABEL`` is passed before ``prediction``; the
    reverse order swaps precision and recall and transposes the confusion
    matrix.

    Args:
        prediction: Predicted class ids.
        LABEL: True class ids, aligned with ``prediction``.

    Returns:
        A dict with the keys "accuracy", "report", "Confusion_Matrix",
        "F1 score", "Recall Score" and "cohen_kappa".
    """
    arr = {
        "accuracy": accuracy_score(LABEL, prediction),
        "report": classification_report(LABEL, prediction),
        "Confusion_Matrix": confusion_matrix(LABEL, prediction),
        "F1 score": f1_score(LABEL, prediction),
        "Recall Score": recall_score(LABEL, prediction),
        "cohen_kappa": cohen_kappa_score(LABEL, prediction),
    }
    return arr
# Wrap the predicted class ids in a DataFrame, round them, and print the
# metrics computed against the held-out labels.
pred2 = pd.DataFrame(data=pred)
metrics = calculate(pred2.round(), y_test)
print(metrics)
回答3:
I'm going to make some small changes to @Maxim's answer (thanks, btw) and post a minimum working example with random numpy data. This seems to run fine on my windows machine. Note the suppressed warning due to my particular hardware.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import tensorflow as tf
# Column names of the generated DataFrame; "Exited" is used as the label.
FEATURES = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
    "Exited",
]

# One numeric feature column per real-valued input.
(credit_score, age, tenure, balance,
 num_of_products, estimated_salary) = (
    tf.feature_column.numeric_column(key)
    for key in ("CreditScore", "Age", "Tenure", "Balance",
                "NumOfProducts", "EstimatedSalary")
)

# Order in which the columns are handed to the estimator.
feature_columns = [
    credit_score,
    age,
    tenure,
    balance,
    num_of_products,
    estimated_salary,
]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    """Build a pandas-backed input function over random data.

    Generates a DataFrame of uniform random values with one column per name
    in ``FEATURES`` and uses the ``"Exited"`` column as the label. The label
    column is removed from the feature frame before it is handed over.

    Args:
        num_epochs: Forwarded to ``pandas_input_fn``.
        shuffle: Forwarded to ``pandas_input_fn``.
        batch_size: Forwarded to ``pandas_input_fn``.

    Returns:
        The input function created by ``tf.estimator.inputs.pandas_input_fn``.
    """
    N_features = len(FEATURES)
    print(N_features)
    N_examples = 5000
    X_train = np.random.rand(N_examples, N_features)
    df = pd.DataFrame(data=X_train, columns=FEATURES)
    # NOTE(review): the labels are uniform random floats in [0, 1), not 0/1
    # class ids — adequate for a smoke test only; confirm before reusing.
    labels = df["Exited"]
    # Keep the label out of the feature frame; it is supplied through `y`.
    features = df.drop("Exited", axis=1)
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5,
    )
# Linear classifier that stores its checkpoints in ./model_dir.
model = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    model_dir='model_dir',
)

# Build the input function with its default arguments and train for 100 steps.
train_input_fn = input_fn()
model.train(input_fn=train_input_fn, steps=100)
来源:https://stackoverflow.com/questions/47197989/items-of-feature-columns-must-be-a-featurecolumn