I am relatively new to machine learning as well as TensorFlow. I would like to train on the data so that predictions with 2 targets and multiple classes could be made. Is this something that can be done?
You could use a multi-headed DNNEstimator model. This treats Flow and Visibility as two separate softmax classification targets, each with its own set of classes. I had to modify the load_csv_without_header helper function to support multiple targets (it could be cleaner, but that is not the point here; feel free to ignore its details).
import csv
import collections

import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile

num_flow_classes = 4
num_visib_classes = 7

Dataset = collections.namedtuple('Dataset', ['data', 'target'])

def load_csv_without_header(fn, target_dtype, features_dtype, target_columns):
    with gfile.Open(fn) as csv_file:
        data_file = csv.reader(csv_file)
        data = []
        targets = {
            target_col_name: []
            for target_col_name in target_columns.keys()
        }
        for row in data_file:
            # Pop target columns in descending index order, so that removing
            # one column does not shift the indices of the remaining ones.
            cols = sorted(target_columns.items(), key=lambda tup: tup[1], reverse=True)
            for target_col_name, target_col_i in cols:
                targets[target_col_name].append(row.pop(target_col_i))
            data.append(np.asarray(row, dtype=features_dtype))
    targets = {
        target_col_name: np.array(val, dtype=target_dtype)
        for target_col_name, val in targets.items()
    }
    data = np.array(data)
    return Dataset(data=data, target=targets)
# One 2-dimensional feature column covering both input features
# (DayOfYear and Temperature).
feature_columns = [
    tf.contrib.layers.real_valued_column("", dimension=2),
]
# One softmax classification head per target; each head reads its labels
# from the corresponding key of the labels dict returned by input_fn.
head = tf.contrib.learn.multi_head([
    tf.contrib.learn.multi_class_head(
        num_flow_classes, label_name="Flow", head_name="Flow"),
    tf.contrib.learn.multi_class_head(
        num_visib_classes, label_name="Visibility", head_name="Visibility"),
])

classifier = tf.contrib.learn.DNNEstimator(
    feature_columns=feature_columns,
    hidden_units=[10, 20, 10],
    model_dir="iris_model",
    head=head,
)
def get_input_fn(filename):
    def input_fn():
        dataset = load_csv_without_header(
            fn=filename,
            target_dtype=np.int,
            features_dtype=np.int,
            target_columns={"Flow": 2, "Visibility": 3}
        )
        x = tf.constant(dataset.data)
        # The labels are a dict keyed by each head's label_name.
        y = {k: tf.constant(v) for k, v in dataset.target.items()}
        return x, y
    return input_fn

classifier.fit(input_fn=get_input_fn("tmp_train.csv"), steps=4000)
res = classifier.evaluate(input_fn=get_input_fn("tmp_test.csv"), steps=1)
print("Validation:", res)
If you keep your CSV data separated by commas, and keep all the classes a row might have in the last column (separated by some token such as a space), you can use the following code:
import numpy as np
import tensorflow as tf

all_classes = ["0", "1", "2", "3", "4", "5", "6"]

def k_hot(classes_col, all_classes, delimiter=' '):
    # Maps each class string to its index in all_classes.
    table = tf.contrib.lookup.index_table_from_tensor(
        mapping=tf.constant(all_classes)
    )
    # Splitting yields a SparseTensor of class tokens, one row per input row.
    classes = tf.string_split(classes_col, delimiter)
    ids = table.lookup(classes)
    num_items = tf.cast(tf.shape(ids)[0], tf.int64)  # number of rows, as int64
    num_entries = tf.shape(ids.indices)[0]  # total number of class tokens
    # Indicator matrix: entry (i, j) is 1 iff class j appears in row i.
    y = tf.SparseTensor(
        indices=tf.stack([ids.indices[:, 0], ids.values], axis=1),
        values=tf.ones(shape=(num_entries,), dtype=tf.int32),
        dense_shape=(num_items, len(all_classes)),
    )
    y = tf.sparse_tensor_to_dense(y, validate_indices=False)
    return y
def feature_engineering_fn(features, labels):
    # Convert the raw label strings into a k-hot matrix before they reach the head.
    labels = k_hot(labels, all_classes)
    return features, labels

# One 2-dimensional feature column covering DayOfYear and Temperature.
feature_columns = [
    tf.contrib.layers.real_valued_column("", dimension=2),
]

classifier = tf.contrib.learn.DNNEstimator(
    feature_columns=feature_columns,
    hidden_units=[10, 20, 10],
    model_dir="iris_model",
    head=tf.contrib.learn.multi_label_head(n_classes=len(all_classes)),
    feature_engineering_fn=feature_engineering_fn,
)
def get_input_fn(filename):
    def input_fn():
        dataset = tf.contrib.learn.datasets.base.load_csv_without_header(
            filename=filename,
            target_dtype="S100",  # strings of up to 100 characters
            features_dtype=np.int,
            target_column=-1
        )
        x = tf.constant(dataset.data)
        y = tf.constant(dataset.target)
        return x, y
    return input_fn

classifier.fit(input_fn=get_input_fn("tmp_train.csv"), steps=4000)
res = classifier.evaluate(input_fn=get_input_fn("tmp_test.csv"), steps=1)
print("Validation:", res)
We are using DNNEstimator with a multi_label_head, which uses sigmoid cross-entropy rather than softmax cross-entropy as the loss function. This means that each of the output units/logits is passed through the sigmoid function, which gives the likelihood of the data point belonging to that class, i.e. the classes are computed independently and are not mutually exclusive as they are with softmax cross-entropy. This means that you could have anywhere between 0 and len(all_classes) classes set for each row in the training set and in the final predictions.
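To make the difference concrete, here is a small numeric sketch (the logits are made up for illustration, not produced by the model above):

import numpy as np

logits = np.array([2.0, -1.0, 0.5])  # hypothetical logits for 3 classes

sigmoid = 1 / (1 + np.exp(-logits))                # independent per class
softmax = np.exp(logits) / np.sum(np.exp(logits))  # classes compete for mass

print(sigmoid)  # ~[0.88 0.27 0.62], each score lives in (0, 1) on its own
print(softmax)  # ~[0.79 0.04 0.18], scores sum to 1

Thresholding the sigmoid scores (e.g. at 0.5) can therefore select zero, one, or several classes per row, which is exactly the multi-label behavior described above.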
Also notice that the classes are represented as strings (and k_hot makes the conversion to token indices), so that you could use arbitrary class identifiers such as category UUIDs in e-commerce settings. If the categories in the 3rd and 4th columns are different (Flow ID 1 != Visibility ID 1), you could prepend the column name to each class ID, e.g.

316,8,flow1 visibility4
285,-1,flow1 visibility4
326,8,flow2 visibility5
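In that case, all_classes would list the prefixed identifiers instead, e.g. (a hypothetical set matching the three rows above):

all_classes = ["flow1", "flow2", "visibility4", "visibility5"]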
For a description of how k_hot works, see my other SO answer. I decided to keep k_hot as a separate function (rather than defining it directly in feature_engineering_fn) because it's a distinct piece of functionality, and TensorFlow will probably soon have a similar utility function.
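As a quick illustration of what k_hot produces, here is a minimal sketch, reusing the k_hot function defined above and run in a regular TF1 session (the input strings reuse the class tokens from the unprefixed example):

with tf.Session() as sess:
    y = k_hot(tf.constant(["1 4", "2 5"]), all_classes)
    sess.run(tf.tables_initializer())  # the lookup table must be initialized
    print(sess.run(y))
    # [[0 1 0 0 1 0 0]
    #  [0 0 1 0 0 1 0]]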
Note that if you're now using the first two columns to predict the last two columns, your accuracy will certainly go down, as the last two columns are highly correlated and using one of them gives you a lot of information about the other. Actually, your code was using only the 3rd column, which was kind of a cheat anyway if the goal is to predict the 3rd and 4th columns.