Assuming I have a bunch of summaries defined like:
loss = ...
tf.scalar_summary(\"loss\", loss)
# ...
summaries = tf.m
I had the same problem when I realized I had to iterate over my validation data, because memory was getting cramped and OOM errors were flooding in.
As several of these answers say, `tf.metrics` has this built in, but I'm not using `tf.metrics` in my project. So, inspired by that, I made this:
import tensorflow as tf
import numpy as np
def batch_persistent_mean(tensor):
    """Build ops that keep a running mean of `tensor` across session runs.

    Args:
        tensor: The tensor whose per-run values should be averaged. The
            accumulator is created as float32 with the same shape.

    Returns:
        A tuple `(output_tensor, update_op, flush_op)`:
        output_tensor evaluates to the accumulated sum divided by the number
        of updates so far (the sum divided by `eps` when nothing has been
        accumulated yet, i.e. zero right after a flush);
        update_op adds the current value of `tensor` to the sum and bumps
        the update count;
        flush_op resets both the sum and the count to zero.
    """
    # The state variables are bookkeeping only, so keep them out of the
    # trainable set that optimizers pick up by default.
    accumulator = tf.Variable(
        initial_value=tf.zeros_like(tensor), dtype=tf.float32, trainable=False)
    # One scalar count is enough because every element of the accumulator is
    # updated together; the division below broadcasts it.
    batch_nums = tf.Variable(initial_value=0.0, dtype=tf.float32, trainable=False)

    # Accumulate the value and increase the update count in one grouped op.
    accumulate_op = tf.assign_add(accumulator, tensor)
    step_batch = tf.assign_add(batch_nums, 1.0)
    update_op = tf.group(step_batch, accumulate_op)

    # Zero guard: divide by eps while the count is still zero, otherwise by
    # the count itself.
    eps = 1e-5
    output_tensor = accumulator / tf.maximum(batch_nums, eps)

    # Reset with values that match each variable's own shape.
    flush_op = tf.group(
        tf.assign(accumulator, tf.zeros_like(accumulator)),
        tf.assign(batch_nums, 0.0),
    )
    return output_tensor, update_op, flush_op
# Make a variable that we want to accumulate
X = tf.Variable(0., dtype=tf.float32)
# Make our persistent mean operations:
# Xbar is the running-mean tensor, upd the op that accumulates the current
# value of X, and flush the op that resets the accumulated state.
Xbar, upd, flush = batch_persistent_mean(X)
Now you send `Xbar` to your summary, e.g. `tf.scalar_summary("mean_of_x", Xbar)`, and where you'd do `sess.run(X)` before, you'll do `sess.run(upd)`. And between epochs you'd do `sess.run(flush)`.
### INSERT ABOVE CODE CHUNK IN S.O. ANSWER HERE ###
# Demo: exercise the running mean inside one context-managed session, which
# is closed automatically when the block exits.
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    # Calculate the mean of 0, 1, ..., 19
    for i in range(20):
        sess.run(upd, {X: i})
    print(sess.run(Xbar), "=", np.mean(np.arange(20)))
    for i in range(40):
        sess.run(upd, {X: i})
    # Now Xbar is the mean of (0, 1, ..., 19, 0, 1, ..., 39):
    print(sess.run(Xbar), "=", np.mean(np.concatenate([np.arange(20), np.arange(40)])))
    # Now flush it
    sess.run(flush)
    print("flushed. Xbar=", sess.run(Xbar))
    # After the flush only the new values count towards the mean
    for i in range(40):
        sess.run(upd, {X: i})
    print(sess.run(Xbar), "=", np.mean(np.arange(40)))
I found one solution myself. I think it's kind of hacky and I hope there is a more elegant solution.
During setup:
# Scalar float32 placeholder through which a loss value computed in Python
# is fed into the graph.
valid_loss_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
# Summary op that reports whatever value is fed into the placeholder.
valid_loss_summary = tf.scalar_summary("valid loss", valid_loss_placeholder)
Or for tensorflow versions after 0.12 (change in name for tf.scalar_summary):
# Scalar float32 placeholder through which a loss value computed in Python
# is fed into the graph.
valid_loss_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
# Same summary op as above, using the renamed tf.summary.scalar API.
valid_loss_summary = tf.summary.scalar("valid loss", valid_loss_placeholder)
Within training loop:
# Compute valid loss in python by doing sess.run() for each batch
# and averaging
valid_loss = ...
# Evaluate the summary op with the averaged value fed into the placeholder.
summary = sess.run(valid_loss_summary, {valid_loss_placeholder: valid_loss})
# Record the resulting summary against the current step.
summary_writer.add_summary(summary, step)
For quite some time I was only saving the summary once per epoch. I never knew that TensorFlow's summary would then only record the value from the last batch that was run.
Shocked, I looked into this problem. This is the solution I came up with (using the Dataset API):
# Stand-ins for the model's loss tensor and its training op.
loss = ...
train_op = ...

# Streaming mean of the loss: loss_metric reads the mean accumulated so far,
# loss_metric_update folds the current batch's loss into it.
loss_metric, loss_metric_update = tf.metrics.mean(loss)
# Summarize the accumulated mean rather than the per-batch loss.
tf.summary.scalar('loss', loss_metric)
merged = tf.summary.merge_all()

# Separate writers so the train and test curves go to their own directories.
train_writer = tf.summary.FileWriter(os.path.join(res_dir, 'train'))
test_writer = tf.summary.FileWriter(os.path.join(res_dir, 'test'))

# init_local is re-run at the start of every train/test pass to reset the
# metric; init_global is run once here.
init_local = tf.initializers.local_variables()
init_global = tf.initializers.global_variables()
sess.run(init_global)
def train_run(epoch):
    """Run one pass over the training batches and log the mean loss.

    Args:
        epoch: Step value under which the summary is written.

    Returns:
        The value of `loss_metric` evaluated after all training batches ran.
    """
    # train_init_op presumably switches the dataset to the training data
    # (mirroring test_init_op); init_local resets the metric before the pass.
    sess.run([dataset.train_init_op, init_local])
    # num_train_batches is the number of batches to run for the training set
    for _ in range(dataset.num_train_batches):
        sess.run([train_op, loss_metric_update])
    # Evaluate the summary and the metric once, after the whole pass.
    summary, cur_loss = sess.run([merged, loss_metric])
    train_writer.add_summary(summary, epoch)
    return cur_loss
def test_run(epoch):
    """Run one pass over the test batches and log the mean loss.

    Args:
        epoch: Step value under which the summary is written.

    Returns:
        The value of `loss_metric` evaluated after all test batches ran.
    """
    # test_init_op is the operation that switches to test data; init_local
    # is run alongside it before the pass starts.
    sess.run([dataset.test_init_op, init_local])
    # num_test_batches is the number of batches that should be run for the test set
    for _ in range(dataset.num_test_batches):
        sess.run(loss_metric_update)
    summary_proto, epoch_loss = sess.run([merged, loss_metric])
    test_writer.add_summary(summary_proto, epoch)
    return epoch_loss
# Alternate a training pass and a test pass per epoch; epochs are reported
# 1-based. The tuple is evaluated left to right, so training runs first.
for epoch in range(epochs):
    train_loss, test_loss = train_run(epoch + 1), test_run(epoch + 1)
    print(
        "Epoch: {0:3}, loss: (train: {1:10.10f}, test: {2:10.10f})".format(
            epoch + 1, train_loss, test_loss
        )
    )
For the summary I'm just wrapping the tensor I'm interested in with `tf.metrics.mean()`. For each batch run I call the metric's update operation. At the end of every epoch the metric tensor will return the correct mean of all batch results.
Don't forget to re-initialize the local variables every time you switch between training and test data. Otherwise your train and test metrics will be near identical.