Question
I had been training a bidirectional LSTM RNN for nearly 24 hours, and because the error was oscillating I decided to decrease the learning rate before letting it continue training. Since the model is saved with Saver.save(sess, file) at every epoch, I terminated training when the CTC loss had come down to approximately 115.
Now, after restoring the model, the initial error I am getting is around 162, which is inconsistent with the trend I was seeing in the 7th epoch and is roughly what I got in the first epoch. So my impression is that either the "restore" function is not working, or, if it is working, something else is preventing it from taking effect.
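One quick way to check whether restore is actually loading the weights (a minimal sketch, not part of my training script; it reuses the graph, saver, config and checkpoint_dir objects from the code below) is to compare a variable's value from a fresh initialization against its value after the restore:

with graph.as_default():
    with tf.Session(config=config) as sess:
        sess.run(tf.initialize_all_variables())
        var = tf.trainable_variables()[0]   # any trainable variable will do
        before = sess.run(var)
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
        after = sess.run(var)
        # If restore really loaded the checkpoint, the restored value should
        # differ from the freshly initialized one.
        print("values changed after restore:", not np.allclose(before, after))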
Here is my code:
import sys
import time
import numpy as np
import tensorflow as tf

# Hyperparameters (batch_size, frame_length, num_units, hidden_size, n_layers,
# n_chars, use_peephole, max_grad_norm, lr, momentum, epochs, checkpoint_dir,
# results_fn, config), the seq_bn() helper and the data_train/data_val loaders
# are defined elsewhere in the script.

graph = tf.Graph()
with graph.as_default():
    # Graph creation
    graph_start = time.time()
    seq_inputs = tf.placeholder(tf.float32, shape=[None, batch_size, frame_length], name="sequence_inputs")
    seq_lens = tf.placeholder(shape=[batch_size], dtype=tf.int32)
    seq_inputs = seq_bn(seq_inputs, seq_lens)
    initializer = tf.truncated_normal_initializer(mean=0, stddev=0.1)
    forward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
                                      num_proj=hidden_size,
                                      use_peepholes=use_peephole,
                                      initializer=initializer,
                                      state_is_tuple=True)
    forward = tf.nn.rnn_cell.MultiRNNCell([forward] * n_layers, state_is_tuple=True)
    backward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
                                       num_proj=hidden_size,
                                       use_peepholes=use_peephole,
                                       initializer=initializer,
                                       state_is_tuple=True)
    backward = tf.nn.rnn_cell.MultiRNNCell([backward] * n_layers, state_is_tuple=True)
    [fw_out, bw_out], _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward, cell_bw=backward, inputs=seq_inputs,
                                                          time_major=True, dtype=tf.float32,
                                                          sequence_length=tf.cast(seq_lens, tf.int64))
    # Batch normalize forward output
    mew, var_ = tf.nn.moments(fw_out, axes=[0])
    fw_out = tf.nn.batch_normalization(fw_out, mew, var_, 0.1, 1, 1e-6)
    # fw_out = seq_bn(fw_out, seq_lens)
    # Batch normalize backward output
    mew, var_ = tf.nn.moments(bw_out, axes=[0])
    bw_out = tf.nn.batch_normalization(bw_out, mew, var_, 0.1, 1, 1e-6)
    # bw_out = seq_bn(bw_out, seq_lens)
    # Reshape forward and backward outputs for the affine transformation
    fw_out = tf.reshape(fw_out, [-1, hidden_size])
    bw_out = tf.reshape(bw_out, [-1, hidden_size])
    # Linear layer params
    W_fw = tf.Variable(tf.truncated_normal(shape=[hidden_size, n_chars], stddev=np.sqrt(2.0 / hidden_size)))
    W_bw = tf.Variable(tf.truncated_normal(shape=[hidden_size, n_chars], stddev=np.sqrt(2.0 / hidden_size)))
    b_out = tf.constant(0.1, shape=[n_chars])
    # Perform an affine transformation
    logits = tf.add(tf.add(tf.matmul(fw_out, W_fw), tf.matmul(bw_out, W_bw)), b_out)
    logits = tf.reshape(logits, [-1, batch_size, n_chars])
    # Use the CTC beam search decoder to decode the predicted string from the probability map
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_lens)
    # Target params
    indices = tf.placeholder(dtype=tf.int64, shape=[None, 2])
    values = tf.placeholder(dtype=tf.int32, shape=[None])
    shape = tf.placeholder(dtype=tf.int64, shape=[2])
    # Make targets
    targets = tf.SparseTensor(indices, values, shape)
    # Compute loss
    loss = tf.reduce_mean(tf.nn.ctc_loss(logits, targets, seq_lens))
    # Compute error rate based on edit distance
    predicted = tf.to_int32(decoded[0])
    error_rate = tf.reduce_sum(tf.edit_distance(predicted, targets, normalize=False)) / \
        tf.to_float(tf.size(targets.values))
    tvars = tf.trainable_variables()
    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), max_grad_norm)
    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=momentum)
    train_step = optimizer.apply_gradients(zip(grad, tvars))
    graph_end = time.time()
    print("Time elapsed for creating graph: %.3f" % (round(graph_end - graph_start, 3)))
    # steps per epoch
    start_time = 0
    steps = int(np.ceil(len(data_train.files) / batch_size))
    loss_tr = []
    log_tr = []
    loss_vl = []
    log_vl = []
    err_tr = []
    err_vl = []
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        # sess.run(tf.initialize_all_variables())
        checkpt_path = tf.train.latest_checkpoint(checkpoint_dir)
        print(saver.restore(sess, checkpt_path))
        print("Model restored from 7th epoch, 188th step")
        feed = None
        epoch = None
        step = None
        try:
            for epoch in range(7, epochs + 1):
                if epoch == 7:
                    initial_step = 189
                else:
                    initial_step = 0
                transcript = []
                loss_val = 0
                l_pr = 0
                start_time = time.time()
                for step in range(initial_step, steps):
                    train_data, transcript, \
                        targ_indices, targ_values, \
                        targ_shape, n_frames = data_train.next_batch()
                    n_frames = np.reshape(n_frames, [-1])
                    feed = {seq_inputs: train_data, indices: targ_indices, values: targ_values,
                            shape: targ_shape, seq_lens: n_frames}
                    del train_data, targ_indices, targ_values, targ_shape, n_frames
                    # Evaluate loss value, decoded transcript, and log probability
                    _, loss_val, deco, l_pr, err_rt_tr = sess.run([train_step, loss, decoded, log_prob, error_rate],
                                                                  feed_dict=feed)
                    del feed
                    loss_tr.append(loss_val)
                    log_tr.append(l_pr)
                    err_tr.append(err_rt_tr)
                    # On validation set
                    val_data, val_transcript, \
                        targ_indices, targ_values, \
                        targ_shape, n_frames = data_val.next_batch()
                    n_frames = np.reshape(n_frames, [-1])
                    feed = {seq_inputs: val_data, indices: targ_indices, values: targ_values,
                            shape: targ_shape, seq_lens: n_frames}
                    del val_data, val_transcript, targ_indices, targ_values, targ_shape, n_frames
                    vl_loss, l_val_pr, err_rt_vl = sess.run([loss, log_prob, error_rate], feed_dict=feed)
                    del feed
                    loss_vl.append(vl_loss)
                    log_vl.append(l_val_pr)
                    err_vl.append(err_rt_vl)
                    print("epoch %d, step: %d, tr_loss: %.2f, vl_loss: %.2f, tr_err: %.2f, vl_err: %.2f"
                          % (epoch, step, np.mean(loss_tr), np.mean(loss_vl), err_rt_tr, err_rt_vl))
                end_time = time.time()
                elapsed = round(end_time - start_time, 3)
                # On training set
                # Select a random index within batch_size
                sample_index = np.random.randint(0, batch_size)
                # Fetch the target transcript
                actual_str = [data_train.reverse_map[i] for i in transcript[sample_index]]
                # Fetch the decoded path from the probability map
                pred_sparse = tf.SparseTensor(deco[0].indices, deco[0].values, deco[0].shape)
                pred_dense = tf.sparse_tensor_to_dense(pred_sparse)
                ans = pred_dense.eval()
                # pred = [data_train.reverse_map[i] for i in ans[sample_index, :]]
                pred = []
                for i in ans[sample_index, :]:
                    if i == n_chars - 1:
                        pred.append(data_train.reverse_map[0])
                    else:
                        pred.append(data_train.reverse_map[i])
                print("time_elapsed for 200 steps: %.3f, " % (elapsed))
                if epoch % 2 == 0:
                    print("Sample mini-batch results: \n"
                          "predicted string: ", np.array(pred))
                    print("actual string: ", np.array(actual_str))
                    print("On training set, the loss: %.2f, log_pr: %.3f, error rate: %.3f" % (loss_val, np.mean(l_pr), err_rt_tr))
                    print("On validation set, the loss: %.2f, log_pr: %.3f, error rate: %.3f" % (vl_loss, np.mean(l_val_pr), err_rt_vl))
                # Save the trainable parameters after the end of an epoch
                if epoch > 7:
                    path = saver.save(sess, 'model_%d' % epoch)
                    print("Session saved at: %s" % path)
                np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
        except (KeyboardInterrupt, SystemExit, Exception) as e:
            print("Error/Interruption: %s" % str(e))
            exc_type, exc_obj, exc_tb = sys.exc_info()
            print("Line no: %d" % exc_tb.tb_lineno)
            if epoch > 7:
                print("Saving model: %s" % saver.save(sess, 'Last.cpkt'))
            print("Current batch: %d" % data_train.b_id)
            print("Current epoch: %d" % epoch)
            print("Current step: %d" % step)
            np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
            print("Closing TF Session...")
            sess.close()
            print("Terminating Program...")
            sys.exit(0)
Answer 1:
I think you need to re-initialize your accumulators for each epoch.
So these lines should be moved to the top of the epoch loop (a short sketch follows the list):
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []
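For example (a minimal sketch assuming the same loop structure as in the question; only the placement of the accumulators changes):

for epoch in range(7, epochs + 1):
    # Reset the per-epoch accumulators at the top of the loop, so that
    # np.mean(loss_tr), np.mean(loss_vl), etc. reflect only the current epoch.
    loss_tr, log_tr, err_tr = [], [], []
    loss_vl, log_vl, err_vl = [], [], []
    for step in range(initial_step, steps):
        # ... run the training and validation steps exactly as before ...
        loss_tr.append(loss_val)
        err_tr.append(err_rt_tr)
        loss_vl.append(vl_loss)
        err_vl.append(err_rt_vl)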
Source: https://stackoverflow.com/questions/38880176/tensorflow-saver-restore-not-restoring-all-parameters