Forward pass in LSTM network learned by Keras


Question


I have the following code, from which I am hoping to get a forward pass through a 2-layer LSTM:

"""
this is a simple numerical example of an LSTM forward pass, to allow deep understanding;
the LSTM is trying to learn the sin function by learning to predict the next value after a sequence of 3 inputs
    example 1: {0.583, 0.633, 0.681} --> {0.725}, these values correspond to
               {sin(35.66), sin(39.27), sin(42.92)} --> {sin(46.47)}
    example 2: {0.725, 0.767, 0.801} --> {0.849}, these values correspond to
               {sin(46.47), sin(50.09), sin(53.23)} --> {sin(58.10)}

example tested:  [[['0.725323664']
                   ['0.7671179']
                   ['0.805884672']]]
predicted_instance:  [ 0.83467698]


training example pair:  [['0.680666907']
 ['0.725323664']
 ['0.7671179']] 0.805884672

"""
import numpy as np


# linear activation, matrix-wise (also works element-wise)
def linear(x):
    return x


# sigmoid function, matrix-wise (also works element-wise)
def sigmoid(x):
    return 1/(1 + np.exp(-x))


# hard sigmoid function, element-wise
def hard_sig(x):
    # as defined in Keras, for both the TensorFlow and Theano backends
    return np.max(np.array([0.0, np.min(np.array([1.0, x * 0.2 + 0.5]))]))
    # Courbariaux et al. 2016 (Binarized Neural Networks) variant:
    # return np.max(np.array([0.0, np.min(np.array([1.0, (x + 1.0)/2.0]))]))


# hard sigmoid function, matrix-wise (applies hard_sig element-wise)
def hard_sigmoid(x, fun=hard_sig):
    return np.vectorize(fun)(x)


# hyperbolic tangent function, matrix-wise (also works element-wise)
def hyperbolic_tangent(x):
    return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))


print(sigmoid(np.array([-100, 0, 100])))
print(hard_sigmoid(np.array([-100, 0, 0.1, 100])))
print(hyperbolic_tangent(np.array([-100, 0, 100])))

parameter_names = ['lstm_1_kernel_0.npy',
                   'lstm_1_recurrent_kernel_0.npy',
                   'lstm_1_bias_0.npy',
                   'lstm_2_kernel_0.npy',
                   'lstm_2_recurrent_kernel_0.npy',
                   'lstm_2_bias_0.npy',
                   'dense_1_kernel_0.npy',
                   'dense_1_bias_0.npy']


# LSTM 1 Weights
lstm_1_kernel_0 = np.load('lstm_1_kernel_0.npy')
print('lstm_1_kernel_0: ', lstm_1_kernel_0.shape)
lstm_1_recurrent_kernel_0 = np.load('lstm_1_recurrent_kernel_0.npy')
print('lstm_1_recurrent_kernel_0: ', lstm_1_recurrent_kernel_0.shape)
lstm_1_bias_0 = np.load('lstm_1_bias_0.npy')
print('lstm_1_bias_0: ', lstm_1_bias_0.shape)

# LSTM 2 Weights
lstm_2_kernel_0 = np.load('lstm_2_kernel_0.npy')
print('lstm_2_kernel_0: ', lstm_2_kernel_0.shape)
lstm_2_recurrent_kernel_0 = np.load('lstm_2_recurrent_kernel_0.npy')
print('lstm_2_recurrent_kernel_0: ', lstm_2_recurrent_kernel_0.shape)
lstm_2_bias_0 = np.load('lstm_2_bias_0.npy')
print('lstm_2_bias_0: ', lstm_2_bias_0.shape)

# Dense layer
dense_1_kernel_0 = np.load('dense_1_kernel_0.npy')
print('dense_1_kernel_0: ', dense_1_kernel_0.shape)
dense_1_bias_0 = np.load('dense_1_bias_0.npy')
print('dense_1_bias_0: ', dense_1_bias_0.shape)

time_seq = [0, 1, 2]
"""
input_seq = np.array([[[0.725323664],
                       [0.7671179],
                       [0.805884672]]])
"""
input_seq = np.array([[[0.680666907],
                       [0.725323664],
                       [0.7671179]]])
print('input_seq: ', input_seq.shape)
for time in time_seq:
    print('input t', time, ':', input_seq[0, time, 0])

"""
# z0 = z[:, :self.units]
# z1 = z[:, self.units: 2 * self.units]
# z2 = z[:, 2 * self.units: 3 * self.units]
# z3 = z[:, 3 * self.units:]

# i = self.recurrent_activation(z0)
# f = self.recurrent_activation(z1)
# c = f * c_tm1 + i * self.activation(z2)
# o = self.recurrent_activation(z3)

# activation = 'tanh'
# recurrent_activation = 'hard_sigmoid'
"""


# LSTM 1
x_1_lstm_1 = input_seq[0, 0, 0]
print('x_1: ', x_1_lstm_1)
x_2_lstm_1 = input_seq[0, 1, 0]
print('x_2: ', x_2_lstm_1)
x_3_lstm_1 = input_seq[0, 2, 0]
print('x_3: ', x_3_lstm_1)

c_0_lstm_1 = np.zeros((1, 3))
h_0_lstm_1 = np.zeros((1, 3))

z_1_lstm_1 = np.dot(x_1_lstm_1, lstm_1_kernel_0) + np.dot(h_0_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_1_lstm_1.shape)
i_1_lstm_1 = sigmoid(z_1_lstm_1[:, 0:3])
f_1_lstm_1 = sigmoid(z_1_lstm_1[:, 3:6])
input_to_c_1_lstm_1 = z_1_lstm_1[:, 6:9]
o_1_lstm_1 = sigmoid(z_1_lstm_1[:, 9:12])
c_1_lstm_1 = np.multiply(f_1_lstm_1, c_0_lstm_1) + np.multiply(i_1_lstm_1, hyperbolic_tangent(input_to_c_1_lstm_1))
h_1_lstm_1 = np.multiply(o_1_lstm_1, hyperbolic_tangent(c_1_lstm_1))
print('h_1_lstm_1: ', h_1_lstm_1.shape, h_1_lstm_1)

z_2_lstm_1 = np.dot(x_2_lstm_1, lstm_1_kernel_0) + np.dot(h_1_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_2_lstm_1.shape)
i_2_lstm_1 = sigmoid(z_2_lstm_1[:, 0:3])
f_2_lstm_1 = sigmoid(z_2_lstm_1[:, 3:6])
input_to_c_2_lstm_1 = z_2_lstm_1[:, 6:9]
o_2_lstm_1 = sigmoid(z_2_lstm_1[:, 9:12])
c_2_lstm_1 = np.multiply(f_2_lstm_1, c_1_lstm_1) + np.multiply(i_2_lstm_1, hyperbolic_tangent(input_to_c_2_lstm_1))
h_2_lstm_1 = np.multiply(o_2_lstm_1, hyperbolic_tangent(c_2_lstm_1))
print('h_2_lstm_1: ', h_2_lstm_1.shape, h_2_lstm_1)

z_3_lstm_1 = np.dot(x_3_lstm_1, lstm_1_kernel_0) + np.dot(h_2_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_3_lstm_1.shape)
i_3_lstm_1 = sigmoid(z_3_lstm_1[:, 0:3])
f_3_lstm_1 = sigmoid(z_3_lstm_1[:, 3:6])
input_to_c_3_lstm_1 = z_3_lstm_1[:, 6:9]
o_3_lstm_1 = sigmoid(z_3_lstm_1[:, 9:12])
c_3_lstm_1 = np.multiply(f_3_lstm_1, c_2_lstm_1) + np.multiply(i_3_lstm_1, hyperbolic_tangent(input_to_c_3_lstm_1))
h_3_lstm_1 = np.multiply(o_3_lstm_1, hyperbolic_tangent(c_3_lstm_1))
print('h_3_lstm_1: ', h_3_lstm_1.shape, h_3_lstm_1)
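As an aside, the three unrolled steps above can be collapsed into a loop. A minimal sketch, using the hypothetical lstm_step helper shown earlier:

# equivalent loop over the three timesteps of LSTM 1
h_t, c_t = np.zeros((1, 3)), np.zeros((1, 3))
for time in time_seq:
    h_t, c_t = lstm_step(input_seq[0, time, 0], h_t, c_t,
                         lstm_1_kernel_0, lstm_1_recurrent_kernel_0,
                         lstm_1_bias_0, units=3)
    print('h at t =', time, ':', h_t)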

# LSTM 2
x_1_lstm_2 = h_1_lstm_1
x_2_lstm_2 = h_2_lstm_1
x_3_lstm_2 = h_3_lstm_1

c_0_lstm_2 = np.zeros((1, 1))
h_0_lstm_2 = np.zeros((1, 1))

z_1_lstm_2 = np.dot(x_1_lstm_2, lstm_2_kernel_0) + np.dot(h_0_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_1_lstm_2.shape)
i_1_lstm_2 = sigmoid(z_1_lstm_2[:, 0])
f_1_lstm_2 = sigmoid(z_1_lstm_2[:, 1])
input_to_c_1_lstm_2 = z_1_lstm_2[:, 2]
o_1_lstm_2 = sigmoid(z_1_lstm_2[:, 3])
c_1_lstm_2 = np.multiply(f_1_lstm_2, c_0_lstm_2) + np.multiply(i_1_lstm_2, hyperbolic_tangent(input_to_c_1_lstm_2))
h_1_lstm_2 = np.multiply(o_1_lstm_2, hyperbolic_tangent(c_1_lstm_2))
print('h_1_lstm_2: ', h_1_lstm_2.shape, h_1_lstm_2)

z_2_lstm_2 = np.dot(x_2_lstm_2, lstm_2_kernel_0) + np.dot(h_1_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_2_lstm_2.shape)
i_2_lstm_2 = sigmoid(z_2_lstm_2[:, 0])
f_2_lstm_2 = sigmoid(z_2_lstm_2[:, 1])
input_to_c_2_lstm_2 = z_2_lstm_2[:, 2]
o_2_lstm_2 = sigmoid(z_2_lstm_2[:, 3])
c_2_lstm_2 = np.multiply(f_2_lstm_2, c_1_lstm_2) + np.multiply(i_2_lstm_2, hyperbolic_tangent(input_to_c_2_lstm_2))
h_2_lstm_2 = np.multiply(o_2_lstm_2, hyperbolic_tangent(c_2_lstm_2))
print('h_2_lstm_2: ', h_2_lstm_2.shape, h_2_lstm_2)

z_3_lstm_2 = np.dot(x_3_lstm_2, lstm_2_kernel_0) + np.dot(h_2_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_3_lstm_2.shape)
i_3_lstm_2 = sigmoid(z_3_lstm_2[:, 0])
f_3_lstm_2 = sigmoid(z_3_lstm_2[:, 1])
input_to_c_3_lstm_2 = z_3_lstm_2[:, 2]
o_3_lstm_2 = sigmoid(z_3_lstm_2[:, 3])
c_3_lstm_2 = np.multiply(f_3_lstm_2, c_2_lstm_2) + np.multiply(i_3_lstm_2, hyperbolic_tangent(input_to_c_3_lstm_2))
h_3_lstm_2 = np.multiply(o_3_lstm_2, hyperbolic_tangent(c_3_lstm_2))
print('h_3_lstm_2: ', h_3_lstm_2.shape, h_3_lstm_2)

output = np.dot(h_3_lstm_2, dense_1_kernel_0) + dense_1_bias_0
print('output: ', output)

The weights were saved to file at training time and can be retrieved from the following location:

LSTM weights

To create the LSTM that fits a sine-wave signal, I used the following code in Keras:

def build_simple_model(layers):
    model = Sequential()

    model.add(LSTM(input_shape=(layers[1], layers[0]),
                   output_dim=layers[1],
                   return_sequences=True,
                   activation='tanh',
                   recurrent_activation='sigmoid')) # 'hard_sigmoid'
    # model.add(Dropout(0.2))
    model.add(LSTM(layers[2],
                   return_sequences=False,
                   activation='tanh',
                   recurrent_activation='sigmoid')) # 'hard_sigmoid'

    # model.add(Dropout(0.2))
    model.add(Dense(output_dim=layers[3]))
    model.add(Activation("linear"))

    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("> Compilation Time : ", time.time() - start)
    plot_model(model, to_file='lstm_model.png', show_shapes=True, show_layer_names=True)
    print(model.summary())
    return model

This resulted in the following model (the diagram is saved to lstm_model.png by plot_model):

I used the following training procedure:

    seq_len = 3        
    model = lstm.build_simple_model([1, seq_len, 1, 1])

    model.fit(X_train,
              y_train,
              batch_size=512,
              nb_epoch=epochs,
              validation_split=0.05)

Could someone help me understand why my forward pass does not produce the desired output when predicting a future sin() value from the three previous consecutive ones?

The original example on which I am basing my forward-pass exercise originates here. The weights, uploaded in .npy format, are from a network that was able to perfectly predict the next sin() value in a series.


Answer 1:


I realised what the problem was. I was trying to extract my model weights using a TensorFlow session (after model fitting) rather than via Keras methods directly. This resulted in weight matrices that made perfect sense dimension-wise but contained the values from the initialization step.

model.fit(X_train,
          y_train,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_split=0.05,
          callbacks=callbacks_list)

print('n_parameters: ', len(model.weights))
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

parameter_names = ['lstm_1_kernel_0',
                   'lstm_1_recurrent_kernel_0',
                   'lstm_1_bias_0',
                   'lstm_2_kernel_0',
                   'lstm_2_recurrent_kernel_0',
                   'lstm_2_bias_0',
                   'dense_1_kernel_0',
                   'dense_1_bias_0']

weights = model.get_weights()
trainable_weights = model.trainable_weights
for parameter in range(len(model.weights)):
    print('')
    # using Keras methods is the correct way
    print('parameter: ', trainable_weights[parameter])
    print('parameter Keras: ', weights[parameter])
    # using session with TF is the wrong way
    print('parameter TF: ', model.weights[parameter].eval(session=sess))
    #np.save(parameter_names[parameter], model.weights[parameter].eval(session=sess))
    #np.save(parameter_names[parameter], weights[parameter])

The loop above prints the following to the screen:

parameter:  <tf.Variable 'lstm_1/kernel:0' shape=(1, 12) dtype=float32_ref>
parameter Keras:  [[ 0.02005039  0.59627813 -0.77670902 -0.17643917  0.64905447 -0.49418128
   0.01204901  0.79791737 -1.58887422 -0.3566488   0.67758918  0.77245694]]
parameter TF:  [[-0.20346385 -0.07166874 -0.58842945  0.03744811  0.46911311 -0.0469712
  -0.07291448  0.27316415 -0.53298378  0.08367682  0.10194337  0.20933461]]

parameter:  <tf.Variable 'lstm_1/recurrent_kernel:0' shape=(3, 12) dtype=float32_ref>
parameter Keras:  [[ 0.01916649 -0.30881727 -0.07018201  0.28770521 -0.45713434 -0.33738521
   0.53091544 -0.78456688  0.50647908  0.12326431 -0.18517831 -0.28752103]
 [ 0.44490865 -0.09020164  1.00983524  0.43070397 -0.14646551 -0.53908533
   1.33833826  0.76106179 -1.28808987  0.71029669 -0.19338571 -0.30499896]
 [ 0.76727188 -0.10291406  0.53285897  0.31021088  0.46876401  0.04961515
   0.0573149   1.17765784 -0.45716232  0.26181531  0.60458028 -0.6042906 ]]
parameter TF:  [[-0.044281   -0.42013288 -0.06702472  0.16710882  0.07229936  0.20263752
   0.01935999 -0.65925431  0.21676332  0.02481769  0.50321299 -0.08369029]
 [-0.17725646 -0.14031938 -0.07758044 -0.39292315  0.36675838 -0.20198873
   0.59491426 -0.12469263  0.14705807  0.39603388 -0.25511321 -0.01221756]
 [ 0.51603764  0.34401873  0.36002275  0.05344227 -0.00293417 -0.36086732
   0.1636388  -0.24916036  0.09064917 -0.04246153  0.05563453 -0.5006755 ]]

parameter:  <tf.Variable 'lstm_1/bias:0' shape=(12,) dtype=float32_ref>
parameter Keras:  [  3.91339064e-01  -2.09703773e-01  -4.88098420e-04   1.15376031e+00
   6.24452651e-01   2.24053934e-01   4.06851530e-01   4.78419960e-01
   1.77846551e-01   3.19107175e-01   5.16630232e-01  -2.22970009e-01]
parameter TF:  [ 0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.]

parameter:  <tf.Variable 'lstm_2/kernel:0' shape=(3, 4) dtype=float32_ref>
parameter Keras:  [[ 2.01334882  1.9168334   1.77633524 -0.90856379]
 [ 1.17618477  1.02978265 -0.06435115  0.66180402]
 [-1.33014703 -0.71629387 -0.87376142  1.35648465]]
parameter TF:  [[ 0.83115911  0.72150767  0.51600969 -0.52725452]
 [ 0.53043616  0.59162521 -0.59219611  0.0951736 ]
 [-0.8030411  -0.00424314 -0.06715947  0.67533839]]

parameter:  <tf.Variable 'lstm_2/recurrent_kernel:0' shape=(1, 4) dtype=float32_ref>
parameter Keras:  [[-0.09348518 -0.7667768   0.24031806 -0.39155772]]
parameter TF:  [[-0.085137   -0.59010917  0.61000961 -0.52193022]]

parameter:  <tf.Variable 'lstm_2/bias:0' shape=(4,) dtype=float32_ref>
parameter Keras:  [ 1.21466994  2.22224903  1.34946632  0.19186479]
parameter TF:  [ 0.  1.  0.  0.]

parameter:  <tf.Variable 'dense_1/kernel:0' shape=(1, 1) dtype=float32_ref>
parameter Keras:  [[ 2.69569159]]
parameter TF:  [[ 1.5422312]]

parameter:  <tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32_ref>
parameter Keras:  [ 0.20767514]
parameter TF:  [ 0.]

The forward-pass code was therefore correct; the weights were wrong. The correct weights .npy files have also been updated at the link mentioned in the question. This forward pass can be used to illustrate sequence generation with an LSTM by recycling the output.
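A minimal sketch of that recycling idea, assuming the trained model from above and a seed window of the last three observed values (shape (1, 3, 1)):

# generate a continuation of the sine wave by feeding predictions back in
window = np.array([[[0.725323664], [0.7671179], [0.805884672]]])  # seed: last 3 values
generated = []
for _ in range(10):
    next_value = model.predict(window)  # shape (1, 1)
    generated.append(float(next_value[0, 0]))
    # slide the window: drop the oldest value, append the prediction
    window = np.concatenate([window[:, 1:, :], next_value.reshape(1, 1, 1)], axis=1)
print(generated)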



Source: https://stackoverflow.com/questions/47702234/foward-pass-in-lstm-netwok-learned-by-keras
