Training of multi-output Keras model on a joint loss function

问题

I'm writing two joint decoders in Keras, with one common input, two separate outputs, and a loss function that takes both outputs into account. The problem that I have is with the loss function.

Here is minimal Keras code that reproduces the error:

import tensorflow as tf
from scat import *

from keras.layers import Input, Reshape, Permute, Lambda, Flatten
from keras.layers.core import Dense
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model
from keras import backend as K

def identity(x):
    """Return ``x`` passed through the Keras backend ``identity`` op."""
    passthrough = K.identity(x)
    return passthrough

# custom loss function
def custom_loss():
    """Return a loss callable that sums two softmax cross-entropy terms.

    The returned ``my_loss(y_dummy, pred)`` indexes element ``0`` and
    element ``1`` of both arguments, computes a softmax cross-entropy for
    each pair, and returns ``mean(term_1) + 2 * mean(term_2)``.
    """
    def my_loss(y_dummy, pred):
        first_term = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_dummy[0], logits=pred[0])
        second_term = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_dummy[1], logits=pred[1])

        # band_part(x, 0, -1) keeps the upper triangle including the diagonal;
        # band_part(x, 0, 0) keeps only the diagonal. Their difference leaves
        # the entries strictly above the diagonal.
        upper_with_diagonal = tf.matrix_band_part(second_term, 0, -1)
        diagonal_only = tf.matrix_band_part(second_term, 0, 0)
        second_term = upper_with_diagonal - diagonal_only

        return tf.reduce_mean(first_term) + 2 * tf.reduce_mean(second_term)

    return my_loss

def keras_version():
    """Build the two-output Keras model used in this reproduction.

    A single 135-feature input feeds two separate stacks of Dense +
    LeakyReLU layers. The first stack ends in a (9, 5) tensor named
    ``output_1``; the second ends in a (9, 9, 4) tensor named ``output_2``.

    Returns:
        A ``Model`` mapping the input to the list ``[out1, out2]``.
    """
    # NOTE(review): the local name `input` shadows the Python builtin.
    input = Input(shape=(135,), name='feature_input')
    # Branch 1: Dense widths 128 -> 256 -> 512 -> 45, each followed by
    # LeakyReLU(alpha=.2); the 45 values are then reshaped to (9, 5).
    out1 = Dense(128, kernel_initializer='glorot_normal', activation='linear')(input)
    out1 = LeakyReLU(alpha=.2)(out1)
    out1 = Dense(256, kernel_initializer='glorot_normal', activation='linear')(out1)
    out1 = LeakyReLU(alpha=.2)(out1)
    out1 = Dense(512, kernel_initializer='glorot_normal', activation='linear')(out1)
    out1 = LeakyReLU(alpha=.2)(out1)
    out1 = Dense(45, kernel_initializer='glorot_normal', activation='linear')(out1)
    out1 = LeakyReLU(alpha=.2)(out1)
    out1 = Reshape((9, 5))(out1)

    # Branch 2: Dense widths 128 -> 256 -> 512 -> 540, each followed by
    # LeakyReLU(alpha=.2); the 540 values are then reshaped to (9, 4, 15).
    out2 = Dense(128, kernel_initializer='glorot_normal', activation='linear')(input)
    out2 = LeakyReLU(alpha=.2)(out2)
    out2 = Dense(256, kernel_initializer='glorot_normal', activation='linear')(out2)
    out2 = LeakyReLU(alpha=.2)(out2)
    out2 = Dense(512, kernel_initializer='glorot_normal', activation='linear')(out2)
    out2 = LeakyReLU(alpha=.2)(out2)
    out2 = Dense(540, kernel_initializer='glorot_normal', activation='linear')(out2)
    out2 = LeakyReLU(alpha=.2)(out2)
    out2 = Reshape((9, 4, 15))(out2)
    # Multiply two permutations of the same tensor: axes (0, 2, 1, 3) give
    # (batch, 4, 9, 15) and axes (0, 2, 3, 1) give (batch, 4, 15, 9). The
    # output shape is declared by hand as (4, 9, 9).
    # NOTE(review): the answer in this document reports that K.dot here should
    # be K.batch_dot -- confirm before reusing this code.
    out2 = Lambda(lambda x: K.dot(K.permute_dimensions(x, (0, 2, 1, 3)),
                                  K.permute_dimensions(x, (0, 2, 3, 1))), output_shape=(4,9,9))(out2)
    # Flatten, apply one more Dense(324) + LeakyReLU, restore (4, 9, 9), then
    # move the size-4 axis last to get (9, 9, 4).
    out2 = Flatten()(out2)
    out2 = Dense(324, kernel_initializer='glorot_normal', activation='linear')(out2)
    out2 = LeakyReLU(alpha=.2)(out2)
    out2 = Reshape((4, 9, 9))(out2)
    out2 = Lambda(lambda x: K.permute_dimensions(x, (0, 2, 3, 1)))(out2)

    # Identity Lambdas whose only visible purpose is to give the two outputs
    # the fixed names 'output_1' and 'output_2'.
    out1 = Lambda(identity, name='output_1')(out1)
    out2 = Lambda(identity, name='output_2')(out2)

    return Model(input, [out1, out2])

# Imported explicitly so that `np` does not depend on a wildcard import.
import numpy as np

# Build the model and compile it with the single joint loss callable.
model = keras_version()
model.compile(loss=custom_loss(), optimizer='adam')

model.summary()

# Random stand-in data: 5000 samples of inputs and of each of the two targets.
feature_final = np.random.normal(0, 1, [5000, 9, 15])
train_features_array = np.random.normal(0, 1, [5000, 9, 5])
train_adj_array = np.random.normal(0, 1, [5000, 9, 9, 4])

# Flatten each (9, 15) sample to the 135 features the model input expects.
feature_final = feature_final.reshape(-1, 135)
model.fit(
    feature_final,
    [train_features_array, train_adj_array],
    batch_size=50,
    epochs=10
)

The error I get is:

File "...", line 135, in <module>
    epochs=10
File ".../keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
File ".../keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
File ".../tensorflow/python/client/session.py", line 1458, in __call__
    run_metadata_ptr)
tensorflow.python.framework.errors_impl.InvalidArgumentError: input must be at least 2-dim, received shape: [9]
     [[{{node loss/output_1_loss/MatrixBandPart_1}}]]

On a second attempt, I tried writing two loss functions and using loss weights to combine them.

# custom loss function
def custom_loss_1():
    """Return a loss callable for the first term of the joint loss.

    The returned ``my_loss_1(y_dummy, pred)`` takes element ``0`` of both
    arguments and returns the mean softmax cross-entropy between them.
    """
    def my_loss_1(y_dummy, pred):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_dummy[0],
            logits=pred[0],
        )
        return tf.reduce_mean(cross_entropy)

    return my_loss_1

def custom_loss_2():
    """Return a loss callable for the second term of the joint loss.

    The returned ``my_loss_2(y_dummy, pred)`` takes element ``1`` of both
    arguments, computes their softmax cross-entropy, zeroes everything on or
    below the diagonal, and returns the mean of the result.
    """
    def my_loss_2(y_dummy, pred):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_dummy[1],
            logits=pred[1],
        )
        # Upper triangle (with diagonal) minus the diagonal itself.
        upper_with_diagonal = tf.matrix_band_part(cross_entropy, 0, -1)
        diagonal_only = tf.matrix_band_part(cross_entropy, 0, 0)
        strictly_upper = upper_with_diagonal - diagonal_only

        return tf.reduce_mean(strictly_upper)

    return my_loss_2

# One loss per named output; the loss attached to output_2 is weighted 2.0.
model.compile(
    optimizer='adam',
    loss={'output_1': custom_loss_1(), 'output_2': custom_loss_2()},
    loss_weights={'output_1': 1.0, 'output_2': 2.0},
)

but I received

tensorflow.python.framework.errors_impl.InvalidArgumentError: Matrix size-incompatible: In[0]: [20,25920], In[1]: [324,324]
     [[{{node dense_9/BiasAdd}}]]

In that case, the problem might actually be from the model itself. Here is the model.summary:

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
feature_input (InputLayer)      (None, 135)          0                                            
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 128)          17408       feature_input[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)       (None, 128)          0           dense_5[0][0]                    
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 256)          33024       leaky_re_lu_5[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)       (None, 256)          0           dense_6[0][0]                    
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 512)          131584      leaky_re_lu_6[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_7 (LeakyReLU)       (None, 512)          0           dense_7[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          17408       feature_input[0][0]              
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 540)          277020      leaky_re_lu_7[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 128)          0           dense_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_8 (LeakyReLU)       (None, 540)          0           dense_8[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 256)          33024       leaky_re_lu_1[0][0]              
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 9, 4, 15)     0           leaky_re_lu_8[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_2 (LeakyReLU)       (None, 256)          0           dense_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 4, 9, 9)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 512)          131584      leaky_re_lu_2[0][0]              
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 324)          0           lambda_1[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_3 (LeakyReLU)       (None, 512)          0           dense_3[0][0]                    
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 324)          105300      flatten_1[0][0]                  
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 45)           23085       leaky_re_lu_3[0][0]              
__________________________________________________________________________________________________
leaky_re_lu_9 (LeakyReLU)       (None, 324)          0           dense_9[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_4 (LeakyReLU)       (None, 45)           0           dense_4[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 4, 9, 9)      0           leaky_re_lu_9[0][0]              
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 9, 5)         0           leaky_re_lu_4[0][0]              
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 9, 9, 4)      0           reshape_3[0][0]                  
__________________________________________________________________________________________________
output_1 (Lambda)               (None, 9, 5)         0           reshape_1[0][0]                  
__________________________________________________________________________________________________
output_2 (Lambda)               (None, 9, 9, 4)      0           lambda_2[0][0]                   
==================================================================================================
Total params: 769,437
Trainable params: 769,437
Non-trainable params: 0
__________________________________________________________________________________________________

If you think the model has an issue, please check "model". This question is different from the linked question, which uses only one output in the loss. Here is also the loss function from a similar model that was written in TensorFlow:

# -- loss function
# Placeholders for the two targets: Y_1 has shape (None, 9, 9, 4) and
# Y_2 has shape (None, 9, 5).
Y_1 = tf.placeholder(tf.float32, shape=[None, 9, 9, 4])
Y_2 = tf.placeholder(tf.float32, shape=[None, 9, 5])

# Y_2 is paired with the first output of fcn(X) and Y_1 with the second.
# NOTE(review): fcn(X) is called twice and fcn is not shown here, so whether
# both calls share the same weights cannot be confirmed from this snippet.
loss_1 = tf.nn.softmax_cross_entropy_with_logits(labels=Y_2, logits=fcn(X)[0])
loss_2 = tf.nn.softmax_cross_entropy_with_logits(labels=Y_1, logits=fcn(X)[1])
# Upper triangle of loss_2 (band 0, -1) minus its diagonal (band 0, 0), i.e.
# only the entries strictly above the diagonal remain non-zero.
loss_2 = tf.matrix_band_part(loss_2, 0, -1) - tf.matrix_band_part(loss_2, 0, 0)

# Joint objective: mean of loss_1 plus twice the mean of loss_2.
loss = tf.reduce_mean(loss_1) + 2 * tf.reduce_mean(loss_2)

Edits: I tried the code in the answer with the actual dataset, and the loss function behaves differently from the TensorFlow implementation of the code. The loss function suggested in the answers converges quickly and becomes nan. I agree with the answer which says output_1 should be categorical. Based on this, I wrote the following loss function, which still does not converge as fast as the TensorFlow one, but definitely does not blow up:

def custom_loss_1(model, output_1):
    """Return a loss callable built from ``model.targets[0]`` and ``output_1``.

    The returned ``my_loss(y_true, y_pred)`` does not read its own
    arguments: it computes the softmax cross-entropy between
    ``model.targets[0]`` (looked up when the loss is called) and the
    ``output_1`` tensor captured here, and returns its mean.
    """
    def my_loss(y_true, y_pred):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=model.targets[0],
            logits=output_1,
        )
        return tf.reduce_mean(cross_entropy)

    return my_loss

def custom_loss_2():
    """Return a loss callable that uses only its own target and prediction.

    The returned ``my_loss(y_true, y_pred)`` computes the softmax
    cross-entropy of the pair, zeroes everything on or below the diagonal,
    and returns the mean of the result.
    """
    def my_loss(y_true, y_pred):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true,
            logits=y_pred,
        )
        # Upper triangle (with diagonal) minus the diagonal itself.
        upper_with_diagonal = tf.matrix_band_part(cross_entropy, 0, -1)
        diagonal_only = tf.matrix_band_part(cross_entropy, 0, 0)
        strictly_upper = upper_with_diagonal - diagonal_only
        return tf.reduce_mean(strictly_upper)

    return my_loss

# Find the layer named 'output_1'; the [0] raises IndexError if none matches.
output_layer_1 = list(filter(lambda layer: layer.name == 'output_1', model.layers))[0]
# custom_loss_1 captures the model and the output_1 tensor; custom_loss_2
# takes no arguments.
losses = dict(
    output_1=custom_loss_1(model, output_layer_1.output),
    output_2=custom_loss_2(),
)
model.compile(optimizer='adam', loss=losses, loss_weights=[1.0, 2.0])

回答1:

You had two issues in your code:

The first is that the K.dot operation inside the Lambda needed to be K.batch_dot

I used:

def output_mult(x):
    """Batch-multiply two axis permutations of the 4-D tensor ``x``.

    The left operand is ``x`` with axes ordered (0, 2, 1, 3) and the right
    operand is ``x`` with axes ordered (0, 2, 3, 1); the two are combined
    with ``K.batch_dot``.
    """
    left = K.permute_dimensions(x, (0, 2, 1, 3))
    right = K.permute_dimensions(x, (0, 2, 3, 1))
    product = K.batch_dot(left, right)
    return product


# No output_shape is passed, so Keras works out the output dimensions itself.
out2 = Lambda(function=output_mult)(out2)

It helps to actually let Keras compute the output dimensions. It is an easy way to check the code. In order to debug it, I first replaced the custom loss with an existing loss (mse), and then the problem was easy to detect.

The second issue is that a custom loss function takes a single pair of target / output rather than a list. The arguments to a loss function are not a list of tensors as you assumed both initially and in your edit. So I defined your loss function as

def custom_loss(model, output_1):
    """Build the loss callable for ``output_2``.

    ``model`` and ``output_1`` are accepted so that existing call sites keep
    working, but they are not used: the returned loss depends only on the
    target and prediction it is called with. The previous body also computed
    a cross-entropy from ``model.targets[0]`` and ``output_1`` and then
    discarded it; that dead computation has been removed.

    Returns:
        ``my_loss(y_true, y_pred)``, which computes the softmax
        cross-entropy of the pair, zeroes everything on or below the
        diagonal, and returns the mean over all entries.
    """
    def my_loss(y_true, y_pred):
        fcn_loss_2 = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
        # band_part(x, 0, -1) keeps the upper triangle including the diagonal;
        # band_part(x, 0, 0) keeps only the diagonal. Their difference leaves
        # the entries strictly above the diagonal.
        fcn_loss_2 = tf.matrix_band_part(fcn_loss_2, 0, -1) - tf.matrix_band_part(fcn_loss_2, 0, 0)
        return tf.reduce_mean(fcn_loss_2)

    return my_loss

And used it as

# Find the layer named 'output_1'; the [0] raises IndexError if none matches.
output_layer_1 = list(filter(lambda layer: layer.name == 'output_1', model.layers))[0]
# output_1 uses the built-in categorical cross-entropy; output_2 uses the
# custom loss, which is handed the model and the output_1 tensor.
losses = dict(
    output_1='categorical_crossentropy',
    output_2=custom_loss(model, output_layer_1.output),
)
model.compile(optimizer='adam', loss=losses, loss_weights=[1.0, 2.0])

Edit: I initially misread the custom loss for output2 as requiring the value of fcn_loss_1, this doesn't seem to be the case and you can just write this as:

def custom_loss():
    """Return the loss callable for ``output_2``.

    The returned ``my_loss(y_true, y_pred)`` computes the softmax
    cross-entropy of the pair, zeroes everything on or below the diagonal,
    and returns the mean of the result.
    """
    def my_loss(y_true, y_pred):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true,
            logits=y_pred,
        )
        # Upper triangle (with diagonal) minus the diagonal itself.
        upper_with_diagonal = tf.matrix_band_part(cross_entropy, 0, -1)
        diagonal_only = tf.matrix_band_part(cross_entropy, 0, 0)
        strictly_upper = upper_with_diagonal - diagonal_only
        return tf.reduce_mean(strictly_upper)

    return my_loss

And used it as:

# output_1 uses the built-in categorical cross-entropy; output_2 uses the
# custom loss. The weights list pairs 1.0 and 2.0 with the two outputs.
losses = dict(
    output_1='categorical_crossentropy',
    output_2=custom_loss(),
)
model.compile(optimizer='adam', loss=losses, loss_weights=[1.0, 2.0])

I'm making the assumption that the loss for output_1 is categorical_crossentropy. But even if you need to change it, the simplest way to do it is to have 2 independent loss functions. Of course you can also choose to define a loss function that returns 0 and one that returns the full cost... but it would be cleaner to split 'loss(output1) + 2 * loss(output2)' into two losses plus the weights, imho.

Full notebook: https://colab.research.google.com/drive/1NG3uIiesg-VIt-W9254Sea2XXUYPoVH5

来源：https://stackoverflow.com/questions/56996864/training-of-multi-output-keras-model-on-a-joint-loss-function

标签

python

tensorflow

keras

loss-function