问题
For background, I am referring to the Hierarchical Attention Network used for sentiment classification.
For code: my full code is posted below, but it is just simple revision of the original code posted by the author on the link above. And I explain my changes below. For training data: here For word embeddings: this is the Glove embedding here Key config: Keras 2.0.9, Scikit-Learn 0.19.1, Theano 0.9.0
The original code posted in the link above takes a 3D shape input, i.e., (review, sentence, word). And the attention mechanism is applied to sentence, and also words. So it had two attention components, as you can see in the fourth code block on the webpage.
I would like to change it to one that takes only a 2D shape input. I do this by
- changing the input shape and input embedding matrix (see inline comment in my code below)
- changing the model building part by removing the sentence attention component, keeping only the word attention component (see inline comment in my code below)
However, the code generates an error when 'model.fit' is called. I post the complete code and error below.
CODE:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
MAX_SENT_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
def clean_str(string):
"""
Tokenization/string cleaning for dataset
Every dataset is lower cased except
"""
string = re.sub(r"\\", "", string)
string = re.sub(r"\'", "", string)
string = re.sub(r"\"", "", string)
return string.strip().lower()
#replace this to your own file path
data_train = pd.read_csv('/home/zz/Work/wop/data/sentiment/labeledTrainData_small.tsv', sep='\t')
print(data_train.shape)
labels = []
texts = []
for idx in range(data_train.review.shape[0]):
text = BeautifulSoup(data_train.review[idx])
text = clean_str(text.get_text().encode('ascii', 'ignore').decode('ascii'))
texts.append(text)
labels.append(data_train.sentiment[idx])
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
##################################
# Change 1. The input shape is now 2D (sentence, words) instead of 3D
##################################
data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')
for i, content in enumerate(texts):
wordTokens = text_to_word_sequence(content)
k = 0
for _, word in enumerate(wordTokens):
if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
data[i, k] = tokenizer.word_index[word]
k = k + 1
##################################
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))
#replace with your own embedding file path
GLOVE_DIR = "/home/zz/Work/data/glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
# building Hierachical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SENT_LENGTH,
trainable=True,
mask_zero=True)
class AttLayer(Layer):
def __init__(self, attention_dim,**kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return None
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
uit = tf.matmul(x, uit)
uit = K.tanh(K.bias_add(uit, self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
#################################################
# Change 2. The model contains only one attention block now
#################################################
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer(100)(l_dense)
############################################
preds = Dense(2, activation='softmax')(l_att)
model = Model(sentence_input, preds)
#### clone the model #### Line X
model_copy = clone_model(model)
plot_model(model, to_file="model.png")
model.summary()
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
nb_epoch=10, batch_size=50,verbose=2)
ERROR: the last line of the code generates an error trace of:
Epoch 1/10
Traceback (most recent call last):
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zz/Work/wop/code/python/src/3rdparty/han/textClassfierHATT2D.py", line 187, in <module>
nb_epoch=10, batch_size=50,verbose=2)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1631, in fit
validation_steps=validation_steps)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/backend/theano_backend.py", line 1223, in __call__
return self.function(*inputs)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 898, in __call__
storage_map=getattr(self.fn, 'storage_map', None))
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/gof/link.py", line 325, in raise_with_op
reraise(exc_type, exc_value, exc_trace)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
Apply node that caused the error: Elemwise{mul,no_inplace}(InplaceDimShuffle{x,0}.0, Elemwise{Cast{float32}}.0)
Toposort index: 459
Inputs types: [TensorType(float32, row), TensorType(float32, matrix)]
Inputs shapes: [(1, 50), (50, 100)]
Inputs strides: [(200, 4), (400, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
I would really appreciate some advice on this, many thanks in advance!
回答1:
In the referenced tutorial, it choose to use theano
instead of tensorflow
because the behavior of dot
in tensorflow
is different from that in numpy
. But I am not familiar with theano
so it's hard for me to make it work correctly by using theano
backend. I rather use a series of operations to mimic the behavior of dot
in numpy
. Following I've changed the K.dot
to a series of operations.
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed, Lambda
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
class AttLayer(Layer):
def __init__(self, attention_dim, **kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return None
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
uit = tf.matmul(x, uit)
uit = K.tanh(K.bias_add(uit, self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
# https://github.com/keras-team/keras/issues/5401
# solve the problem of keras.models.clone_model
# and model.save_weights, model.load_weights
def get_config(self):
config = {'attention_dim': self.attention_dim}
base_config = super(AttLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
Also compute_mask
now returns None
because there is no sel_len
axis in AttLayer
's output.
Following is a script validating that the two operations are equivalent:
B = 8
S = 100
E = 200
A = 50
X = np.random.randn(B, S, E)
W = np.random.randn(E, A)
np_result = np.dot(X, W) #shape correct
X_ph = tf.placeholder(tf.float64)
W_ph = tf.placeholder(tf.float64)
tf_dot = tf.matmul(X_ph,
tf.tile(
tf.expand_dims(W_ph, axis=0),
(K.shape(X_ph)[0], 1, 1)))
with tf.Session() as sess:
tf_result = sess.run(tf_dot,
feed_dict = {X_ph:X, W_ph:W})
print(np.allclose(np_result, tf_result)) #True
Training history(I set batch_size
as 8):
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 1247s 62ms/step - loss: 0.4203 - acc: 0.8044 - val_loss: 0.3520 - val_acc: 0.8468
Epoch 2/10
20000/20000 [==============================] - 985s 49ms/step - loss: 0.2344 - acc: 0.9070 - val_loss: 0.3411 - val_acc: 0.8586
Epoch 3/10
20000/20000 [==============================] - 996s 50ms/step - loss: 0.0982 - acc: 0.9628 - val_loss: 0.4474 - val_acc: 0.8512
Epoch 4/10
20000/20000 [==============================] - 966s 48ms/step - loss: 0.0285 - acc: 0.9904 - val_loss: 0.7837 - val_acc: 0.8408
Epoch 5/10
20000/20000 [==============================] - 912s 46ms/step - loss: 0.0179 - acc: 0.9936 - val_loss: 1.0177 - val_acc: 0.8440
Epoch 6/10
20000/20000 [==============================] - 910s 45ms/step - loss: 0.0105 - acc: 0.9963 - val_loss: 1.0635 - val_acc: 0.8418
Epoch 7/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0101 - acc: 0.9964 - val_loss: 1.0966 - val_acc: 0.8372
Epoch 8/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0057 - acc: 0.9981 - val_loss: 1.2678 - val_acc: 0.8392
Epoch 9/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0077 - acc: 0.9974 - val_loss: 1.2166 - val_acc: 0.8258
Epoch 10/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0056 - acc: 0.9985 - val_loss: 1.4640 - val_acc: 0.8204
来源:https://stackoverflow.com/questions/54967216/hierarchical-attention-network-model-fit-generates-error-valueerror-input-di