Attention机制实践解读

江枫思渺然 提交于 2019-12-12 04:59:33

Attention Model(注意力模型)学习总结

https://blog.csdn.net/mpk_no1/article/details/72862348

[深度概念]·Attention机制实践解读 

https://blog.csdn.net/xiaosongshine/article/details/90573585

[深度应用]·Keras实现Self-Attention文本分类(机器如何读懂人心)

https://blog.csdn.net/xiaosongshine/article/details/90600028

Attention 一:基于 tf.layers 的多头自注意力层(Multi-Head Self-Attention):

class SelfAttention(tf.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input by
    bias-free dense layers of width ``hidden_size``. The projections are split
    into ``num_heads`` heads, attended independently, merged back together and
    passed through a final bias-free dense layer.
    """

    def __init__(self, hidden_size, num_heads, keep_prob):
        """Validate the head configuration and create the projection layers.

        Args:
            hidden_size: Width of the q/k/v projections and of the output.
            num_heads: Number of attention heads; must divide ``hidden_size``.
            keep_prob: Forwarded as the second positional argument of
                ``tf.nn.dropout`` on the attention weights during training.

        Raises:
            ValueError: If ``hidden_size`` is not a multiple of ``num_heads``.
        """
        remainder = hidden_size % num_heads
        if remainder != 0:
            raise ValueError(
                "Hidden size must be evenly divisible by the number of heads.")

        super(SelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.keep_prob = keep_prob

        def make_projection(layer_name):
            # Every projection has the same width and no bias term.
            return tf.layers.Dense(
                self.hidden_size, use_bias=False, name=layer_name)

        self.q_dense_layer = make_projection("q")
        self.k_dense_layer = make_projection("k")
        self.v_dense_layer = make_projection("v")
        self.output_dense_layer = make_projection("output_transform")

    def call(self, x, training):
        """Apply self-attention to ``x``.

        Args:
            x: Input tensor; ``split_heads`` reshapes its projections to
                ``[batch, length, num_heads, depth]``, so it must be rank 3.
            training: Tested with a plain ``if``; when truthy, dropout is
                applied to the attention weights.

        Returns:
            The output of the final dense layer applied to the merged heads.
        """
        projections = (
            self.q_dense_layer, self.k_dense_layer, self.v_dense_layer)
        q, k, v = [project(x) for project in projections]
        q, k, v = [self.split_heads(tensor) for tensor in (q, k, v)]

        # Scale the queries by 1/sqrt(per-head depth) before the dot product.
        head_depth = self.hidden_size // self.num_heads
        q = q * head_depth ** -0.5

        scores = tf.matmul(q, k, transpose_b=True)
        probabilities = tf.nn.softmax(scores, name="attention_weights")
        if training:
            probabilities = tf.nn.dropout(probabilities, self.keep_prob)

        context = tf.matmul(probabilities, v)
        merged = self.combine_heads(context)
        return self.output_dense_layer(merged)

    def split_heads(self, x):
        """Reshape ``x`` to ``[batch, length, num_heads, depth]`` and swap
        axes 1 and 2 so the head axis comes before the length axis."""
        with tf.name_scope("split_heads"):
            batch, steps = tf.shape(x)[0], tf.shape(x)[1]
            per_head = self.hidden_size // self.num_heads
            by_head = tf.reshape(x, [batch, steps, self.num_heads, per_head])
            return tf.transpose(by_head, [0, 2, 1, 3])

    def combine_heads(self, x):
        """Inverse of ``split_heads``: swap axes 1 and 2 back and flatten the
        last two axes into ``hidden_size``."""
        with tf.name_scope("combine_heads"):
            batch, steps = tf.shape(x)[0], tf.shape(x)[2]
            length_major = tf.transpose(x, [0, 2, 1, 3])
            return tf.reshape(length_major, [batch, steps, self.hidden_size])

Attention 二:基于 Keras 的单头 Self-Attention 层及 IMDB 文本分类示例:

from keras.preprocessing import sequence
from keras.datasets import imdb
from matplotlib import pyplot as plt
import pandas as pd
 
from keras import backend as K
from keras.engine.topology import Layer
 
 
class Self_Attention(Layer):
    """Single-head scaled dot-product self-attention for a rank-3 input.

    A single trainable kernel of shape ``(3, input_dim, output_dim)`` stacks
    the query, key and value projection matrices. The layer returns
    ``softmax(Q K^T / sqrt(output_dim)) V``.

    Args:
        output_dim: Width of the query/key/value projections and therefore of
            the last axis of the output.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(Self_Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the stacked Q/K/V projection kernel."""
        # input_shape is read as (batch, steps, features): axis 2 sets the
        # input width of each projection matrix.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(3, input_shape[2], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)

        super(Self_Attention, self).build(input_shape)  # must be called last

    def call(self, x):
        """Project ``x`` to Q, K, V and return the attention-weighted values."""
        WQ = K.dot(x, self.kernel[0])
        WK = K.dot(x, self.kernel[1])
        WV = K.dot(x, self.kernel[2])

        # Transpose the keys once and reuse the result for both the debug
        # print and the score computation.
        WK_T = K.permute_dimensions(WK, [0, 2, 1])

        # Debug output: printed every time call() is invoked.
        print("WQ.shape", WQ.shape)
        print("K.permute_dimensions(WK, [0, 2, 1]).shape", WK_T.shape)

        QK = K.batch_dot(WQ, WK_T)

        # Scale by sqrt(d_k). The key width here is output_dim, so the scale
        # must follow it rather than a hard-coded 64.
        QK = QK / (self.output_dim ** 0.5)

        QK = K.softmax(QK)

        print("QK.shape", QK.shape)

        V = K.batch_dot(QK, WV)

        return V

    def compute_output_shape(self, input_shape):
        """Output keeps the first two input axes; the last is ``output_dim``."""
        return (input_shape[0], input_shape[1], self.output_dim)

    def get_config(self):
        """Include ``output_dim`` so the layer can be rebuilt from its config."""
        config = super(Self_Attention, self).get_config()
        config['output_dim'] = self.output_dim
        return config
#%%
 
from keras.preprocessing import sequence
from keras.datasets import imdb
from matplotlib import pyplot as plt
import pandas as pd
 
from keras import backend as K
from keras.engine.topology import Layer
 
 
class Self_Attention(Layer):
    """Single-head scaled dot-product self-attention for a rank-3 input.

    A single trainable kernel of shape ``(3, input_dim, output_dim)`` stacks
    the query, key and value projection matrices. The layer returns
    ``softmax(Q K^T / sqrt(output_dim)) V``.

    Args:
        output_dim: Width of the query/key/value projections and therefore of
            the last axis of the output.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(Self_Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the stacked Q/K/V projection kernel."""
        # input_shape is read as (batch, steps, features): axis 2 sets the
        # input width of each projection matrix.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(3, input_shape[2], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)

        super(Self_Attention, self).build(input_shape)  # must be called last

    def call(self, x):
        """Project ``x`` to Q, K, V and return the attention-weighted values."""
        WQ = K.dot(x, self.kernel[0])
        WK = K.dot(x, self.kernel[1])
        WV = K.dot(x, self.kernel[2])

        # Transpose the keys once and reuse the result for both the debug
        # print and the score computation.
        WK_T = K.permute_dimensions(WK, [0, 2, 1])

        # Debug output: printed every time call() is invoked.
        print("WQ.shape", WQ.shape)
        print("K.permute_dimensions(WK, [0, 2, 1]).shape", WK_T.shape)

        QK = K.batch_dot(WQ, WK_T)

        # Scale by sqrt(d_k). The key width here is output_dim, so the scale
        # must follow it rather than a hard-coded 64.
        QK = QK / (self.output_dim ** 0.5)

        QK = K.softmax(QK)

        print("QK.shape", QK.shape)

        V = K.batch_dot(QK, WV)

        return V

    def compute_output_shape(self, input_shape):
        """Output keeps the first two input axes; the last is ``output_dim``."""
        return (input_shape[0], input_shape[1], self.output_dim)

    def get_config(self):
        """Include ``output_dim`` so the layer can be rebuilt from its config."""
        config = super(Self_Attention, self).get_config()
        config['output_dim'] = self.output_dim
        return config
 
# Passed to imdb.load_data as num_words and reused later as the embedding
# layer's input dimension.
max_features = 20000

print("Loading data...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_features)

# Convert the labels to one-hot encoding, one split at a time.
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

print(len(x_train), "train sequences")
print(len(x_test), "test sequences")
 
 
 
#%% Data normalization: pad every sequence to a common length

# Target length handed to pad_sequences for both splits.
maxlen = 64

print("Pad sequences (samples x time)")
x_train, x_test = (
    sequence.pad_sequences(x_train, maxlen=maxlen),
    sequence.pad_sequences(x_test, maxlen=maxlen),
)

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
 
#%%
 
batch_size = 32
from keras.models import Model
from keras.optimizers import SGD,Adam
from keras.layers import *
# NOTE(review): Attention and Position_Embedding are not used in the visible
# script, and this import fails if Attention_keras.py is not on the path.
# Confirm nothing else needs it before removing.
from Attention_keras import Attention,Position_Embedding


# The input length must match the padding length, so use maxlen instead of
# repeating the literal 64.
S_inputs = Input(shape=(maxlen,), dtype='int32')

embeddings = Embedding(max_features, 128)(S_inputs)

O_seq = Self_Attention(128)(embeddings)

# Average over the sequence axis, regularize, then classify into the two
# one-hot label columns.
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.5)(O_seq)
outputs = Dense(2, activation='softmax')(O_seq)

model = Model(inputs=S_inputs, outputs=outputs)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

# try using different optimizers and different optimizer configs
opt = Adam(lr=0.0002, decay=0.00001)
loss = 'categorical_crossentropy'
model.compile(loss=loss,
              optimizer=opt,
              metrics=['accuracy'])
 
#%%
print("Train...")

# Fit for five epochs, validating against the test split after each one.
h = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=5,
    validation_data=(x_test, y_test),
)

# Draw the loss and accuracy curves for both splits on one set of axes.
plt.plot(h.history["loss"], label="train_loss")
plt.plot(h.history["val_loss"], label="val_loss")
plt.plot(h.history["acc"], label="train_acc")
plt.plot(h.history["val_acc"], label="val_acc")
plt.legend()
plt.show()
 
#model.save("imdb.h5")
(TF_GPU) D:\Files\DATAs\prjs\python\tf_keras\transfromerdemo>C:/Files/APPs/RuanJian/Miniconda3/envs/TF_GPU/python.exe d:/Files/DATAs/prjs/python/tf_keras/transfromerdemo/train.1.py
Using TensorFlow backend.
Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 64)
x_test shape: (25000, 64)
WQ.shape (?, 64, 128)
K.permute_dimensions(WK, [0, 2, 1]).shape (?, 128, 64)
QK.shape (?, 64, 64)
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
input_1 (InputLayer)         (None, 64)                0
_________________________________________________________________
embedding_1 (Embedding)      (None, 64, 128)           2560000
_________________________________________________________________
self__attention_1 (Self_Atte (None, 64, 128)           49152
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258
=================================================================
Total params: 2,609,410
Trainable params: 2,609,410
Non-trainable params: 0
_________________________________________________________________
None
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 17s 693us/step - loss: 0.5244 - acc: 0.7514 - val_loss: 0.3834 - val_acc: 0.8278
Epoch 2/5
25000/25000 [==============================] - 15s 615us/step - loss: 0.3257 - acc: 0.8593 - val_loss: 0.3689 - val_acc: 0.8368
Epoch 3/5
25000/25000 [==============================] - 15s 614us/step - loss: 0.2602 - acc: 0.8942 - val_loss: 0.3909 - val_acc: 0.8303
Epoch 4/5
25000/25000 [==============================] - 15s 618us/step - loss: 0.2078 - acc: 0.9179 - val_loss: 0.4482 - val_acc: 0.8215
Epoch 5/5
25000/25000 [==============================] - 15s 619us/step - loss: 0.1639 - acc: 0.9368 - val_loss: 0.5313 - val_acc: 0.8106

 

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!