How to visualize LSTM attention using the keras-self-attention package?

I am implementing an attention LSTM in Keras using keras-self-attention. How can I visualize the attention part after training the model? This is a time series forecasting case.

from keras.models import Sequential
from keras_self_attention import SeqSelfAttention
from keras.layers import LSTM, Dense, Flatten

model = Sequential()
model.add(LSTM(activation = 'tanh' ,units = 200, return_sequences = True, 
               input_shape = (TrainD[0].shape[1], TrainD[0].shape[2])))
model.add(SeqSelfAttention())
model.add(Flatten())    
model.add(Dense(1, activation = 'relu'))

model.compile(optimizer = 'adam', loss = 'mse')

One approach is to grab the outputs of SeqSelfAttention for a given input and organize them so as to display the predictions per channel (see below). For something more advanced, have a look at the iNNvestigate library (usage examples included).
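As a minimal sketch of that idea applied to the question's model (assuming the Sequential model above has already been trained and that TrainD[0] holds its input array), the attention layer's output for a single sample can be fetched with a Keras backend function; the get_layer_outputs helper further down generalizes this to any layer name.

from keras import backend as K

# LSTM is layer 0, SeqSelfAttention is layer 1 in the question's Sequential model
attn_layer = model.layers[1]
fetch_attn = K.function([model.input], [attn_layer.output])

sample   = TrainD[0][:1]              # one sample, shape (1, timesteps, features)
attn_out = fetch_attn([sample])[0]    # shape (1, timesteps, 200) here
print(attn_out.shape)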

Update: I can also recommend See RNN, a package I wrote.


Explanation: show_features_1D fetches the outputs of the layer matching layer_name (can be a substring) and shows the predictions per channel (labeled), with timesteps along the x-axis and output values along the y-axis; its parameters are listed below, followed by a short example call.

  • input_data = a single batch of data of shape (1, input_shape)
  • prefetched_outputs = already-fetched layer outputs; overrides input_data
  • max_timesteps = max number of timesteps to show
  • max_col_subplots = max number of subplots along the horizontal direction
  • equate_axes = force all x- and y-axes to be equal (recommended for fair comparison)
  • show_y_zero = whether to show y=0 as a red line
  • channel_axis = the layer's features dimension (e.g. units for LSTM, which is the last axis)
  • scale_width, scale_height = scale the displayed image width & height
  • dpi = image quality (dots per inch)
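For instance, instead of letting the function fetch the outputs itself, already-fetched outputs can be passed in directly; a sketch, assuming outs_1 has been obtained as in the full example further down:

# prefetched_outputs overrides input_data, so model/layer_name are not needed;
# outs_1[0] strips the batch dimension, leaving a (timesteps, channels) array
show_features_1D(prefetched_outputs=outs_1[0], max_timesteps=100,
                 equate_axes=False, show_y_zero=False)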

Explanation of the visuals (below):

  • The first is useful to see the shapes of the extracted features, regardless of magnitude - giving information about e.g. frequency content.
  • The second is useful to see feature relationships - e.g. relative magnitudes, biases, and frequencies. The result below stands in stark contrast with the image above it: running print(outs_1) reveals that all magnitudes are very small and do not vary much, so including the y=0 point and equating the axes yields a line-like visual, which can be interpreted as the self-attention being bias-oriented.
  • The third is useful for visualizing features that are too numerous to visualize as above; defining the model with batch_shape instead of input_shape removes all ? from the printed shapes, and we can see that the first output's shape is (10, 60, 240) and the second's is (10, 240, 240). In other words, the first output returns the LSTM channel attention, and the second a "timesteps attention". The heatmap result below can be interpreted as showing the attention "cooling down" w.r.t. timesteps (a stand-in sketch of show_features_2D follows this list, since its original implementation is not included here).
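A minimal stand-in for show_features_2D, assuming it simply renders a 2D array (e.g. the (240, 240) timesteps attention) as a heatmap - this is an assumption, not the answer's original implementation:

import matplotlib.pyplot as plt

def show_features_2D(data, cmap='bwr', scale=1, dpi=76):
    # Stand-in (assumed behavior): draw a 2D array as a heatmap with a colorbar
    fig, ax = plt.subplots(dpi=dpi)
    fig.set_size_inches(8*scale, 8*scale)
    im = ax.imshow(data, cmap=cmap, aspect='auto')
    fig.colorbar(im, ax=ax)
    plt.show()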

SeqWeightedAttention is a lot easier to visualize, but there isn't much to visualize; you'll need to get rid of the Flatten above to make it work. The attention's output shapes then become (10, 60) and (10, 240) - for which you can use a simple histogram, plt.hist (just make sure you exclude the batch dimension - i.e. feed (60,) or (240,)).


from keras.layers import Input, Dense, LSTM, Flatten, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras_self_attention import SeqSelfAttention
import numpy as np

ipt   = Input(shape=(240,4))
x     = LSTM(60, activation='tanh', return_sequences=True)(ipt)
x     = SeqSelfAttention(return_attention=True)(x)
x     = concatenate(x)
x     = Flatten()(x)
out   = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

outs = get_layer_outputs(model, 'seq', X[0:1], 1)
outs_1 = outs[0]
outs_2 = outs[1]

show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=False,show_y_zero=False)
show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=True, show_y_zero=True)
show_features_2D(outs_2[0]) # [0] for 2D since 'outs_2' is 3D


import numpy as np
import matplotlib.pyplot as plt

def show_features_1D(model=None, layer_name=None, input_data=None,
                     prefetched_outputs=None, max_timesteps=100,
                     max_col_subplots=10, equate_axes=False,
                     show_y_zero=True, channel_axis=-1,
                     scale_width=1, scale_height=1, dpi=76):
    if prefetched_outputs is None:
        layer_outputs = get_layer_outputs(model, layer_name, input_data, 1)[0]
    else:
        layer_outputs = prefetched_outputs
    n_features    = layer_outputs.shape[channel_axis]

    for _int in range(1, max_col_subplots+1):
      if (n_features/_int).is_integer():
        n_cols = int(n_features/_int)
    n_rows = int(n_features/n_cols)

    fig, axes = plt.subplots(n_rows,n_cols,sharey=equate_axes,dpi=dpi)
    fig.set_size_inches(24*scale_width,16*scale_height)

    subplot_idx = 0
    for row_idx in range(axes.shape[0]):
      for col_idx in range(axes.shape[1]): 
        subplot_idx += 1
        feature_output = layer_outputs[:,subplot_idx-1]
        feature_output = feature_output[:max_timesteps]
        ax = axes[row_idx,col_idx]

        if show_y_zero:
            ax.axhline(0,color='red')
        ax.plot(feature_output)

        ax.axis(xmin=0,xmax=len(feature_output))
        ax.axis('off')

        ax.annotate(str(subplot_idx),xy=(0,.99),xycoords='axes fraction',
                    weight='bold',fontsize=14,color='g')
    if equate_axes:
        y_new = []
        for row_axis in axes:
            y_new += [np.max(np.abs([col_axis.get_ylim() for 
                                     col_axis in row_axis]))]
        y_new = np.max(y_new)
        for row_axis in axes:
            [col_axis.set_ylim(-y_new,y_new) for col_axis in row_axis]
    plt.show()

from keras import backend as K

def get_layer_outputs(model, layer_name, input_data, learning_phase=1):
    outputs   = [layer.output for layer in model.layers if layer_name in layer.name]
    layers_fn = K.function([model.input, K.learning_phase()], outputs)
    return layers_fn([input_data, learning_phase])

SeqWeightedAttention example, per request:

from keras_self_attention import SeqWeightedAttention

ipt   = Input(batch_shape=(10,240,4))
x     = LSTM(60, activation='tanh', return_sequences=True)(ipt)
x     = SeqWeightedAttention(return_attention=True)(x)
x     = concatenate(x)
out   = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

outs = get_layer_outputs(model, 'seq', X, 1)
outs_1 = outs[0][0] # additional index since using batch_shape
outs_2 = outs[1][0]

plt.hist(outs_1, bins=500); plt.show()
plt.hist(outs_2, bins=500); plt.show()
