ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible

I want to implement the hierarchical attention mechanism for document classification presented by Yang, but I want to replace the LSTM with a Transformer.

I used Apoorv Nandan's text classification with Transformer: https://keras.io/examples/nlp/text_classification_with_transformer/

I have implemented the Transformer hierarchically for classification: one Transformer produces sentence representations and another produces document representations. The code is as follows:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical


class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
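        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V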
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def compute_output_shape(self, input_shape):
        # it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
        return input_shape + (self.pos_emb.output_dim,)



# Lower level (produce a representation of each sentence):

embed_dim = 100  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 64  # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100  # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60

word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
                                           embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
                                    dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)

word_encoder.summary()

# =========================================================================
# Upper level (produce a representation of each document):

L2_dense_units = 100

sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')

# This is the line producing "NotImplementedError":
sentence_encoder = layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)

sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
                               dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(L2_dense_units), name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(class_number, activation='softmax', name='sentence_output')(sentence_out)

model = keras.Model(sentence_input, preds)
model.summary()

#==========================================================================

Everything is OK (for testing, you can copy and paste it into Google Colab). But when I compile and fit the model with the following code, it throws an error:

X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)

y = tf.random.uniform(shape=(max_docs,), minval=0, maxval=class_number, dtype=tf.dtypes.int32, seed=1)
y = to_categorical(y)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    X, y, batch_size=32, epochs=25,
)

The error is:

ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible

When I had a similar error, I found that a Flatten() layer helped; I had incompatible shapes of (None, x, y) and (None, y).

If you add a Flatten layer to the part that gives you the (None, 15, 5), it should output something like (None, 75).

The Flatten layer merely removes dimensions: when I did this I got the output as (None, x*y), and due to the way TensorFlow works it was able to match both shapes, since x*y is just a multiple of y.
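To make this concrete, here is a minimal sketch of the suggested fix applied to the upper level of the model from the question (an untested sketch; the layer name 'sentence_flatten' is illustrative, not from the answer). An alternative with the same effect would be to pool over the sentence axis, for example with the GlobalAveragePooling1D already used at the word level.

sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
sentence_encoder = layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
                                        dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(L2_dense_units), name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)

# Collapse the sentence axis: (None, 15, 100) -> (None, 1500), so the final
# Dense produces (None, 5) and matches the (None, 5) one-hot labels.
sentence_flat = layers.Flatten(name='sentence_flatten')(sentence_out)
preds = layers.Dense(class_number, activation='softmax', name='sentence_output')(sentence_flat)

model = keras.Model(sentence_input, preds)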
