在使用 CTC 损失函数和 Dataset 时如何监控准确率？（包含可运行代码）

Question

我一直在尝试加快用于光学字符识别的 CRNN 网络的训练速度，但是在使用 TFRecords 和 tf.data.Dataset 管道时，我无法让准确率指标正常工作。我以前使用 Keras 的 Sequence 时是可以正常工作的。下面是一个完整的、可运行的玩具示例，展示了我的问题（使用 Tensorflow 2.4.1 测试）：

import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.keras import Input, Model
from tensorflow.python.keras.layers import Dense, Layer, Bidirectional, GRU, Reshape, Activation
from tensorflow.python.keras.optimizer_v2.adam import Adam

# Passed to the tf.data pipeline (parallel reads, parallel map calls, prefetch)
# so that TensorFlow picks those values itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Alphabet of the toy task. The model emits len(CHAR_VECTOR) + 1 classes;
# the extra class is presumably the CTC blank -- TODO confirm.
CHAR_VECTOR = "ABC"
# Dimensions of the random toy images and of the model's image input.
IMG_W = 10
IMG_H = 10
N_CHANNELS = 3


class CTCLayer(Layer):
    """Pass-through layer that registers the CTC loss on the model.

    The layer returns `y_pred` unchanged; its only side effect is to
    compute the CTC batch cost and attach it with `self.add_loss()`.
    """

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred, label_length):
        """Attach the CTC loss for this batch and return `y_pred`.

        Args:
            y_true: Label tensor; its first dimension is used as the batch size.
            y_pred: Prediction tensor; its second dimension is used as the
                input (time-step) length of every sample.
            label_length: Per-sample label lengths, forwarded to the loss.
        """
        n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
        n_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        # Every sample in the batch gets the same input length: a
        # (batch, 1) column filled with the number of prediction steps.
        input_length = n_steps * tf.ones(shape=(n_samples, 1), dtype="int64")

        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))

        # The predictions themselves are handed on untouched.
        return y_pred


def get_model():
    """Build the toy recurrent model with the CTC loss layer attached.

    The model has three inputs named 'image', 'label' and 'label_length'
    and a single output: the softmax predictions passed through `CTCLayer`.
    """
    n_classes = len(CHAR_VECTOR) + 1

    image = Input(name='image', shape=(IMG_W, IMG_H, N_CHANNELS), dtype='float32')
    label = Input(name='label', shape=[None], dtype='float32')
    label_length = Input(name='label_length', shape=[None], dtype='int64')

    # Flatten the trailing image axes so the tensor becomes a sequence of
    # IMG_W steps, each carrying a flat feature vector.
    features_per_step = np.prod(image.shape[2:])
    sequence = Reshape(target_shape=(IMG_W, features_per_step), name='reshape')(image)
    sequence = Dense(24, activation='relu', name='dense1')(sequence)
    sequence = Bidirectional(GRU(24, return_sequences=True, name="GRU"), merge_mode="sum")(sequence)
    logits = Dense(n_classes, name='dense2')(sequence)
    y_pred = Activation('softmax', name='softmax')(logits)

    ctc_output = CTCLayer(name="ctc")(label, y_pred, label_length)

    return Model(inputs=[image, label, label_length], outputs=ctc_output)


def image_feature(value):
    """JPEG-encode an image and wrap the encoded bytes in a bytes_list Feature."""
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    payload = tf.train.BytesList(value=[jpeg_bytes])
    return tf.train.Feature(bytes_list=payload)


def float_feature_list(value):
    """Wrap a sequence of numbers in a float_list Feature."""
    payload = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=payload)


def int64_feature(value):
    """Wrap a single integer-like value in an int64_list Feature."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)


def create_example(image, label, label_length):
    """Pack one sample into a tf.train.Example.

    The Example carries three features: 'image' (JPEG bytes), 'label'
    (float list) and 'label_length' (single int64).
    """
    features = tf.train.Features(
        feature={
            "image": image_feature(image),
            "label": float_feature_list(label),
            "label_length": int64_feature(label_length),
        }
    )
    return tf.train.Example(features=features)


def parse_tfrecord_fn(example):
    """Decode one serialized Example into a dict of tensors.

    Returns the parsed feature dict in which 'image' has been decoded from
    JPEG and converted to float32, 'label' has been densified, and
    'label_length' is left as parsed.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)

    decoded = tf.io.decode_jpeg(parsed["image"], channels=3)
    parsed["image"] = tf.image.convert_image_dtype(decoded, dtype="float32")
    # VarLenFeature parses to a sparse tensor; turn it into a dense one.
    parsed["label"] = tf.sparse.to_dense(parsed["label"])

    return parsed


def generate_tfrecords(n):
    """Write `n` random toy samples to the TFRecord file named by the global `filename`."""
    img_shape = (IMG_W, IMG_H, N_CHANNELS)
    with tf.io.TFRecordWriter(filename) as writer:
        for _ in range(n):
            random_img = np.random.random(img_shape)
            # The label array always has max_text_len entries; only the
            # recorded label_length varies between samples.
            label_length = random.randint(1, max_text_len)
            label = np.random.randint(0, len(CHAR_VECTOR), max_text_len)
            serialized = create_example(random_img, label, label_length).SerializeToString()
            writer.write(serialized)


class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding (inputs, all-zero targets) batches from the TFRecord dataset."""

    def __len__(self):
        # Number of batches reported per epoch.
        return steps_per_epoch

    def __getitem__(self, index):
        # `index` is not used: each call rebuilds the dataset and takes
        # its first batch.
        zero_targets = np.zeros([batch_size])
        first_batch = get_dataset().take(1)
        batch_inputs = next(iter(first_batch))
        return batch_inputs, zero_targets


def get_dataset():
    """Regenerate the TFRecord file and return a batched, prefetched dataset over it."""
    generate_tfrecords(batch_size * epochs * steps_per_epoch)

    records = tf.data.TFRecordDataset(filename, num_parallel_reads=AUTOTUNE)
    parsed = records.map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    batched = parsed.batch(batch_size)
    return batched.prefetch(AUTOTUNE)


if __name__ == "__main__":
    # Toy-run configuration. These become module-level names that the
    # data-generation and dataset functions read directly.
    batch_size = 9
    epochs = 7
    steps_per_epoch = 8
    max_text_len = 5
    filename = "test.tfrec"
    use_generator = False

    # Feed the model either through the Keras Sequence or the raw tf.data pipeline.
    if use_generator:
        data = DataGenerator()
    else:
        data = get_dataset()

    model = get_model()
    '''This fails when use_generator == False, removing the 
     metric solves it'''
    model.compile(optimizer=Adam(), metrics=["accuracy"])
    model.fit(data, epochs=epochs, steps_per_epoch=steps_per_epoch)

将 use_generator 设置为 True，或者删除 metrics=["accuracy"]，代码就能正常运行而不会出错。

如您所见，DataGenerator 使用的是来自 TFRecords 的相同数据，但它还额外返回了一个全零数组；不知为何，这似乎正是让它能够运行的关键所在：

class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding (inputs, all-zero targets) batches from the TFRecord dataset."""

    def __len__(self):
        # Number of batches reported per epoch.
        return steps_per_epoch

    def __getitem__(self, index):
        # `index` is not used: each call rebuilds the dataset and takes
        # its first batch.
        zero_targets = np.zeros([batch_size])
        first_batch = get_dataset().take(1)
        batch_inputs = next(iter(first_batch))
        return batch_inputs, zero_targets

我还注意到这个 Keras 示例遇到了同样的问题（如果您编辑代码以监控准确性，它会崩溃）： https://keras.io/examples/vision/captcha_ocr/

有没有办法用Dataset模仿__getitem__的行为，或者在不使用Sequence的情况下获得准确性的其他方式？

Answer 1

当您传递数据集进行训练时，您需要同时包含输出（目标值）。您的生成器函数（正确地）返回了一个元组 (inputs, outputs)；而当您直接传递数据集时，缺少的正是这个输出部分。

如果您按如下方式修改映射函数：

def parse_tfrecord_fn(example):
    """Decode one serialized Example and pair it with a placeholder target.

    Returns a `(features, target)` tuple: `features` is the parsed dict
    ('image' decoded from JPEG and converted to float32, 'label' densified,
    'label_length' as parsed) and `target` is the constant `[0]`.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    features = tf.io.parse_single_example(example, schema)

    jpeg = tf.io.decode_jpeg(features["image"], channels=3)
    features["image"] = tf.image.convert_image_dtype(jpeg, dtype="float32")
    features["label"] = tf.sparse.to_dense(features["label"])

    # Constant placeholder so each dataset element is an (inputs, targets) pair.
    placeholder_target = tf.constant([0])
    return features, placeholder_target

代码现在将在 use_generator = False 的情况下正常运行。请注意，此时把准确率作为度量指标并没有意义：该指标是将网络的输出 (y_pred) 与目标 (tf.constant([0])) 进行比较。为了测量准确率，您需要将 label 作为目标……并且您需要一个度量指标，能够将形状为 (batch_size, max_sequence_length, n_classes) 的网络输出与标签进行比较。也就是说，您需要一个稀疏分类准确率（sparse categorical accuracy）指标。

您可以在以下位置找到我的笔记本： https://colab.research.google.com/drive/1z2NCQnYlG_UIpN7bBNpXXbLwy3JE_PX2?usp=sharing

Answer 2

将 [accuracy] 指标与 tf.data 一起使用时可能存在一些问题，但我不确定这是否是您这种情况的主要原因，也不确定该问题是否仍然存在。如果我按如下方式尝试，它无论如何都可以在不使用 Sequence 的情况下（即使用 tf.data）运行。

# Same compile call as in the question, but with the
# 'sparse_categorical_accuracy' metric in place of "accuracy".
model.compile(optimizer=Adam(), metrics=['sparse_categorical_accuracy'])

如何监控 CTC 损失 function 和数据集的准确性？（包括可运行代码）

问题描述

2 个解决方案

解决方案1
2 已采纳 2021-05-19 20:37:18

解决方案2
1 2021-05-17 09:45:33

如何监控 CTC 损失 function 和数据集的准确性？ （包括可运行代码）

问题描述

2 个解决方案

解决方案1 2 已采纳 2021-05-19 20:37:18

解决方案2 1 2021-05-17 09:45:33

如何监控 CTC 损失 function 和数据集的准确性？（包括可运行代码）

解决方案1
2 已采纳 2021-05-19 20:37:18

解决方案2
1 2021-05-17 09:45:33