
How to monitor accuracy with CTC loss function and Datasets? (runnable code included)

I've been trying to speed up training of my CRNN network for optical character recognition, but I can't get the accuracy metric working when I switch to TFRecords and a tf.data.Dataset pipeline. It worked previously when I fed the data with a Keras Sequence. Here is a complete, runnable toy example showing the problem (tested with TensorFlow 2.4.1):

import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.keras import Input, Model
from tensorflow.python.keras.layers import Dense, Layer, Bidirectional, GRU, Reshape, Activation
from tensorflow.python.keras.optimizer_v2.adam import Adam

AUTOTUNE = tf.data.experimental.AUTOTUNE
CHAR_VECTOR = "ABC"
IMG_W = 10
IMG_H = 10
N_CHANNELS = 3


class CTCLayer(Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred, label_length):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred


def get_model():
    n_classes = len(CHAR_VECTOR) + 1

    input = Input(name='image', shape=(IMG_W, IMG_H, N_CHANNELS), dtype='float32')
    label = Input(name='label', shape=[None], dtype='float32')
    label_length = Input(name='label_length', shape=[None], dtype='int64')

    x = Reshape(target_shape=(IMG_W, np.prod(input.shape[2:])), name='reshape')(input)
    x = Dense(24, activation='relu', name='dense1')(x)
    x = Bidirectional(GRU(24, return_sequences=True, name="GRU"), merge_mode="sum")(x)
    x = Dense(n_classes, name='dense2')(x)
    y_pred = Activation('softmax', name='softmax')(x)

    output = CTCLayer(name="ctc")(label, y_pred, label_length)

    m = Model(inputs=[input, label, label_length], outputs=output)
    return m


def image_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.encode_jpeg(value).numpy()]))


def float_feature_list(value):
    """Returns a list of float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def create_example(image, label, label_length):
    feature = {
        "image": image_feature(image),
        "label": float_feature_list(label),
        "label_length": int64_feature(label_length),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def parse_tfrecord_fn(example):
    feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)
    example["image"] = tf.image.convert_image_dtype(tf.io.decode_jpeg(example["image"], channels=3), dtype="float32")
    example["label"] = tf.sparse.to_dense(example["label"])

    return example


def generate_tfrecords(n):
    with tf.io.TFRecordWriter(filename) as writer:
        for i in range(n):
            # encode_jpeg (used in image_feature) expects uint8 pixel values
            random_img = np.random.randint(0, 256, (IMG_W, IMG_H, N_CHANNELS), dtype=np.uint8)
            label_length = random.randint(1, max_text_len)
            label = np.random.randint(0, len(CHAR_VECTOR), max_text_len)
            example = create_example(random_img, label, label_length)
            writer.write(example.SerializeToString())


class DataGenerator(tf.keras.utils.Sequence):
    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        outputs = np.zeros([batch_size])
        dataset = get_dataset()
        inputs = next(iter(dataset.take(1)))
        return inputs, outputs


def get_dataset():
    generate_tfrecords(batch_size * epochs * steps_per_epoch)
    dataset = (
        tf.data.TFRecordDataset(filename, num_parallel_reads=AUTOTUNE)
        .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )
    return dataset


if __name__ == "__main__":
    batch_size = 9
    epochs = 7
    steps_per_epoch = 8
    max_text_len = 5
    filename = "test.tfrec"
    use_generator = False
    data = DataGenerator() if use_generator else get_dataset()

    model = get_model()
    # This fails when use_generator == False; removing the metric fixes it
    model.compile(optimizer=Adam(), metrics=["accuracy"])
    model.fit(data, epochs=epochs, steps_per_epoch=steps_per_epoch)

Set use_generator = True or remove metrics=["accuracy"] and it will run without error.

As you can see, the DataGenerator uses the same data from the TFRecords, but it also returns some zeros as dummy targets, and for whatever reason that seems to be the magic sauce:

class DataGenerator(tf.keras.utils.Sequence):
    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        outputs = np.zeros([batch_size])
        dataset = get_dataset()
        inputs = next(iter(dataset.take(1)))
        return inputs, outputs

I also noticed that this Keras example suffers from the same problem (it crashes if you edit the code to monitor accuracy): https://keras.io/examples/vision/captcha_ocr/

Is there any way to mimic the behaviour of __getitem__ with the Dataset, or some other way of getting the accuracy without using a Sequence?

When you pass the dataset for training you need to include the outputs (targets). Your generator function correctly returns a tuple (inputs, outputs); when you pass the dataset directly, that second element is missing.

If you modify the mapper function as follows:

def parse_tfrecord_fn(example):
    feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    tf_example = tf.io.parse_single_example(example, feature_description)
    tf_example["image"] = tf.image.convert_image_dtype(tf.io.decode_jpeg(tf_example["image"], channels=3), dtype="float32")
    tf_example["label"] = tf.sparse.to_dense(tf_example["label"])

    return tf_example, tf.constant([0])

The code will now run without errors with use_generator = False. Note that accuracy as a metric doesn't make sense here: the metric compares the output of the network (y_pred) with the target (tf.constant([0])). In order to measure accuracy you need to feed the label as the target, and you need a function that can compare the output of your network, which has shape (batch_size, max_sequence_length, n_classes), with the labels, i.e. a sparse categorical accuracy metric.
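For a metric that is actually comparable to OCR accuracy, another option (not spelled out in this answer) is to decode the CTC output and compare the decoded sequences with the labels, for example at the end of each epoch. Below is a minimal sketch, assuming the model and get_dataset() from the question (dict-shaped batches, no targets) and that the last class is the CTC blank, which is the default for K.ctc_batch_cost and K.ctc_decode; the SequenceAccuracy callback name and structure are purely illustrative:

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model


class SequenceAccuracy(tf.keras.callbacks.Callback):
    """Illustrative callback: decode the softmax output with K.ctc_decode and
    report the fraction of samples whose decoded sequence matches the label."""

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset  # batches as produced by get_dataset() (dicts of features)

    def on_epoch_end(self, epoch, logs=None):
        # Predict from the softmax layer only; the CTC layer is only needed for the loss.
        pred_model = Model(self.model.get_layer("image").input,
                           self.model.get_layer("softmax").output)
        matches, total = 0, 0
        for batch in self.dataset:
            y_pred = pred_model.predict(batch["image"])
            seq_len = np.full(y_pred.shape[0], y_pred.shape[1])
            decoded, _ = K.ctc_decode(y_pred, input_length=seq_len, greedy=True)
            decoded = decoded[0].numpy()  # (batch, time_steps), padded with -1
            labels = batch["label"].numpy().astype(np.int64)
            lengths = batch["label_length"].numpy().reshape(-1)
            for pred, true, n in zip(decoded, labels, lengths):
                pred = pred[pred != -1]  # strip the -1 padding
                total += 1
                if len(pred) == n and np.array_equal(pred, true[:n]):
                    matches += 1
        print(f" - sequence accuracy: {matches / max(total, 1):.3f}")

You would then pass it to fit, e.g. model.fit(data, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[SequenceAccuracy(get_dataset())]).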

You can find my notebook at: https://colab.research.google.com/drive/1z2NCQnYlG_UIpN7bBNpXXbLwy3JE_PX2?usp=sharing

There is probably some issue with metrics=["accuracy"] and tf.data, but I'm not sure whether that is the main cause in your case or whether the issue still exists. If I try the following, it runs without a Sequence (i.e. with tf.data):

model.compile(optimizer=Adam(), metrics=['sparse_categorical_accuracy'])
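For completeness, a sketch of the toy example trained straight from the tf.data pipeline, assuming the dataset mapper has been modified to return a dummy target as in the other answer (features, tf.constant([0])); it runs, but the metric only compares the per-timestep argmax against that dummy zero, so it is not a meaningful OCR accuracy:

model = get_model()
model.compile(optimizer=Adam(), metrics=["sparse_categorical_accuracy"])
model.fit(get_dataset(), epochs=epochs, steps_per_epoch=steps_per_epoch)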
