I've been trying to speed up training of my CRNN network for optical character recognition, but I can't get the accuracy metric working when using TFRecords and tf.data.Dataset
pipelines. I previously used a Keras Sequence and had it working. Here is a complete runnable toy example showing my problem (tested with TensorFlow 2.4.1):
import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.keras import Input, Model
from tensorflow.python.keras.layers import Dense, Layer, Bidirectional, GRU, Reshape, Activation
from tensorflow.python.keras.optimizer_v2.adam import Adam
# Passed to the tf.data pipeline (parallel reads, parallel map calls, prefetch).
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Alphabet of the toy task; the model uses len(CHAR_VECTOR) + 1 output classes.
CHAR_VECTOR = "ABC"
# Dimensions of the random images and of the model's 'image' input.
IMG_W = 10
IMG_H = 10
N_CHANNELS = 3
class CTCLayer(Layer):
    """Keras layer that registers the CTC batch cost as a model loss.

    The predictions are passed through unchanged; the loss is attached to the
    layer with ``self.add_loss()``.
    """

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred, label_length):
        """Add the CTC loss for the batch and return ``y_pred`` as-is."""
        n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
        n_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        # Every sample gets the same input length: the size of axis 1 of y_pred.
        input_length = n_steps * tf.ones(shape=(n_samples, 1), dtype="int64")

        # Compute the training-time loss value and attach it to the layer.
        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))

        # At test time, just return the computed predictions
        return y_pred
def get_model():
    """Build the toy recurrent model with the CTC loss layer as its output.

    The model takes three inputs, named 'image', 'label' and 'label_length',
    and outputs the softmax predictions passed through ``CTCLayer`` (which
    adds the CTC loss to the model).

    Returns:
        The uncompiled ``Model``.
    """
    # One class more than there are characters (presumably the CTC blank
    # label -- confirm against the ctc_batch_cost documentation).
    n_classes = len(CHAR_VECTOR) + 1

    # Named `image` rather than `input` so the builtin is not shadowed.
    image = Input(name='image', shape=(IMG_W, IMG_H, N_CHANNELS), dtype='float32')
    label = Input(name='label', shape=[None], dtype='float32')
    label_length = Input(name='label_length', shape=[None], dtype='int64')

    # Flatten the trailing axes so the tensor becomes (IMG_W, features) per sample.
    x = Reshape(target_shape=(IMG_W, np.prod(image.shape[2:])), name='reshape')(image)
    x = Dense(24, activation='relu', name='dense1')(x)
    x = Bidirectional(GRU(24, return_sequences=True, name="GRU"), merge_mode="sum")(x)
    x = Dense(n_classes, name='dense2')(x)
    y_pred = Activation('softmax', name='softmax')(x)

    output = CTCLayer(name="ctc")(label, y_pred, label_length)
    return Model(inputs=[image, label, label_length], outputs=output)
def image_feature(value):
    """Return a bytes_list Feature holding `value` encoded as JPEG."""
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes]))
def float_feature_list(value):
    """Return a float_list Feature built from a sequence of numbers."""
    floats = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=floats)
def int64_feature(value):
    """Return an int64_list Feature wrapping a single bool / enum / int / uint."""
    ints = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=ints)
def create_example(image, label, label_length):
    """Pack one sample into a ``tf.train.Example``.

    The example carries three features: "image" (JPEG bytes), "label"
    (float list) and "label_length" (single int64).
    """
    features = tf.train.Features(
        feature={
            "image": image_feature(image),
            "label": float_feature_list(label),
            "label_length": int64_feature(label_length),
        }
    )
    return tf.train.Example(features=features)
def parse_tfrecord_fn(example):
    """Parse one serialized Example into a dict of tensors.

    The returned dict has the keys "image" (decoded JPEG converted to
    float32), "label" (densified) and "label_length".
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)

    decoded = tf.io.decode_jpeg(parsed["image"], channels=3)
    parsed["image"] = tf.image.convert_image_dtype(decoded, dtype="float32")
    # VarLenFeature yields a sparse tensor; densify it.
    parsed["label"] = tf.sparse.to_dense(parsed["label"])
    return parsed
def generate_tfrecords(n):
    """Write `n` random samples to the TFRecord file named by the global `filename`.

    Each sample is a random image, a random label of `max_text_len` entries
    and a random label length between 1 and `max_text_len`.
    """
    with tf.io.TFRecordWriter(filename) as writer:
        for _ in range(n):
            random_img = np.random.random((IMG_W, IMG_H, N_CHANNELS))
            label_length = random.randint(1, max_text_len)
            label = np.random.randint(0, len(CHAR_VECTOR), max_text_len)
            serialized = create_example(random_img, label, label_length).SerializeToString()
            writer.write(serialized)
class DataGenerator(tf.keras.utils.Sequence):
    """Sequence that pairs a batch from the dataset with dummy zero targets."""

    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        # `index` is not used: every call rebuilds the dataset and takes its
        # first batch.
        dummy_targets = np.zeros([batch_size])
        first_batch = next(iter(get_dataset().take(1)))
        return first_batch, dummy_targets
def get_dataset():
    """Regenerate the TFRecord file and return a batched dataset reading it.

    Writes ``batch_size * epochs * steps_per_epoch`` samples, then builds a
    pipeline that parses, batches and prefetches them.
    """
    n_samples = batch_size * epochs * steps_per_epoch
    generate_tfrecords(n_samples)

    records = tf.data.TFRecordDataset(filename, num_parallel_reads=AUTOTUNE)
    parsed = records.map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    return parsed.batch(batch_size).prefetch(AUTOTUNE)
if __name__ == "__main__":
    # These settings are module globals: the data helpers above read them.
    batch_size, epochs, steps_per_epoch = 9, 7, 8
    max_text_len = 5
    filename = "test.tfrec"
    use_generator = False

    if use_generator:
        data = DataGenerator()
    else:
        data = get_dataset()

    model = get_model()
    # This fails when use_generator == False, removing the
    # metric solves it
    model.compile(optimizer=Adam(), metrics=["accuracy"])
    model.fit(data, epochs=epochs, steps_per_epoch=steps_per_epoch)
Set use_generator = True or remove metrics=["accuracy"] and it will run without error.
As you can see, the DataGenerator uses the same data from the TFRecords, but it also returns some zeros, and for whatever reason this seems to be the magic sauce:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence returning (inputs, zeros) so that a target is present."""

    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        outputs = np.zeros([batch_size])
        # Take the first batch of a freshly built dataset.
        batches = iter(get_dataset().take(1))
        return next(batches), outputs
I also noticed that this Keras example suffers from the same problem (it crashes if you edit the code to monitor accuracy): https://keras.io/examples/vision/captcha_ocr/
Is there any way to mimic the behaviour of __getitem__ with the Dataset, or some other way of getting the accuracy without using a Sequence?
When you pass the dataset for training, you need to include the outputs. Your generator function (correctly) returns a tuple (inputs, outputs); when you pass the dataset directly, the outputs are missing.
If you modify the mapper function as such:
def parse_tfrecord_fn(example):
    """Parse one serialized Example into an (inputs, target) pair.

    The inputs are a dict with the keys "image", "label" and "label_length";
    the target is the constant ``[0]``.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    features = tf.io.parse_single_example(example, schema)

    decoded = tf.io.decode_jpeg(features["image"], channels=3)
    features["image"] = tf.image.convert_image_dtype(decoded, dtype="float32")
    features["label"] = tf.sparse.to_dense(features["label"])

    # Return a dummy target alongside the inputs.
    dummy_target = tf.constant([0])
    return features, dummy_target
The code will now run without errors with use_generator = False. Note that accuracy as a metric doesn't make sense here. The metric compares the output of the network (y_pred) with the target (tf.constant([0])). In order to measure accuracy you need to feed the label as the target, and you need a function that can compare the output of your network, which has the shape (batch_size, max_sequence_length, n_classes), with the labels, i.e. you need a sparse categorical accuracy metric.
You can find my notebook at: https://colab.research.google.com/drive/1z2NCQnYlG_UIpN7bBNpXXbLwy3JE_PX2?usp=sharing
There is probably some issue with the [accuracy] metric when used with tf.data, but I'm not sure whether this is the main cause in your case or whether the issue still exists. If I compile as follows, it runs anyway without a Sequence (i.e. with tf.data):
# Compile with the sparse categorical accuracy metric in place of "accuracy".
model.compile(optimizer=Adam(), metrics=['sparse_categorical_accuracy'])
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.