Validation accuracy not improving with imbalanced data

I am attempting to make predictions on the Kaggle Diabetic Retinopathy dataset with a CNN model. There are five classes to predict. The label-wise distribution of the data (as fractions) is shown below.

0    0.73
2    0.15
1    0.07
3    0.02
4    0.02
Name: level, dtype: float64
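
(This breakdown is presumably the output of a pandas value_counts call along the following lines; the CSV filename here is a hypothetical placeholder, while label_csv and the level column are taken from the code further down.)

import pandas as pd

# label_csv is the dataframe later passed to flow_from_dataframe.
# The filename is a placeholder; use your actual labels file.
label_csv = pd.read_csv('trainLabels.csv')
print(label_csv['level'].value_counts(normalize=True).round(2))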

The relevant code blocks are shown below.

# Network training parameters
EPOCHS = 25
BATCH_SIZE = 50
VERBOSE = 1
lr = 0.0001
OPTIMIZER = tf.keras.optimizers.Adam(lr)
target_size = (256, 256)

NB_CLASSES = 5

The image generator and preprocessing code is as follows.

data_gen=tf.keras.preprocessing.image.ImageDataGenerator(rotation_range=45,
                                                         horizontal_flip=True,
                                                         vertical_flip=True,
                                                         rescale=1./255,
                                                         validation_split=0.2)

train_gen=data_gen.flow_from_dataframe(
    dataframe=label_csv, directory=IMAGE_FOLDER_PATH,
    x_col='image', y_col='level',
    target_size=target_size,
    class_mode='categorical',
    batch_size=BATCH_SIZE, shuffle=True,
    subset='training',
    validate_filenames=True
    )

Found 28101 validated image filenames belonging to 5 classes.

validation_gen=data_gen.flow_from_dataframe(
    dataframe=label_csv, directory=IMAGE_FOLDER_PATH,
    x_col='image', y_col='level',
    target_size=target_size,
    class_mode='categorical',
    batch_size=BATCH_SIZE, shuffle=True,
    subset='validation',
    validate_filenames=True
    )

Found 7025 validated image filenames belonging to 5 classes.

train_gen.image_shape
(256, 256, 3)

The model-building code is as follows.

# Architect your CNN model1
model1=tf.keras.models.Sequential()
model1.add(tf.keras.layers.Conv2D(256,(3,3),input_shape=INPUT_SHAPE,activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(128,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(64,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(32,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Flatten())
model1.add(tf.keras.layers.Dense(units=512,activation='relu'))
model1.add(tf.keras.layers.Dense(units=256,activation='relu'))
model1.add(tf.keras.layers.Dense(units=128,activation='relu'))
model1.add(tf.keras.layers.Dense(units=64,activation='relu'))
model1.add(tf.keras.layers.Dense(units=32,activation='relu'))
model1.add(tf.keras.layers.Dense(units=NB_CLASSES,activation='softmax'))

model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 254, 254, 256)     7168      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 127, 127, 256)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 125, 125, 128)     295040    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 62, 62, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 60, 60, 64)        73792     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 30, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 28, 28, 32)        18464     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 14, 14, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 6272)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               3211776   
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 165       
=================================================================
Total params: 3,780,965
Trainable params: 3,780,965
Non-trainable params: 0


# Compile model1

model1.compile(optimizer=OPTIMIZER, metrics=['accuracy'], loss='categorical_crossentropy')

print(train_gen.n, train_gen.batch_size)
28101 50

STEP_SIZE_TRAIN=train_gen.n//train_gen.batch_size
STEP_SIZE_VALID=validation_gen.n//validation_gen.batch_size
print(STEP_SIZE_TRAIN)
print(STEP_SIZE_VALID)
562
140

# Fit the model1

history1=model1.fit(train_gen,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=validation_gen,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=EPOCHS,verbose=1)

The epoch history is shown below; training was stopped after epoch 14 because no improvement was observed.

Epoch 1/25
562/562 [==============================] - 1484s 3s/step - loss: 0.9437 - accuracy: 0.7290 - val_loss: 0.8678 - val_accuracy: 0.7309
Epoch 2/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8748 - accuracy: 0.7337 - val_loss: 0.8673 - val_accuracy: 0.7309
Epoch 3/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8681 - accuracy: 0.7367 - val_loss: 0.8614 - val_accuracy: 0.7306
Epoch 4/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8619 - accuracy: 0.7333 - val_loss: 0.8592 - val_accuracy: 0.7306
Epoch 5/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8565 - accuracy: 0.7375 - val_loss: 0.8625 - val_accuracy: 0.7304
Epoch 6/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8608 - accuracy: 0.7357 - val_loss: 0.8556 - val_accuracy: 0.7310
Epoch 7/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8568 - accuracy: 0.7335 - val_loss: 0.8614 - val_accuracy: 0.7304
Epoch 8/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8541 - accuracy: 0.7349 - val_loss: 0.8591 - val_accuracy: 0.7301
Epoch 9/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8582 - accuracy: 0.7321 - val_loss: 0.8583 - val_accuracy: 0.7303
Epoch 10/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8509 - accuracy: 0.7354 - val_loss: 0.8599 - val_accuracy: 0.7311
Epoch 11/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8521 - accuracy: 0.7325 - val_loss: 0.8584 - val_accuracy: 0.7304
Epoch 12/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8422 - accuracy: 0.7352 - val_loss: 0.8481 - val_accuracy: 0.7307
Epoch 13/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8511 - accuracy: 0.7345 - val_loss: 0.8477 - val_accuracy: 0.7307
Epoch 14/25
562/562 [==============================] - 1462s 3s/step - loss: 0.8314 - accuracy: 0.7387 - val_loss: 0.8528 - val_accuracy: 0.7300
Epoch 15/25
 73/562 [==>...........................] - ETA: 17:12 - loss: 0.8388 - accuracy: 0.7344

Validation accuracy does not improve beyond 73% even after several epochs. In an earlier trial I tried a learning rate of 0.001, but the outcome was the same, with no improvement.

  1. I would welcome suggestions to improve the model's accuracy.
  2. Also, how can we use grid search when the image generator is used for preprocessing? Suggestions for this would also be welcome. Many thanks in advance.

Your problem is most likely due to overfitting. Your data is quite unbalanced, so in addition to finding a better model, a better learning rate, or a better optimizer, you could also create a custom generator to augment and sample your data in a more balanced way.
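
As a side note on the grid-search part of the question: scikit-learn's GridSearchCV expects in-memory arrays, so it does not plug directly into Keras generators; a plain Python loop over the hyper-parameter grid is a simple substitute. Below is a minimal sketch, assuming a hypothetical build_model helper (the small architecture inside it is only a stand-in for model1) and reusing train_gen and validation_gen from the question:

import itertools
import tensorflow as tf

def build_model(lr, optimizer_cls):
    # Hypothetical helper: rebuild the model from scratch so every
    # trial starts from fresh weights. Replace this small stand-in
    # with the full model1 stack from the question.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu',
                               input_shape=(256, 256, 3)),
        tf.keras.layers.MaxPool2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(5, activation='softmax'),
    ])
    model.compile(optimizer=optimizer_cls(lr),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

learning_rates = [1e-3, 1e-4, 1e-5]
optimizers = [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD]

results = {}
for lr, opt in itertools.product(learning_rates, optimizers):
    model = build_model(lr, opt)
    history = model.fit(train_gen,
                        steps_per_epoch=STEP_SIZE_TRAIN,
                        validation_data=validation_gen,
                        validation_steps=STEP_SIZE_VALID,
                        epochs=3, verbose=0)  # a few epochs per trial
    results[(lr, opt.__name__)] = max(history.history['val_accuracy'])

print(max(results, key=results.get))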

I use custom generators for most of the models at work. I can't share the full generator code, but I'll show you a pseudocode example of how to create one. It's actually quite fun to play around with and add more steps; you can, and probably should, add pre-processing and post-processing steps, but I hope this code gives you an overall idea of the process.

import random
import numpy as np


class MyCustomGenerator:

    def __init__(self) -> None:
        # Load the dataset into a dict; if it's too big, just load filenames
        # here and read the images at runtime. Each dict key is a class name
        # and each value is a list of images (or filenames).
        self.dataSet, self.imageHeight, self.imageWidth, self.imageChannels = loadData()

    def labelBinarizer(self, label):
        # This is where you convert a class name into the target vector Y.
        pass

    def augment(self, image):
        # This is where you augment your images.
        pass

    def yieldData(self):
        while True:  # Keras generators need to run indefinitely
            # Cycling over the classes yields one sample per class in turn,
            # so batches stay balanced regardless of the class frequencies.
            for className, data in self.dataSet.items():
                yield self.augment(random.choice(data)), self.labelBinarizer(className)

    def getEmptyBatch(self, batchSize):
        # Empty X and Y arrays plus a fill index that starts at 0.
        return (
            np.empty([batchSize, self.imageHeight, self.imageWidth, self.imageChannels]),
            np.empty([batchSize, len(self.dataSet)]), 0)

    def getBatches(self, batchSize):
        X, Y, i = self.getEmptyBatch(batchSize)
        for image, label in self.yieldData():
            X[i, ...] = image
            Y[i, ...] = label
            i += 1
            if i == batchSize:
                yield X, Y
                X, Y, i = self.getEmptyBatch(batchSize)


# your model definition and other stuff
# ...
# ...
# ...
# With a generator defined this way, you have to set the number of steps per epoch yourself
generator = MyCustomGenerator()
model.fit(
    generator.getBatches(batchSize=256),
    steps_per_epoch=500,
    # other params
)
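
One design note: because yieldData loops forever, Keras cannot infer an epoch length from this generator, so steps_per_epoch must be chosen by hand. A common heuristic (an assumption, not something fixed by the example) is to size an epoch as roughly one pass over the data:

# Hypothetical heuristic: one nominal pass over the whole dataset per epoch.
totalImages = sum(len(images) for images in generator.dataSet.values())
stepsPerEpoch = max(1, totalImages // 256)  # 256 = the batch size used above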
