I have two identical models with the same parameters. Both are trained on the MNIST dataset: the first with model.fit() and the second with model.train_on_batch(). The second model gives lower accuracy. What could be the reason, and how can I fix it?
Data preparation:
# Hyper-parameters used by the training code that follows.
batch_size = 150
num_classes = 10
epochs = 12

# input image dimensions
img_rows, img_cols = 28, 28

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Place the single channel axis where the backend's image format expects it,
# then reshape both splits to (num_samples,) + input_shape.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Cast to float32 and divide by 255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
MODEL 1:
# Model 1: eight stacked 3x3 ReLU convolutions, one 2x2 max-pool, dropout,
# a 128-unit dense layer and a softmax over num_classes.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
# The remaining convolutions differ only in their filter count.
for filters in (64, 128, 256, 128, 64, 64, 32):
    model.add(Conv2D(filters, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

# Train with the built-in loop, validating on the test split.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test, y_test),
)

# score is indexed as [loss, accuracy] by the prints below.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
MODEL 1 ACCURACY:
Test loss: 0.023489486496470636 Test accuracy: 0.9924
MODEL 2:
# Model 2: same layer stack and compile settings as Model 1, but trained
# manually with train_on_batch() instead of fit().
model2 = Sequential([
    Conv2D(32, kernel_size=(3, 3),
           activation='relu',
           input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(128, (3, 3), activation='relu'),
    Conv2D(256, (3, 3), activation='relu'),
    Conv2D(128, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model2.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

# Settings for the manual training loop.
batch_size2 = 150
epochs2 = 12
# Number of full batches per epoch (floor division).
step_epoch = x_train.shape[0] // batch_size2
def next_batch_train(i):
    """Return the (x, y) training slice of length batch_size2 starting at i.

    NOTE: ``i`` is used directly as a sample offset, not as a batch number,
    so calls with consecutive ``i`` return windows shifted by one sample
    that overlap almost entirely.
    """
    stop = i + batch_size2
    return x_train[i:stop, :, :, :], y_train[i:stop, :]
# Manual training: epochs2 epochs of step_epoch train_on_batch() calls each.
iter_num = 0
epoch_num = 0
model_outputs = []
loss_history = []  # flat list extended with whatever train_on_batch returns
while epoch_num < epochs2:
    for iter_num in range(step_epoch):
        # The iteration counter is passed straight to next_batch_train.
        x, y = next_batch_train(iter_num)
        loss_history.extend(model2.train_on_batch(x, y))
    print("EPOCH {} FINISHED".format(epoch_num + 1))
    epoch_num += 1
    iter_num = 0  # reset counter

score = model2.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
MODEL 2 ACCURACY:
Test loss: 0.5577236003954947 Test accuracy: 0.9387
Four sources of difference:
1. fit() uses shuffle=True by default; this includes the very first epoch (and subsequent ones).
2. You don't set a random seed, so the two models do not start from the same initial weights (see reset_seeds() below).
3. You have step_epoch number of batches, but iterate over step_epoch - 1; change < to <=.
4. Your next_batch_train slicing is way off; here's what it's doing vs. what it needs to be doing:
   doing:  x_train[0:128] --> x_train[1:129] --> x_train[2:130] --> ...
   needed: x_train[0:128] --> x_train[128:256] --> x_train[256:384] --> ...
To remedy, you should include a shuffling step in your model2's train loop - or use fit with shuffle=False (not recommended). Also, a tip: 64, 128, 256, 128, 64 Conv2D filters is a pretty bad arrangement; what you're doing is upsampling greatly, in a sense "fabricating data" - if you're going to use more filters, also increase their strides proportionally so that the total tensor size between the layers remains ~same (or less).
All mentioned fixes + updated seed function below; run it for 1 epoch, as 12 takes too long - if 1 works, so will 12. You can keep your original model if you'd like, but I recommend testing with the one below, as it's significantly faster.
import tensorflow as tf
import numpy as np
import random
def reset_seeds():
    """Re-seed NumPy, Python's ``random`` and TensorFlow with fixed values.

    The TensorFlow seeding call is chosen from the first character of
    ``tf.__version__``: ``tf.random.set_seed`` when it is '2', otherwise
    ``tf.set_random_seed``.
    """
    np.random.seed(1)
    random.seed(2)
    running_tf2 = tf.__version__.startswith('2')
    seed_tensorflow = tf.random.set_seed if running_tf2 else tf.set_random_seed
    seed_tensorflow(3)
    print("RANDOM SEEDS RESET")
# Seed every RNG before the layers are created.
reset_seeds()

# Smaller, faster network: three 3x3 ReLU convolutions, one 2x2 max-pool,
# dropout, a 128-unit dense layer and a softmax over num_classes.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3),
           activation='relu',
           input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

# The manual training loops that follow call model2.train_on_batch() and
# model2.evaluate(). Without this alias they would keep using the network
# built earlier under that name, and this one would never be trained.
model2 = model
def next_batch_train(i):
    """Return the i-th consecutive, non-overlapping (x, y) training batch.

    Batch ``i`` covers samples ``i * batch_size2`` up to, but not including,
    ``(i + 1) * batch_size2``.
    """
    start = i * batch_size2
    stop = start + batch_size2
    return x_train[start:stop, :, :, :], y_train[start:stop, :]
# Manual training without shuffling: every epoch visits batches
# 0 .. step_epoch - 1 in the same order.
iter_num = 0
epoch_num = 0
model_outputs = []
loss_history = []  # flat list extended with whatever train_on_batch returns
while epoch_num < epochs2:
    for iter_num in range(step_epoch):
        batch_x, batch_y = next_batch_train(iter_num)
        x, y = batch_x, batch_y
        loss_history.extend(model2.train_on_batch(x, y))
    print("EPOCH {} FINISHED".format(epoch_num + 1))
    epoch_num += 1
    iter_num = 0  # reset counter

score = model2.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Better alternative: use shuffling
class TrainBatches:
    """Serve fixed-size (x, y) mini-batches in a shuffleable batch order.

    ``indices`` holds one entry per *batch*; entry ``k`` selects samples
    ``k * batch_size`` up to ``(k + 1) * batch_size``. Shuffling reorders
    whole batches, so every sample is still visited exactly once per pass.
    The last batch is shorter when the number of samples is not a multiple
    of ``batch_size``.
    """

    def __init__(self, x_train, y_train, batch_size):
        """Store the data and build the initial, in-order batch list.

        Args:
            x_train: Sliceable sequence of inputs supporting ``len()``.
            y_train: Sliceable sequence of targets aligned with ``x_train``.
            batch_size: Number of samples per batch; must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        # Ceil division: count the trailing partial batch, if there is one.
        num_batches = -(-len(x_train) // batch_size)
        # Index batches, not samples. Multiplying a shuffled *sample* index
        # by batch_size would point far beyond the end of the data.
        self.indices = list(range(num_batches))
        self.counter = 0

    def get_next(self):
        """Return the next (x, y) batch and advance the position counter.

        When called more often than there are batches (without resetting
        ``counter``), empty slices are returned instead of raising.
        """
        if self.counter < len(self.indices):
            start = self.indices[self.counter] * self.batch_size
        else:
            # Past the last batch: slice from the end so the result is empty.
            start = len(self.x_train)
        end = start + self.batch_size
        self.counter += 1
        return self.x_train[start:end], self.y_train[start:end]

    def shuffle(self):
        """Randomly reorder the batches in place using NumPy's global RNG."""
        np.random.shuffle(self.indices)
        print("BATCHES SHUFFLED")
# Manual training with batch-order shuffling between epochs.
train_batches = TrainBatches(x_train, y_train, batch_size)

# Ceil division: a trailing partial batch is trained on, but no extra
# (empty) batch is requested when the data divides evenly into batches.
batches_per_epoch = -(-x_train.shape[0] // batch_size)

# `for` loops own their counters, so this snippet no longer depends on
# epoch_num / iter_num left over from an earlier training loop. With the
# stale values the `while` version would not have executed at all.
for epoch_num in range(epochs2):
    train_batches.counter = 0  # start each epoch at the first batch
    for _ in range(batches_per_epoch):
        x, y = train_batches.get_next()
        # Flat-extends loss_history with the returned values, as the
        # earlier loops do.
        loss_history += model2.train_on_batch(x, y)
    # Shuffle after the epoch, so the first epoch runs in original order.
    train_batches.shuffle()
    print("EPOCH {} FINISHED".format(epoch_num + 1))
Note that this won't guarantee your results will agree with fit(), as fit() may shuffle differently (even with a random seed) - but the implementation is in fact correct. The above also doesn't shuffle before the first epoch (easy to change).
One difference I noticed between the two models is that in the second one you don't shuffle the training data after each epoch; .fit() shuffles the training data by default.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.