Keras unreasonably slower than TensorFlow

I'm trying to implement the neural network from this TensorFlow example, but using Keras.

You'll find the code for both implementations at the bottom of the post.

My problem is that the code takes around 1 min 30 s with TensorFlow, but 18 minutes with Keras!

My questions are:

  • Did I make a rookie mistake in translating the TensorFlow code to Keras code?
  • Or is Keras incredibly slow? If so, can it be fixed?

TensorFlow code:

# Load the MNIST dataset from the 'MNIST_data' directory with one-hot labels.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

import tensorflow as tf

def weight_variable(shape):
  """Return a ``tf.Variable`` of the given shape.

  The initial values are drawn from a truncated normal distribution with
  a standard deviation of 0.1.
  """
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Return a ``tf.Variable`` of the given shape filled with the constant 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
  """Convolve ``x`` with the filter ``W`` using stride 1 and 'SAME' padding."""
  unit_strides = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
  """Max-pool ``x`` over 2x2 windows with stride 2 and 'SAME' padding."""
  window = [1, 2, 2, 1]
  step = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

# --- Graph definition: two conv/pool stages, one dense layer, dropout, logits.

# Flat input vectors of 784 values, reshaped to 28x28 single-channel images.
x = tf.placeholder(tf.float32, [None, 784])
x_image = tf.reshape(x, [-1, 28, 28, 1])

# Target labels, 10 values per example.
y_ = tf.placeholder(tf.float32, [None, 10])

# Output sizes of the two convolutional layers and of the dense layer.
neurons_nb_layer_1 = 32
neurons_nb_layer_2 = 64
neurons_nb_layer_3 = 1024

# First convolution: 5x5 filters, 1 input channel -> neurons_nb_layer_1 outputs.
W_conv1 = weight_variable([5, 5, 1, neurons_nb_layer_1])
b_conv1 = bias_variable([neurons_nb_layer_1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# Second convolution: 5x5 filters, neurons_nb_layer_1 -> neurons_nb_layer_2.
W_conv2 = weight_variable([5, 5, neurons_nb_layer_1, neurons_nb_layer_2])
b_conv2 = bias_variable([neurons_nb_layer_2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# Dense layer on the flattened feature maps; the 7 * 7 factor matches a 28x28
# input halved by each of the two stride-2 pooling steps.
W_fc1 = weight_variable([7 * 7 * neurons_nb_layer_2, neurons_nb_layer_3])
b_fc1 = bias_variable([neurons_nb_layer_3])

h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * neurons_nb_layer_2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Dropout on the dense activations; keep_prob is a placeholder fed at run time.
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Output layer: 10 raw scores (logits) per example, no activation applied here.
W_fc2 = weight_variable([neurons_nb_layer_3, 10])
b_fc2 = bias_variable([10])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

# Loss: softmax cross-entropy computed from the logits, averaged over the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

# Adam optimizer with a learning rate of 1e-4.
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# Accuracy: fraction of examples whose arg-max prediction equals the arg-max label.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

import datetime

# Time 600 training steps on mini-batches of 50 examples, then evaluate on the
# test set.
start = datetime.datetime.now()
with tf.Session() as sess:
  # Variables must be initialised before any op reads them; without this the
  # first accuracy.eval() fails on uninitialized values.
  sess.run(tf.global_variables_initializer())
  for i in range(600):
    batch = mnist.train.next_batch(50)
    if i % 100 == 0:
      # Report accuracy on the current batch with dropout disabled.
      train_accuracy = accuracy.eval(feed_dict={
          x: batch[0], y_: batch[1], keep_prob: 1.0})
      print('step %d, training accuracy %g' % (i, train_accuracy))
    # One optimisation step, keeping each dense activation with probability 0.5.
    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

  print('test accuracy %g' % accuracy.eval(feed_dict={
      x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
end = datetime.datetime.now()
# Elapsed wall-clock time in whole seconds, printed as minutes and seconds.
time = (end - start).seconds
print(time//60, "min", time%60,"s")

Keras code:

# Load the MNIST dataset from the 'MNIST_data' directory with one-hot labels
# (same loader as the TensorFlow version).
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

import keras
from keras.models import Sequential

# Keras translation of the TensorFlow network, built as a Sequential model.
model = Sequential()

# Biases start at the constant 0.1, as in the TensorFlow bias_variable helper.
bias_initializer = keras.initializers.Constant(value = 0.1)

neurons_nb_layer_1 = 32
neurons_nb_layer_2 = 64
neurons_nb_layer_3 = 1024

from keras.layers import Reshape, Conv2D, MaxPooling2D, Dropout, Flatten, Dense
# Flat 784-value inputs are reshaped to 28x28 single-channel images.
model.add(Reshape((28, 28, 1), input_shape=(784,)))
# NOTE(review): kernel_size = 5*5 evaluates to the integer 25, which Keras
# treats as a 25x25 kernel; the TensorFlow version uses 5x5 filters.
model.add(Conv2D(filters = neurons_nb_layer_1, kernel_size = 5*5, padding = 'same', activation = "relu", bias_initializer = bias_initializer))
model.add(Conv2D(filters = neurons_nb_layer_2, kernel_size = 5*5, padding = 'same', activation = "relu", bias_initializer = bias_initializer))
# NOTE(review): MaxPooling2D and Flatten are imported but never added, whereas
# the TensorFlow version pools after each convolution and flattens before the
# dense layer.
model.add(Dense(units = neurons_nb_layer_3, activation = "relu", bias_initializer = bias_initializer))
model.add(Dropout(rate = 0.5))
# NOTE(review): the output layer uses relu, while the TensorFlow version feeds
# raw logits to a softmax cross-entropy loss.
model.add(Dense(units = 10, activation = "relu"))


# The original compile() call was never closed, which is a syntax error.
# metrics = ['accuracy'] is presumably the argument that was lost (the call
# ended on a trailing comma and the training loop stores model.evaluate() in
# `train_accuracy`) -- confirm against the author's original script.
model.compile(loss = keras.losses.categorical_crossentropy,
              optimizer = 'adam',
              metrics = ['accuracy'])

import datetime

# Time 600 single-batch updates on mini-batches of 50 examples.
start2 = datetime.datetime.now()
for i in range(600):
    batch = mnist.train.next_batch(50)
    images, labels = batch[0], batch[1]
    if not i % 100:
        # Every 100th step, evaluate the model on the current batch.
        train_accuracy = model.evaluate(images, labels)
        print("step", i, ":", train_accuracy)
    model.train_on_batch(images, labels)
end2 = datetime.datetime.now()
# Elapsed wall-clock time in whole seconds, printed as minutes and seconds.
time2 = (end2 - start2).seconds
minutes2, seconds2 = divmod(time2, 60)
print(minutes2, "min", seconds2, "s")

According to the Keras documentation, kernel_size = 5*5 evaluates to the integer 25 and therefore specifies a 25x25 convolution kernel, not a 5x5 one as in your TensorFlow example.
You might want to use kernel_size=(5,5) or kernel_size=5 instead.

