
Tensorflow GPU version CNN CuDNN error on Windows 10

I'm trying to train a CNN on CIFAR-100 with TensorFlow in Python, but the errors CUDA_ERROR_OUT_OF_MEMORY, CUDNN_STATUS_NOT_INITIALIZED, and CUDNN_STATUS_BAD_PARAM keep bothering me. TensorFlow is installed in an Anaconda virtual environment on my machine; the Python version is Anaconda Python 3.5 and the TensorFlow version is 1.1.0. Here's my code:

tf_cifar_learning.py:

# Set working directory

import os
dir_model = "c:/tf_model_cifar100"

# Modules needed

import numpy as np
import tensorflow as tf
import pandas as pd
from mlxtend.preprocessing import one_hot


# Load CIFAR Data
from batch import next_batch
from read import unpickle
import time
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.995)


# Prepare test data

testdata = unpickle('test')[b'data']
testdata1 = testdata.astype('float')
del testdata
testdata = testdata1[0:5000, :]
testlabel = unpickle('test')[b'coarse_labels'][0:5000]
testlabel = one_hot(testlabel, 100)
for i in range(testdata.shape[0]):
    for j in range(3072):
        testdata[i][j] = float(testdata[i][j]) / 255.0
    if(i % 1000 == 0):
        print("%d of 5000 test datasets processed" % i)

# Parameters
learning_rate = 0.001
training_iters = 1000000
batch_size = 10 # 128
display_step = 2

# Network Parameters
n_input = 1024*3 # CIFAR data input (img shape: 32*32*3)
n_classes = 100 # CIFAR total classes
dropout = 0.75 # Dropout, probability to keep units

# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)

# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    o = tf.nn.relu(x)
    return o

def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    o = tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME')
    return o

# Create model
def conv_net(x, weights, biases, dropout):
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 32, 32, 3])
    # Convolution Layer
    conv1 = conv2d(x, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)

    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)

    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)

    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out

# Store layers weight & bias
weights = {
    # 5x5 conv, 3 input channels, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 3, 32])),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    # fully connected, 8*8*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([8*8*64, 1024])),
    # 1024 inputs, 100 outputs (class prediction)
    'out': tf.Variable(tf.random_normal([1024, n_classes]))
}

biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = conv_net(x, weights, biases, keep_prob)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    sess.run(init)
    step = 1
    # Time measuring
    t1 = time.time()
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        # Prepare training batch
        batch_x, batch_y = next_batch(batch_size)
        batch_x1 = np.zeros([batch_size, 3072], dtype="float32")
        for i in range(batch_size):
            for j in range(3072):
                batch_x1[i][j] = batch_x[i][j] / 255.0
            #if(i % 200 == 0):
                #print("%d of %d training batch images processed" % (i, batch_size))
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x1, y: batch_y, keep_prob: dropout})
        if step % display_step == 0:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x1, y: batch_y, keep_prob: 1.})
            # Calculate accuracy for all test samples
            acc = accuracy.eval({x: testdata, y: testlabel, keep_prob: 1.})
            # Time measuring
            t2 = time.time()
            tmp = t2-t1
            sec = tmp % 60
            m = int(tmp / 60)
            print("Iter# %8d"%(step*batch_size) + \
                  ", Minibatch Loss= %16.10f"%(loss) + \
                  ", Testing Accuracy= %8.6f"%(acc) + \
                  ", Training currently elapsed " + \
                  "{:d} mins {:f} secs".format(m, sec))   
        step += 1
    print("Optimization Finished!")
    # Save the model after learning
    model_saver = tf.train.Saver()
    model_saver.save(sess, dir_model + "/CIFAR-100_cnn_model.chkp")

batch.py:

def next_batch(batch_size, onehot=True):
    class a:
        try:
            temp = current_batch
        except NameError:
            current_batch = 0
    import numpy as np
    from read import unpickle
    import tensorflow as tf
    #from mlxtend.preprocessing import one_hot
    dict_data = unpickle('train')
    label = np.array(dict_data[b'fine_labels'][a.current_batch:a.current_batch+batch_size])
    a1 = dict_data[b'data']
    a2 = a1[a.current_batch:a.current_batch+batch_size, :]
    a.current_batch += batch_size
    a2 = np.reshape(a2, (batch_size, 3072))
    with tf.device('/cpu:0'):
        if(onehot==True):
            label = tf.Session().run(tf.one_hot(label, 100))
    return a2,label

read.py:

def unpickle(file):
    import pickle
    with open(file, 'rb') as a:
        dict = pickle.load(a, encoding='bytes')
        return dict

Windows CMD python tf_cifar_learning.py output:

(tensorflow) C:\Users\Administrator\learn_tensorflow\cifar-100-python>python tf_cifar_learning.py
0 of 5000 test datasets processed
1000 of 5000 test datasets processed
2000 of 5000 test datasets processed
3000 of 5000 test datasets processed
4000 of 5000 test datasets processed
2017-05-02 17:48:46.635855: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The     TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.635975: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.637256: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.638434: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.638939: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.639456: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.641753: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.641909: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.994154: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:
name: GeForce GT 730
major: 3 minor: 5 memoryClockRate (GHz) 0.9015
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
2017-05-02 17:48:46.994318: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0
2017-05-02 17:48:46.997080: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0:   Y
2017-05-02 17:48:46.997985: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
2017-05-02 17:48:46.999359: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_driver.cc:893] failed to allocate 1.99G (2136745984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
2017-05-02 17:48:46.999434: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_driver.cc:893] failed to allocate 1.79G (1923071488 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
2017-05-02 17:48:47.766766: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
2017-05-02 17:48:48.334298: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:359] could not create cudnn handle: CUDNN_STATUS_NOT_INITIALIZED
2017-05-02 17:48:48.334466: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:366] error retrieving driver version: Unimplemented: kernel reported driver version not implemented on Windows
2017-05-02 17:48:48.343454: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:326] could not destroy cudnn handle: CUDNN_STATUS_BAD_PARAM
2017-05-02 17:48:48.343558: F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\kernels\conv_ops.cc:659] Check failed: stream->parent()->GetConvolveAlgorithms(&algorithms)

(tensorflow) C:\Users\Administrator\learn_tensorflow\cifar-100-python>

Then Windows 10 says that Python has stopped working and immediately kills it. Can someone please tell me what the problem is and how to fix it (maybe with an example)?

It's likely that the problem has something to do with your environment.

You have only one GPU, and you are likely also using it for the display. That's why TensorFlow cannot allocate all the memory it needs up front. You can control how much GPU memory to use with per_process_gpu_memory_fraction, like this:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/test_util.py#L388
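
For example, a minimal sketch of that approach in TF 1.x (the 0.6 fraction is only an illustrative value to leave headroom for the display, not something taken from the linked code):

import tensorflow as tf

# Reserve only part of the GPU's memory for this process so the
# display driver and other processes keep some headroom.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
config = tf.ConfigProto(gpu_options=gpu_options)

with tf.Session(config=config) as sess:
    # build and run the graph here
    pass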

About cuDNN, it seems that the cuDNN library cannot initialize itself ("CUDNN_STATUS_NOT_INITIALIZED"). Are you sure you can run other CUDA and cuDNN samples in that environment?

Try changing per_process_gpu_memory_fraction=0.995 to a smaller value such as 0.7 or 0.6.

Now I know what's going on. It's actually an OOM. Rebooting and reducing the batch size does the job.

I ran into the same error when using tensorflow-gpu==1.13.1 installed with conda. After a few days of struggling, I solved the problem with the code below:

import tensorflow as tf
from keras import backend as K

# Grow GPU memory usage on demand instead of reserving it all up front
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
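
With allow_growth, TensorFlow starts with a small allocation and grows it on demand instead of trying to grab nearly all of the card's memory up front, which is what fails on a small GPU that is also driving the display; the trade-off compared with a fixed per_process_gpu_memory_fraction is that memory usage is less predictable.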

I think the issue is somehow caused by having your monitor plugged into your graphics card.
