How can I get my tensorflow script to run on my GPU?

Question

I'm trying to train a DNN using the following script:

import numpy as np
import os, sys
import argparse
from PIL import Image
from freeze_graph import freeze_graph
import tensorflow as tf
import time

from net import *
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), "./"))
from custom_vgg16 import *


# gram matrix per layer
def gram_matrix(x):
    assert isinstance(x, tf.Tensor)
    b, h, w, ch = x.get_shape().as_list()
    features = tf.reshape(x, [b, h*w, ch])
    # gram = tf.batch_matmul(features, features, adj_x=True)/tf.constant(ch*w*h, tf.float32)
    gram = tf.matmul(features, features, adjoint_a=True)/tf.constant(ch*w*h, tf.float32)
    return gram

# total variation denoising
def total_variation_regularization(x, beta=1):
    assert isinstance(x, tf.Tensor)
    wh = tf.constant([[[[ 1], [ 1], [ 1]]], [[[-1], [-1], [-1]]]], tf.float32)
    ww = tf.constant([[[[ 1], [ 1], [ 1]], [[-1], [-1], [-1]]]], tf.float32)
    tvh = lambda x: conv2d(x, wh, p='SAME')
    tvw = lambda x: conv2d(x, ww, p='SAME')
    dh = tvh(x)
    dw = tvw(x)
    tv = (tf.add(tf.reduce_sum(dh**2, [1, 2, 3]), tf.reduce_sum(dw**2, [1, 2, 3]))) ** (beta / 2.)
    return tv

parser = argparse.ArgumentParser(description='Real-time style transfer')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--dataset', '-d', default='dataset', type=str,
                    help='dataset directory path (according to the paper, use MSCOCO 80k images)')
parser.add_argument('--style_image', '-s', type=str, required=True,
                    help='style image path')
parser.add_argument('--batchsize', '-b', type=int, default=1,
                    help='batch size (default value is 1)')
parser.add_argument('--ckpt', '-c', default=None, type=int,
                    help='the global step of checkpoint file desired to restore.')
parser.add_argument('--lambda_tv', '-l_tv', default=10e-4, type=float,
                    help='weight of total variation regularization according to the paper to be set between 10e-4 and 10e-6.')
parser.add_argument('--lambda_feat', '-l_feat', default=1e0, type=float)
parser.add_argument('--lambda_style', '-l_style', default=1e1, type=float)
parser.add_argument('--epoch', '-e', default=2, type=int)
parser.add_argument('--lr', '-l', default=1e-3, type=float)
parser.add_argument('--pb', '-pb', default=True, type=bool, help='save a pb format as well.')
args = parser.parse_args()

data_dict = loadWeightsData('./vgg16.npy')

batchsize = args.batchsize
gpu = args.gpu
dataset = args.dataset
epochs = args.epoch
learning_rate = args.lr
ckpt = args.ckpt
lambda_tv = args.lambda_tv
lambda_f = args.lambda_feat
lambda_s = args.lambda_style
style_image = args.style_image
save_pb = args.pb
gpu = args.gpu

style_name, _ = os.path.splitext(style_image.split(os.sep)[-1])

fpath = os.listdir(args.dataset)
imagepaths = []
for fn in fpath:
    base, ext = os.path.splitext(fn)
    if ext == '.jpg' or ext == '.png':
        imagepath = os.path.join(dataset, fn)
        imagepaths.append(imagepath)
data_len = len(imagepaths)
iterations = int(data_len / batchsize)
print ('Number of traning images: {}'.format(data_len))
print ('{} epochs, {} iterations per epoch'.format(epochs, iterations))

style_np = np.asarray(Image.open(style_image).convert('RGB').resize((224, 224)), dtype=np.float32)
styles_np = [style_np for x in range(batchsize)]


if gpu > -1:
    device = '/gpu:{}'.format(gpu)
else:
    device = '/cpu:0'

with tf.device(device):

    inputs = tf.placeholder(tf.float32, shape=[batchsize, 224, 224, 3], name='input')
    net = FastStyleNet()
    saver = tf.train.Saver(restore_sequentially=True)
    saver_def = saver.as_saver_def()


    target = tf.placeholder(tf.float32, shape=[batchsize, 224, 224, 3])
    outputs = net(inputs)

    # style target feature
    # compute gram maxtrix of style target
    vgg_s = custom_Vgg16(target, data_dict=data_dict)
    feature_ = [vgg_s.conv1_2, vgg_s.conv2_2, vgg_s.conv3_3, vgg_s.conv4_3, vgg_s.conv5_3]
    gram_ = [gram_matrix(l) for l in feature_]

    # content target feature 
    vgg_c = custom_Vgg16(inputs, data_dict=data_dict)
    feature_ = [vgg_c.conv1_2, vgg_c.conv2_2, vgg_c.conv3_3, vgg_c.conv4_3, vgg_c.conv5_3]

    # feature after transformation 
    vgg = custom_Vgg16(outputs, data_dict=data_dict)
    feature = [vgg.conv1_2, vgg.conv2_2, vgg.conv3_3, vgg.conv4_3, vgg.conv5_3]

    # compute feature loss
    loss_f = tf.zeros(batchsize, tf.float32)
    for f, f_ in zip(feature, feature_):
        loss_f += lambda_f * tf.reduce_mean(tf.subtract(f, f_) ** 2, [1, 2, 3])

    # compute style loss
    gram = [gram_matrix(l) for l in feature]
    loss_s = tf.zeros(batchsize, tf.float32)
    for g, g_ in zip(gram, gram_):
        loss_s += lambda_s * tf.reduce_mean(tf.subtract(g, g_) ** 2, [1, 2])

    # total variation denoising
    loss_tv = lambda_tv * total_variation_regularization(outputs)

    # total loss
    loss = loss_s + loss_f + loss_tv

    # optimizer
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

    ckpt_directory = './ckpts/{}/'.format(style_name)
    if not os.path.exists(ckpt_directory):
        os.makedirs(ckpt_directory)

    # training
    tf.global_variables_initializer().run()

    if ckpt:
        if ckpt < 0:
            checkpoint = tf.train.get_checkpoint_state(ckpt_directory)
            input_checkpoint = checkpoint.model_checkpoint_path
        else:
            input_checkpoint =  ckpt_directory + style_name + '-{}'.format(ckpt)
        saver.restore(sess, input_checkpoint)
        print ('Checkpoint {} restored.'.format(ckpt))

    for epoch in range(1, epochs + 1):
        imgs = np.zeros((batchsize, 224, 224, 3), dtype=np.float32)
        for i in range(iterations):
            for j in range(batchsize):
                p = imagepaths[i * batchsize + j]
                imgs[j] = np.asarray(Image.open(p).convert('RGB').resize((224, 224)), np.float32)
            feed_dict = {inputs: imgs, target: styles_np}
            loss_, _= sess.run([loss, train_step,], feed_dict=feed_dict)
            print('[epoch {}/{}] batch {}/{}... loss: {}'.format(epoch, epochs, i + 1, iterations, loss_[0]))    
        saver.save(sess, ckpt_directory + style_name, global_step=epoch)

if save_pb:
    if not os.path.exists('./pbs'):
        os.makedirs('./pbs')
    freeze_graph(ckpt_directory, './pbs/{}.pb'.format(style_name), 'output')

and when i run, it trains it on the images (i'm only using one image at the moment just to get the whole process working) and prints out this at the command line:

D:\myName\tensorflow-fast-neuralstyle>python train.py -s picasso.jpg -d trainTest -g 0
C:\ProgramData\Anaconda3\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Number of traning images: 1
2 epochs, 1 iterations per epoch
2018-05-16 18:47:33.268196: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2018-05-16 18:47:33.582973: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1356] Found device 0 with properties:
name: GeForce GTX 1070 major: 6 minor: 1 memoryClockRate(GHz): 1.645
pciBusID: 0000:01:00.0
totalMemory: 8.00GiB freeMemory: 6.63GiB
2018-05-16 18:47:33.590004: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1435] Adding visible gpu devices: 0
2018-05-16 18:47:34.243696: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:923] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-05-16 18:47:34.247206: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:929]      0
2018-05-16 18:47:34.249841: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:942] 0:   N
2018-05-16 18:47:34.252015: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1053] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6405 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1)
[epoch 1/2] batch 1/1... loss: 32216618.0
[epoch 2/2] batch 1/1... loss: 27523674.0
2018-05-16 18:47:55.451428: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1435] Adding visible gpu devices: 0
2018-05-16 18:47:55.456462: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:923] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-05-16 18:47:55.462478: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:929]      0
2018-05-16 18:47:55.465806: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:942] 0:   N
2018-05-16 18:47:55.468555: I T:\src\github\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1053] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6405 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1)

Which is all fine, until I get this error:

InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'save/SaveV2': Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Registered kernels:
  device='CPU'

It seems like the script can find my GPU to run on, but somethings stopping it from completing which I dont understand. All the other posts about this error say set the 'allow_soft_placement' argument to True, but in this script it already is.

Any help would be massively appreciated. Thanks!

ps the trained model will be used by this generate.py file

import numpy as np
import argparse
import tensorflow as tf
import os
from PIL import Image

parser = argparse.ArgumentParser(description='Real-time style transfer image generator')
parser.add_argument('--input', '-i', type=str, help='content image')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--style', '-s', default=None, type=str, help='style model name')
parser.add_argument('--ckpt', '-c', default=-1, type=int, help='checkpoint to be loaded')
parser.add_argument('--out', '-o', default='stylized_image.jpg', type=str, help='stylized image\'s name')
parser.add_argument('--pb', '-pb', default=False, type=bool, help='load with pb')

args = parser.parse_args()

if not os.path.exists('./images/output/'):
        os.makedirs('./images/output/')

outfile_path = './images/output/' + args.out
content_image_path = args.input
style_name = args.style
ckpt = args.ckpt
load_with_pb = args.pb
gpu = args.gpu

original_image = Image.open(content_image_path).convert('RGB')

img = np.asarray(original_image.resize((224, 224)), dtype=np.float32)
shaped_input = img.reshape((1,) + img.shape)

if gpu > -1:
    device = '/gpu:{}'.format(gpu)
else:
    device = '/cpu:0'


with tf.device(device):
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        if load_with_pb:
            from tensorflow.core.framework import graph_pb2
            graph_def = graph_pb2.GraphDef()
            with open('./pbs/{}.pb'.format(style_name), "rb") as f:
                graph_def.ParseFromString(f.read())
            input_image, output = tf.import_graph_def(graph_def, return_elements=['input:0', 'output:0'])

        else:
            if ckpt < 0:
                checkpoint = tf.train.get_checkpoint_state('./ckpts/{}/'.format(style_name))
                input_checkpoint = checkpoint.model_checkpoint_path
            else:
                input_checkpoint = './ckpts/{}/{}-{}'.format(style_name, style_name, ckpt)
            saver = tf.train.import_meta_graph(input_checkpoint + '.meta')
            saver.restore(sess, input_checkpoint)
            graph = tf.get_default_graph()

            input_image = graph.get_tensor_by_name('input:0')
            output = graph.get_tensor_by_name('output:0')

        out = sess.run(output, feed_dict={input_image: shaped_input})

out = out.reshape((out.shape[1:]))
im = Image.fromarray(np.uint8(out))

im = im.resize(original_image.size, resample=Image.LANCZOS)
im.save(outfile_path)

Answer 1

I/O nodes such as tf.train.Saver cannot be placed on the GPU. Create them outside your tf.device context after creating your net.

How can I get my tensorflow script to run on my GPU?

Question

1 answers

solution1
0 2018-05-16 18:57:21

How can I get my tensorflow script to run on my GPU?

Question

1 answers

solution1 0 2018-05-16 18:57:21

solution1
0 2018-05-16 18:57:21