Do you have any idea how can I run "eval_image_classifier.py" on GPUs? Should I change any functions or do any modifications? Or whether there exist any other specific functions for evaluation on GPUs?
I can already run "train_image_classifier.py" on GPUs because of having the associated flag for switching between CPU and GPU:
tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
'Use CPUs to deploy clones.')
I did try to add the same line to eval_image_classifier.py
, but it had no effect. I'm using Python 2.7.13 and Tensorflow 1.3.0 .
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from deployment import model_deploy
from datasets import dataset_factory
from nets import nets_factory
from preprocessing import preprocessing_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_integer(
'batch_size', 32, 'The number of samples in each batch.')
tf.app.flags.DEFINE_integer(
'max_num_batches', None,
'Max number of batches to evaluate by default use all.')
tf.app.flags.DEFINE_string(
'master', '', 'The address of the TensorFlow master to use.')
tf.app.flags.DEFINE_string(
'checkpoint_path', '...',
'The directory where the model was written to or an absolute path to a '
'checkpoint file.')
tf.app.flags.DEFINE_string(
'eval_dir', '...',
'Directory where the results are saved to.')
tf.app.flags.DEFINE_integer('num_clones', 1,
'Number of model clones to deploy.')
tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
'Use CPUs to deploy clones.')
tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')
tf.app.flags.DEFINE_integer(
'num_readers', 4,
'The number of parallel readers that read data from the dataset.')
tf.app.flags.DEFINE_integer(
'num_ps_tasks', 0,
'The number of parameter servers. If the value is 0, then the parameters '
'are handled locally by the worker.')
tf.app.flags.DEFINE_integer(
'num_preprocessing_threads', 4,
'The number of threads used to create the batches.')
tf.app.flags.DEFINE_string(
'dataset_name', '...', 'The name of the dataset to load.')
tf.app.flags.DEFINE_string(
'dataset_split_name', 'validation', 'The name of the train/test split.')
tf.app.flags.DEFINE_string(
'dataset_dir', '...',
'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_integer(
'labels_offset', 0,
'An offset for the labels in the dataset. This flag is primarily used to '
'evaluate the VGG and ResNet architectures which do not use a background '
'class for the ImageNet dataset.')
tf.app.flags.DEFINE_string(
'model_name', 'densenet161', 'The name of the architecture to evaluate.')
tf.app.flags.DEFINE_string(
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
tf.app.flags.DEFINE_float(
'moving_average_decay', None,
'The decay to use for the moving average.'
'If left as None, then moving averages are not used.')
tf.app.flags.DEFINE_integer(
'eval_image_size', None, 'Eval image size')
FLAGS = tf.app.flags.FLAGS
def main(_):
if not FLAGS.dataset_dir:
raise ValueError('You must supply the dataset directory with --dataset_dir')
#######################
# Config model_deploy #
#######################
tf.logging.set_verbosity(tf.logging.INFO)
with tf.Graph().as_default():
deploy_config = model_deploy.DeploymentConfig(
num_clones=FLAGS.num_clones,
clone_on_cpu=FLAGS.clone_on_cpu,
#replica_id=FLAGS.task,
num_replicas=FLAGS.worker_replicas,
num_ps_tasks=FLAGS.num_ps_tasks)
# Create global_step
with tf.device(deploy_config.variables_device()):
tf_global_step = slim.create_global_step()
######################
# Select the dataset #
######################
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
####################
# Select the model #
####################
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=False)
##############################################################
# Create a dataset provider that loads data from the dataset #
##############################################################
with tf.device(deploy_config.inputs_device()):
provider = slim.dataset_data_provider.DatasetDataProvider(
dataset,
num_readers=FLAGS.num_readers,
shuffle=False,
common_queue_capacity=2 * FLAGS.batch_size,
common_queue_min=FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
label -= FLAGS.labels_offset
#####################################
# Select the preprocessing function #
#####################################
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
preprocessing_name,
is_training=False)
eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, eval_image_size, eval_image_size)
images, labels = tf.train.batch(
[image, label],
batch_size=FLAGS.batch_size,
num_threads=FLAGS.num_preprocessing_threads,
capacity=5 * FLAGS.batch_size)
batch_queue = slim.prefetch_queue.prefetch_queue(
[images, labels], capacity=2 * deploy_config.num_clones)
####################
# Define the model #
####################
def clone_fn(batch_queue):
"""Allows data parallelism by creating multiple clones of network_fn."""
with tf.device(deploy_config.inputs_device()):
images, labels = batch_queue.dequeue()
logits, end_points = network_fn(images)
logits = tf.squeeze(logits)
#############################
# Specify the loss function #
#############################
if 'AuxLogits' in end_points:
tf.losses.mean_squared_error(
predictions=end_points['AuxLogits'], labels=labels, weights=0.4, scope='aux_loss')
tf.losses.mean_squared_error(
predictions=logits, labels=labels, weights=1.0)
return end_points
#clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
#first_clone_scope = deploy_config.clone_scope(0)
####################
# Define the model #
####################
logits, _ = network_fn(images)
if FLAGS.moving_average_decay:
variable_averages = tf.train.ExponentialMovingAverage(
FLAGS.moving_average_decay, tf_global_step)
variables_to_restore = variable_averages.variables_to_restore(
slim.get_model_variables())
variables_to_restore[tf_global_step.op.name] = tf_global_step
else:
variables_to_restore = slim.get_variables_to_restore()
logits = tf.squeeze(logits)
# Define the metrics:
predictions = logits
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
'Accuracy': tf.metrics.root_mean_squared_error(predictions, labels),
'Recall_5': slim.metrics.streaming_recall(
logits, labels),
})
# Print the summaries to screen.
print_ops = []
summary_ops = []
for name, value in names_to_values.items():
summary_name = 'eval/%s' % name
op = tf.summary.scalar(summary_name, value, collections=[])
op = tf.Print(op, [value], summary_name)
summary_ops.append(op)
print_ops.append(tf.Print(value, [value], summary_name))
tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
# TODO(sguada) use num_epochs=1
if FLAGS.max_num_batches:
num_batches = FLAGS.max_num_batches
else:
# This ensures that we make a single pass over all of the data.
num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))
if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
if tf.train.latest_checkpoint(FLAGS.checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
else:
checkpoint_path = FLAGS.checkpoint_path
eval_interval_secs = 6
tf.logging.info('Evaluating %s' % checkpoint_path)
slim.evaluation.evaluation_loop(
master=FLAGS.master,
checkpoint_dir=checkpoint_path,
logdir=FLAGS.eval_dir,
num_evals=num_batches,
eval_op=list(names_to_updates.values()) + print_ops,
variables_to_restore=variables_to_restore,
eval_interval_secs = eval_interval_secs )
if __name__ == '__main__':
tf.app.run()
I tried to use some code like Tensorflow tutorial as well:
# Creates a graph.
with tf.device('/gpu:2'):
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with allow_soft_placement and log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=True))
# Runs the op.
print(sess.run(c))
I modified the code in this way:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from datasets import dataset_factory
from nets import nets_factory
from preprocessing import preprocessing_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_integer(
'batch_size', 32, 'The number of samples in each batch.')
tf.app.flags.DEFINE_integer(
'max_num_batches', None,
'Max number of batches to evaluate by default use all.')
tf.app.flags.DEFINE_string(
'master', '', 'The address of the TensorFlow master to use.')
tf.app.flags.DEFINE_string(
'checkpoint_path', '...',
'The directory where the model was written to or an absolute path to a '
'checkpoint file.')
tf.app.flags.DEFINE_string(
'eval_dir', '...',
'Directory where the results are saved to.')
tf.app.flags.DEFINE_integer(
'num_preprocessing_threads', 4,
'The number of threads used to create the batches.')
tf.app.flags.DEFINE_string(
'dataset_name', '...', 'The name of the dataset to load.')
tf.app.flags.DEFINE_string(
'dataset_split_name', 'validation', 'The name of the train/test split.')
tf.app.flags.DEFINE_string(
'dataset_dir', '...',
'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_integer(
'labels_offset', 0,
'An offset for the labels in the dataset. This flag is primarily used to '
'evaluate the VGG and ResNet architectures which do not use a background '
'class for the ImageNet dataset.')
tf.app.flags.DEFINE_string(
'model_name', 'densenet161', 'The name of the architecture to evaluate.')
tf.app.flags.DEFINE_string(
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
tf.app.flags.DEFINE_float(
'moving_average_decay', None,
'The decay to use for the moving average.'
'If left as None, then moving averages are not used.')
tf.app.flags.DEFINE_integer(
'eval_image_size', None, 'Eval image size')
FLAGS = tf.app.flags.FLAGS
# Initialize all global and local variables
init = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
def main(_):
if not FLAGS.dataset_dir:
raise ValueError('You must supply the dataset directory with --dataset_dir')
tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
log_device_placement=True))
with tf.Graph().as_default(), tf.device('/gpu:0'):
sess.run(init)
tf_global_step = slim.get_or_create_global_step()
######################
# Select the dataset #
######################
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
####################
# Select the model #
####################
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=False)
##############################################################
# Create a dataset provider that loads data from the dataset #
##############################################################
provider = slim.dataset_data_provider.DatasetDataProvider(
dataset,
shuffle=False,
common_queue_capacity=2 * FLAGS.batch_size,
common_queue_min=FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
label -= FLAGS.labels_offset
#####################################
# Select the preprocessing function #
#####################################
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
preprocessing_name,
is_training=False)
eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, eval_image_size, eval_image_size)
images, labels = tf.train.batch(
[image, label],
batch_size=FLAGS.batch_size,
num_threads=FLAGS.num_preprocessing_threads,
capacity=5 * FLAGS.batch_size)
####################
# Define the model #
####################
logits, _ = network_fn(images)
if FLAGS.moving_average_decay:
variable_averages = tf.train.ExponentialMovingAverage(
FLAGS.moving_average_decay, tf_global_step)
variables_to_restore = variable_averages.variables_to_restore(
slim.get_model_variables())
variables_to_restore[tf_global_step.op.name] = tf_global_step
else:
variables_to_restore = slim.get_variables_to_restore()
logits = tf.squeeze(logits)
# Define the metrics:
predictions = logits
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
'Accuracy': tf.metrics.root_mean_squared_error(predictions, labels),
'Recall_5': slim.metrics.streaming_recall(
logits, labels),
})
# Print the summaries to screen.
print_ops = []
summary_ops = []
for name, value in names_to_values.items():
summary_name = 'eval/%s' % name
op = tf.summary.scalar(summary_name, value, collections=[])
op = tf.Print(op, [value], summary_name)
summary_ops.append(op)
print_ops.append(tf.Print(value, [value], summary_name))
tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
# TODO(sguada) use num_epochs=1
if FLAGS.max_num_batches:
num_batches = FLAGS.max_num_batches
else:
# This ensures that we make a single pass over all of the data.
num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))
if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
if tf.train.latest_checkpoint(FLAGS.checkpoint_path):
checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
else:
checkpoint_path = FLAGS.checkpoint_path
#print(checkpoint_path)
eval_interval_secs = 6
tf.logging.info('Evaluating %s' % checkpoint_path)
slim.evaluation.evaluation_loop(
master=FLAGS.master,
checkpoint_dir=checkpoint_path,
logdir=FLAGS.eval_dir,
num_evals=num_batches,
eval_op=list(names_to_updates.values()) + print_ops,
variables_to_restore=variables_to_restore,
eval_interval_secs = eval_interval_secs )
if __name__ == '__main__':
tf.app.run()
When I run this code, I face this error:
Traceback (most recent call last):
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 210, in <module>
tf.app.run()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 206, in main
eval_interval_secs = 60 )
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/slim/python/slim/evaluation.py", line 296, in evaluation_loo
p
timeout=timeout)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 447, in evalua
te_repeatedly
session_creator=session_creator, hooks=hooks) as session:
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 668, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 490, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 842, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 847, in _create_session
return self._sess_creator.create_session()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 551, in create_session
self.tf_sess = self._session_creator.create_session()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 425, in create_session
init_fn=self._scaffold.init_fn)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 273, in prepare_session
config=config)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 189, in _restore_checkpoin
t
saver.restore(sess, checkpoint_filename_with_path)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1560, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
options, run_metadata)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation 'eval_step': Could not satisfy explicit device s
pecification '/device:GPU:0' because no supported kernel for GPU devices is available.
Colocation Debug Info:
Colocation group had the following types and devices:
Const: GPU CPU
AssignAdd: CPU
VariableV2: CPU
Identity: GPU CPU
Assign: CPU
IsVariableInitialized: CPU
[[Node: eval_step = VariableV2[_class=["loc:@eval_step"], container="", dtype=DT_INT64, shape=[], shared_name="", _device="/device:GPU:0"]
()]]
Caused by op u'eval_step', defined at:
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 210, in <module>
tf.app.run()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/zgholami/test1/GZ_Project/GZ_DenseNet_TF-slim/eval_image_classifier.py", line 206, in main
eval_interval_secs = 60 )
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/slim/python/slim/evaluation.py", line 296, in evaluation_loo
p
timeout=timeout)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 410, in evalua
te_repeatedly
eval_step = get_or_create_eval_step()
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/training/evaluation.py", line 57, in _get_or_create_eval_step
collections=[ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.EVAL_STEP])
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
validate_shape=validate_shape, use_resource=use_resource)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
use_resource=use_resource)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 725, in _get_single_variable
validate_shape=validate_shape)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 199, in __init__
expected_shape=expected_shape)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 283, in _init_from_args
name=name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/state_ops.py", line 131, in variable_op_v2
shared_name=shared_name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/ops/gen_state_ops.py", line 682, in _variable_v2
name=name)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/group/pawsey0245/zgholami/pyml/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'eval_step': Could not satisfy explicit device specification '
/device:GPU:0' because no supported kernel for GPU devices is available.
Colocation Debug Info:
Colocation group had the following types and devices:
Const: GPU CPU
AssignAdd: CPU
VariableV2: CPU
Identity: GPU CPU
Assign: CPU
IsVariableInitialized: CPU
[[Node: eval_step = VariableV2[_class=["loc:@eval_step"], container="", dtype=DT_INT64, shape=[], shared_name="", _device="/device:GPU:0"]
()]]
ERROR:tensorflow:==================================
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'report_uninitialized_variables_1/boolean_mask/Gather:0' shape=(?,) dtype=string>
If you want to mark it as used call its "mark_used()" method.
您尚未检查TensorFlow的文档: 使用GPU
If you've installed TF with GPU support, which I assume you've as you've trained it using train_image_classifier.py
, then eval_image_classifier.py
also used GPUs as default.
I've just ran default eval_image_classifier.py
from models/research/slim and it is using GPUs to do evaluation. So, you don't have to modify anything to use GPUs to do eval.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.