
How to use the checkpoint and related files created by TensorFlow?

A newbie TensorFlow question here. I am doing a project using the Google QuickDraw dataset, and I used the code provided by Google to train a CNN model on the data:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import functools
import sys

import tensorflow as tf

tf.logging.set_verbosity('INFO') 

def get_num_classes():
    with open(FLAGS.classes_path) as label_class:
        classes = label_class.readlines()
    classes = [x.strip() for x in classes]
    return len(classes)


def get_input_fn(mode, tfrecord_pattern, batch_size):
  """Creates an input_fn that stores all the data in memory.
  Args:
   mode: one of tf.estimator.ModeKeys.{TRAIN, PREDICT, EVAL}
   tfrecord_pattern: path to a TF record file created using create_dataset.py.
   batch_size: the batch size to output.
  Returns:
    A valid input_fn for the model estimator.
  """

  def _parse_tfexample_fn(example_proto, mode):
    """Parse a single record which is expected to be a tensorflow.Example."""
    feature_to_type = {
        "drawing": tf.VarLenFeature(dtype=tf.float32),
        "shape": tf.FixedLenFeature([2], dtype=tf.int64)
    }
    if mode != tf.estimator.ModeKeys.PREDICT:
      # The labels won't be available at inference time, so don't add them
      # to the list of feature_columns to be read.
      feature_to_type["class_index"] = tf.FixedLenFeature([1], dtype=tf.int64)

    parsed_features = tf.parse_single_example(example_proto, feature_to_type)
    labels = None
    if mode != tf.estimator.ModeKeys.PREDICT:
      labels = parsed_features["class_index"]
    parsed_features["drawing"] = tf.sparse_tensor_to_dense(parsed_features["drawing"])
    return parsed_features, labels

  def _input_fn():
    """Estimator `input_fn`.
    Returns:
      A tuple of:
      - Dictionary of string feature name to `Tensor`.
      - `Tensor` of target labels.
    """
    dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.shuffle(buffer_size=10)
    dataset = dataset.repeat()
    # Preprocesses 10 files concurrently and interleaves records from each file.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1)
    dataset = dataset.map(
        functools.partial(_parse_tfexample_fn, mode=mode),
        num_parallel_calls=10)
    dataset = dataset.prefetch(10000)
    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.shuffle(buffer_size=1000000)
    # Our inputs are variable length, so pad them.
    dataset = dataset.padded_batch(
        batch_size, padded_shapes=dataset.output_shapes)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels

  return _input_fn


def model_fn(features, labels, mode, params):
  """Model function for RNN classifier.
  This function sets up a neural network which applies convolutional layers (as
  configured with params.num_conv and params.conv_len) to the input.
  The output of the convolutional layers is given to LSTM layers (as configured
  with params.num_layers and params.num_nodes).
  The final states of all the LSTM layers are concatenated and fed to a fully
  connected layer to obtain the final classification scores.
  Args:
    features: dictionary with keys: inks, lengths.
    labels: one hot encoded classes
    mode: one of tf.estimator.ModeKeys.{TRAIN, PREDICT, EVAL}
    params: a parameter dictionary with the following keys: num_layers,
      num_nodes, batch_size, num_conv, conv_len, num_classes, learning_rate.
  Returns:
    ModelFnOps for Estimator API.
  """

  def _get_input_tensors(features, labels):
    """Converts the input dict into inks, lengths, and labels tensors."""
    # features[ink] is a sparse tensor that is [8, batch_maxlen, 3]
    # inks will be a dense tensor of [8, maxlen, 3]
    # shapes is [batchsize, 2]
    shapes = features["shape"]
    # lengths will be [batch_size]
    lengths = tf.squeeze(
        tf.slice(shapes, begin=[0, 0], size=[params.batch_size, 1]))
    inks = tf.reshape(features["drawing"], [params.batch_size, -1, 3])
    if labels is not None:
      labels = tf.squeeze(labels)
    return inks, lengths, labels

  def _add_conv_layers(inks, lengths):
    """Adds convolution layers."""
    convolved = inks
    for i in range(len(params.num_conv)):
      convolved_input = convolved
      if params.batch_norm:
        convolved_input = tf.layers.batch_normalization(
            convolved_input,
            training=(mode == tf.estimator.ModeKeys.TRAIN))
      # Add dropout layer if enabled and not first convolution layer.
      if i > 0 and params.dropout:
        convolved_input = tf.layers.dropout(
            convolved_input,
            rate=params.dropout,
            training=(mode == tf.estimator.ModeKeys.TRAIN))
      convolved = tf.layers.conv1d(
          convolved_input,
          filters=params.num_conv[i],
          kernel_size=params.conv_len[i],
          activation=None,
          strides=1,
          padding="same",
          name="conv1d_%d" % i)
    return convolved, lengths

  def _add_regular_rnn_layers(convolved, lengths):
    """Adds RNN layers."""
    if params.cell_type == "lstm":
      cell = tf.nn.rnn_cell.BasicLSTMCell
    elif params.cell_type == "block_lstm":
      cell = tf.contrib.rnn.LSTMBlockCell
    cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)]
    cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)]
    if params.dropout > 0.0:
      cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw]
      cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw]
    outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=cells_fw,
        cells_bw=cells_bw,
        inputs=convolved,
        sequence_length=lengths,
        dtype=tf.float32,
        scope="rnn_classification")
    return outputs

  def _add_cudnn_rnn_layers(convolved):
    """Adds CUDNN LSTM layers."""
    # Convolutions output [B, L, Ch], while CudnnLSTM is time-major.
    convolved = tf.transpose(convolved, [1, 0, 2])
    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers=params.num_layers,
        num_units=params.num_nodes,
        dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0,
        direction="bidirectional")
    outputs, _ = lstm(convolved)
    # Convert back from time-major outputs to batch-major outputs.
    outputs = tf.transpose(outputs, [1, 0, 2])
    return outputs

  def _add_rnn_layers(convolved, lengths):
    """Adds recurrent neural network layers depending on the cell type."""
    if params.cell_type != "cudnn_lstm":
      outputs = _add_regular_rnn_layers(convolved, lengths)
    else:
      outputs = _add_cudnn_rnn_layers(convolved)
    # outputs is [batch_size, L, N] where L is the maximal sequence length and N
    # the number of nodes in the last layer.
    mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
        [1, 1, tf.shape(outputs)[2]])
    zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
    outputs = tf.reduce_sum(zero_outside, axis=1)
    return outputs

  def _add_fc_layers(final_state):
    """Adds a fully connected layer."""
    return tf.layers.dense(final_state, params.num_classes)

  # Build the model.
  inks, lengths, labels = _get_input_tensors(features, labels)
  convolved, lengths = _add_conv_layers(inks, lengths)
  final_state = _add_rnn_layers(convolved, lengths)
  logits = _add_fc_layers(final_state)
  # Add the loss.
  cross_entropy = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits))
  # Add the optimizer.
  train_op = tf.contrib.layers.optimize_loss(
      loss=cross_entropy,
      global_step=tf.train.get_global_step(),
      learning_rate=params.learning_rate,
      optimizer="Adam",
      # some gradient clipping stabilizes training in the beginning.
      clip_gradients=params.gradient_clipping_norm,
      summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
  # Compute current predictions.
  predictions = tf.argmax(logits, axis=1)
  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions={"logits": logits, "predictions": predictions},
      loss=cross_entropy,
      train_op=train_op,
      eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)})


def create_estimator_and_specs(run_config):
  """Creates an Experiment configuration based on the estimator and input fn."""
  model_params = tf.contrib.training.HParams(
      num_layers=FLAGS.num_layers,
      num_nodes=FLAGS.num_nodes,
      batch_size=FLAGS.batch_size,
      num_conv=ast.literal_eval(FLAGS.num_conv),
      conv_len=ast.literal_eval(FLAGS.conv_len),
      num_classes=get_num_classes(),
      learning_rate=FLAGS.learning_rate,
      gradient_clipping_norm=FLAGS.gradient_clipping_norm,
      cell_type=FLAGS.cell_type,
      batch_norm=FLAGS.batch_norm,
      dropout=FLAGS.dropout)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params=model_params)

  train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
      mode=tf.estimator.ModeKeys.TRAIN,
      tfrecord_pattern=FLAGS.training_data,
      batch_size=FLAGS.batch_size), max_steps=FLAGS.steps)

  eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
      mode=tf.estimator.ModeKeys.EVAL,
      tfrecord_pattern=FLAGS.eval_data,
      batch_size=FLAGS.batch_size))

  return estimator, train_spec, eval_spec


def main(unused_args):
  estimator, train_spec, eval_spec = create_estimator_and_specs(
      run_config=tf.estimator.RunConfig(
          model_dir=FLAGS.model_dir,
          save_checkpoints_secs=300,
          save_summary_steps=100))
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.register("type", "bool", lambda v: v.lower() == "true")
  parser.add_argument(
      "--training_data",
      type=str,
      default="/Train",
      help="Path to training data (tf.Example in TFRecord format)")
  parser.add_argument(
      "--eval_data",
      type=str,
      default="/Eval",
      help="Path to evaluation data (tf.Example in TFRecord format)")
  parser.add_argument(
      "--classes_file",
      type=str,
      default="",
      help="Path to a file with the classes - one class per line")
  parser.add_argument(
      "--num_layers",
      type=int,
      default=3,
      help="Number of recurrent neural network layers.")
  parser.add_argument(
      "--num_nodes",
      type=int,
      default=128,
      help="Number of node per recurrent network layer.")
  parser.add_argument(
      "--num_conv",
      type=str,
      default="[48, 64, 96]",
      help="Number of conv layers along with number of filters per layer.")
  parser.add_argument(
      "--conv_len",
      type=str,
      default="[5, 5, 3]",
      help="Length of the convolution filters.")
  parser.add_argument(
      "--cell_type",
      type=str,
      default="lstm",
      help="Cell type used for rnn layers: cudnn_lstm, lstm or block_lstm.")
  parser.add_argument(
      "--batch_norm",
      type="bool",
      default="False",
      help="Whether to enable batch normalization or not.")
  parser.add_argument(
      "--learning_rate",
      type=float,
      default=0.0001,
      help="Learning rate used for training.")
  parser.add_argument(
      "--gradient_clipping_norm",
      type=float,
      default=9.0,
      help="Gradient clipping norm used during training.")
  parser.add_argument(
      "--dropout",
      type=float,
      default=0.3,
      help="Dropout used for convolutions and bidi lstm layers.")
  parser.add_argument(
      "--steps",
      type=int,
      default=100000,
      help="Number of training steps.")
  parser.add_argument(
      "--batch_size",
      type=int,
      default=8,
      help="Batch size to use for training/evaluation.")
  parser.add_argument(
      "--model_dir",
      type=str,
      default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints",
      help="Path for storing the model checkpoints.")
  parser.add_argument(
      "--self_test",
      type="bool",
      default="False",
      help="Whether to enable batch normalization or not.")
  parser.add_argument(
    "--classes_path",
    type=str,
    default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\quickdraw-dataset-master\categories.txt",
    help="Path of the text file which contains name of classes"
  )

  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

I have the data in TFRecord format. After running the code, it creates an eval folder containing events.out files named with my system name, some more events.out files in the main folder, a graph.pbtxt file, model.ckpt-0.data-0000-of-0001, model.ckpt.index, model.ckpt.meta, and a file named checkpoint. What are all these files, and how do I use them to make predictions on test data?
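For reference, a minimal sketch like the following should list what such a checkpoint actually stores; tf.train.list_variables prints the saved variable names and shapes (the path is just my model_dir from the flags above):

import tensorflow as tf

# Resolve the newest checkpoint prefix (e.g. model.ckpt-0) via the small
# 'checkpoint' text file in the model directory.
ckpt = tf.train.latest_checkpoint(
    r"A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints")

# Print every variable name and shape stored in the checkpoint.
for name, shape in tf.train.list_variables(ckpt):
    print(name, shape)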

Also, why is there just one model.ckpt.data file when the data consists of 9 training and 9 eval files?

Another question: when I ran the code, it finished executing quite quickly, while most deep learning models take a long time to train. There are 3,450,000 examples for training and 345,000 for eval, and everything was done in about a minute. I am new to TensorFlow and don't know much about it yet, so please keep that in mind.

Update: after adding the line tf.logging.set_verbosity('INFO') to the code, I get the following output:

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-23-22:28:00
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-23-22:28:02
INFO:tensorflow:Saving dict for global step 0: accuracy = 0.0, global_step = 0, loss = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 0: A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Loss for final step: None.

Why are the accuracy and the loss both 0.0? I think there is some problem here, but I don't know exactly what. I suspect the training is not happening on all the examples.

The most obvious reason: your model doesn't receive any training data. As far as I can tell, your get_input_fn contains unnecessary functionality and probably does not behave the way you need it to. In particular, it uses if mode == tf.estimator.ModeKeys.TRAIN twice:

if mode == tf.estimator.ModeKeys.TRAIN:
  dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Preprocesses 10 files concurrently and interleaves records from each file.
dataset = dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=10,
    block_length=1)
dataset = dataset.map(
    functools.partial(_parse_tfexample_fn, mode=mode),
    num_parallel_calls=10)
dataset = dataset.prefetch(10000)
if mode == tf.estimator.ModeKeys.TRAIN:
  dataset = dataset.shuffle(buffer_size=1000000)

It also returns an iterator, which shouldn't be used with the Estimator API:

features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels

To debug, try a simple input pipeline first and then add functionality back step by step:

dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=10, block_length=1)
dataset = dataset.map(_parse_tfexample_fn, num_parallel_calls=10)
dataset = dataset.prefetch(1)
dataset = dataset.padded_batch(batch_size, padded_shapes=dataset.output_shapes)

return dataset

And try using estimator.train at first. P.S. This is not a ready-to-use function; you should adapt it to your input data.
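For example, a minimal debugging sketch in the context of the script above might look like this (assumptions: _parse_tfexample_fn has been moved out of get_input_fn to module level, FLAGS have been parsed, and estimator is the one built in create_estimator_and_specs; the step count is arbitrary):

def debug_train_input_fn():
    # Simplified pipeline: no repeat/shuffle/prefetch, just read, parse, pad-batch.
    dataset = tf.data.TFRecordDataset.list_files(FLAGS.training_data)
    dataset = dataset.interleave(
        tf.data.TFRecordDataset, cycle_length=10, block_length=1)
    dataset = dataset.map(
        functools.partial(_parse_tfexample_fn, mode=tf.estimator.ModeKeys.TRAIN),
        num_parallel_calls=10)
    dataset = dataset.padded_batch(
        FLAGS.batch_size, padded_shapes=dataset.output_shapes)
    return dataset

# Train for a few steps and watch the INFO logs: if data is flowing, the
# loss should be non-zero and the global step should advance past 0.
estimator.train(input_fn=debug_train_input_fn, steps=100)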
