简体   繁体   中英

How can I convert mnist dataset into tfrecords in tensorflow 2.1.0

I saw many codes like the following gist , however, it prints out ModuleNotFoundError: No module named 'tensorflow.examples.tutorials' in tensorflow 2.1.0.

When checking this question , I know I can load from keras with the following code.

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

However, the result is a 3d ndarray, and I meet some errors when trying to create feature with the following codes

(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

path = "train.tfrecord"
writer = tf.io.TFRecordWriter(path, options=None)
option = tf.io.TFRecordOptions(compression_type="GZIP")
feature_internal = {
    "image":tf.train.Features(float_list=tf.train.FloatList(value=[X_train_full])),
    "label":tf.train.Features(float_list=tf.train.FloatList(value=[y_train_full]))
}

it outputs

TypeError: array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0. has type <class 'numpy.ndarray'>, but expected one of: numbers.Real

I want to know how can I solve this error and convert mnist into tfrecord

To save your data to TFRecord files, the workflow is as follows: Step 1: Load the data with your own program.

Step 2: Open a TFRecord file with tf.python_io.TFRecordWriter.

Step 3: Parse and save the data to TFRecord files. Follow these steps:

  1. Convert your data into tf.train.Feature using tf.train.BytesList , tf.train.FloatList , or tf.train.Int64List .
  2. Create a tf.train.Features with the converted data.
  3. Create an Example protocol buffer with tf.train.Example .
  4. Serialize the Example to string using tf.train.Example.SerializeToString().
  5. Write the serialized example to TFRecord with the created TFRecordWriter.
import gzip
import os

import numpy
from six.moves import urllib
import tensorflow as tf


params = {}
params['download_data_location'] = '/dbfs/ml/MNISTDemo/mnistData/'
params['tfrecord_location'] = '/dbfs/ml/MNISTDemo/mnistData/'


def download(directory, filename):
  """Download a file from the MNIST dataset if not already done."""
  filepath = os.path.join(directory, filename)
  if tf.gfile.Exists(filepath):
    return filepath
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)
  # CVDF mirror of http://yann.lecun.com/exdb/mnist/
  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
  temp_file_name, _ = urllib.request.urlretrieve(url)
  tf.gfile.Copy(temp_file_name, filepath)
  with tf.gfile.GFile(filepath) as f:
      size = f.size()
  print('Successfully downloaded', filename, size, 'bytes.')
  return filepath

def _read32(bytestream):
  dt = numpy.dtype(numpy.uint32).newbyteorder('>')
  return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

def extract_images(f):
  """Extract the images into a 4D uint8 numpy array [index, y, x, depth].
  Args:
    f: A file object that can be passed into a gzip reader.
  Returns:
    data: A 4D uint8 numpy array [index, y, x, depth].
  Raises:
    ValueError: If the bytestream does not start with 2051.
  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2051:
      raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                       (magic, f.name))
    num_images = _read32(bytestream)
    rows = _read32(bytestream)
    cols = _read32(bytestream)
    buf = bytestream.read(rows * cols * num_images)
    data = numpy.frombuffer(buf, dtype=numpy.uint8)
    data = data.reshape(num_images, rows, cols, 1)
    return data

def dense_to_one_hot(labels_dense, num_classes):
  """Convert class labels from scalars to one-hot vectors."""
  num_labels = labels_dense.shape[0]
  index_offset = numpy.arange(num_labels) * num_classes
  labels_one_hot = numpy.zeros((num_labels, num_classes))
  labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
  return labels_one_hot

def extract_labels(f, one_hot=False, num_classes=10):
  """Extract the labels into a 1D uint8 numpy array [index].
  Args:
    f: A file object that can be passed into a gzip reader.
    one_hot: Does one hot encoding for the result.
    num_classes: Number of classes for the one hot encoding.
  Returns:
    labels: a 1D uint8 numpy array.
  Raises:
    ValueError: If the bystream doesn't start with 2049.
  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2049:
      raise ValueError('Invalid magic number %d in MNIST label file: %s' %
                       (magic, f.name))
    num_items = _read32(bytestream)
    buf = bytestream.read(num_items)
    labels = numpy.frombuffer(buf, dtype=numpy.uint8)
    if one_hot:
      return dense_to_one_hot(labels, num_classes)
    return labels

def load_dataset(directory, images_file, labels_file):
  """Download and parse MNIST dataset."""

  images_file = download(directory, images_file)
  labels_file = download(directory, labels_file)

  with tf.gfile.Open(images_file, 'rb') as f:
    images = extract_images(f)

  with tf.gfile.Open(labels_file, 'rb') as f:
    labels = extract_labels(f)

  return images, labels

directory = params['download_data_location']
validation_size=5000
train_images, train_labels = load_dataset(directory, 'train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
test_images, test_labels = load_dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte') 
validation_images = train_images[:validation_size]
validation_labels = train_labels[:validation_size]
train_images = train_images[validation_size:]
train_labels = train_labels[validation_size:]

name = "train.tfrecords"
filename = os.path.join(params['tfrecord_location'], name)
tfrecord_writer = tf.python_io.TFRecordWriter(filename)

def _int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

train_images.shape[0]

num_examples = train_images.shape[0]
images = train_images
labels = train_labels

rows = images.shape[1]
cols = images.shape[2]
depth = images.shape[3]

for index in range(num_examples):
  # 1. Convert your data into tf.train.Feature
  image_raw = images[index].tostring()
  feature = {
    'height': _int64_feature(rows),
    'width': _int64_feature(cols),
    'depth': _int64_feature(depth),
    'label': _int64_feature(int(labels[index])),
    'image_raw': _bytes_feature(image_raw)
  }
  # 2. Create a tf.train.Features
  features = tf.train.Features(feature=feature)
  # 3. Createan example protocol
  example = tf.train.Example(features=features)
  # 4. Serialize the Example to string
  example_to_string = example.SerializeToString()
  # 5. Write to TFRecord
  tfrecord_writer.write(example_to_string)

def convert_and_save_to(images, labels , name, params):
  """Converts a TF dataset to tfrecords."""
  num_examples = images.shape[0]

  rows = images.shape[1]
  cols = images.shape[2]
  depth = images.shape[3]

  filename = os.path.join(params['tfrecord_location'], name + '.tfrecords')
  print('Writing', filename)
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
      image_raw = images[index].tostring()
      feature={
              'label': _int64_feature(int(labels[index])),
              'image_raw': _bytes_feature(image_raw)
              }
      features=tf.train.Features(feature=feature)
      example = tf.train.Example(features=features)
      writer.write(example.SerializeToString())

# Convert to Examples and write the result to TFRecord files.
convert_and_save_to(train_images, train_labels, 'train', params)
convert_and_save_to(test_images, test_labels, 'test', params)



The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM