簡體   English   中英

如何在 tensorflow 2.1.0 中將 mnist 數據集轉換為 tfrecords

[英]How can I convert mnist dataset into tfrecords in tensorflow 2.1.0

我看到很多類似下面要點的代碼,但是,它打印出ModuleNotFoundError: No module named 'tensorflow.examples.tutorials' in tensorflow 2.1.0。

檢查此問題時,我知道我可以使用以下代碼從 keras 加載。

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

但是,結果是 3d ndarray,我在嘗試使用以下代碼創建功能時遇到一些錯誤

(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

path = "train.tfrecord"
writer = tf.io.TFRecordWriter(path, options=None)
option = tf.io.TFRecordOptions(compression_type="GZIP")
feature_internal = {
    "image":tf.train.Features(float_list=tf.train.FloatList(value=[X_train_full])),
    "label":tf.train.Features(float_list=tf.train.FloatList(value=[y_train_full]))
}

它輸出

TypeError: array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0. has type <class 'numpy.ndarray'>, but expected one of: numbers.Real

我想知道如何解決這個錯誤並將 mnist 轉換為 tfrecord

要將數據保存到 TFRecord 文件,工作流程如下: 第 1 步:使用您自己的程序加載數據。

第 2 步:使用 tf.python_io.TFRecordWriter 打開一個 TFRecord 文件。

第 3 步:將數據解析並保存到 TFRecord 文件中。 按着這些次序:

  1. 使用tf.train.BytesListtf.train.FloatListtf.train.Int64List 將您的數據轉換為 tf.train.Feature
  2. 使用轉換后的數據創建一個tf.train.Features
  3. 使用tf.train.Example創建一個示例協議緩沖區。
  4. 使用 tf.train.Example.SerializeToString() 將示例序列化為字符串。
  5. 使用創建的 TFRecordWriter 將序列化示例寫入 TFRecord。
import gzip
import os

import numpy
from six.moves import urllib
import tensorflow as tf


params = {}
params['download_data_location'] = '/dbfs/ml/MNISTDemo/mnistData/'
params['tfrecord_location'] = '/dbfs/ml/MNISTDemo/mnistData/'


def download(directory, filename):
  """Download a file from the MNIST dataset if not already done."""
  filepath = os.path.join(directory, filename)
  if tf.gfile.Exists(filepath):
    return filepath
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)
  # CVDF mirror of http://yann.lecun.com/exdb/mnist/
  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
  temp_file_name, _ = urllib.request.urlretrieve(url)
  tf.gfile.Copy(temp_file_name, filepath)
  with tf.gfile.GFile(filepath) as f:
      size = f.size()
  print('Successfully downloaded', filename, size, 'bytes.')
  return filepath

def _read32(bytestream):
  dt = numpy.dtype(numpy.uint32).newbyteorder('>')
  return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

def extract_images(f):
  """Extract the images into a 4D uint8 numpy array [index, y, x, depth].
  Args:
    f: A file object that can be passed into a gzip reader.
  Returns:
    data: A 4D uint8 numpy array [index, y, x, depth].
  Raises:
    ValueError: If the bytestream does not start with 2051.
  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2051:
      raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                       (magic, f.name))
    num_images = _read32(bytestream)
    rows = _read32(bytestream)
    cols = _read32(bytestream)
    buf = bytestream.read(rows * cols * num_images)
    data = numpy.frombuffer(buf, dtype=numpy.uint8)
    data = data.reshape(num_images, rows, cols, 1)
    return data

def dense_to_one_hot(labels_dense, num_classes):
  """Convert class labels from scalars to one-hot vectors."""
  num_labels = labels_dense.shape[0]
  index_offset = numpy.arange(num_labels) * num_classes
  labels_one_hot = numpy.zeros((num_labels, num_classes))
  labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
  return labels_one_hot

def extract_labels(f, one_hot=False, num_classes=10):
  """Extract the labels into a 1D uint8 numpy array [index].
  Args:
    f: A file object that can be passed into a gzip reader.
    one_hot: Does one hot encoding for the result.
    num_classes: Number of classes for the one hot encoding.
  Returns:
    labels: a 1D uint8 numpy array.
  Raises:
    ValueError: If the bystream doesn't start with 2049.
  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2049:
      raise ValueError('Invalid magic number %d in MNIST label file: %s' %
                       (magic, f.name))
    num_items = _read32(bytestream)
    buf = bytestream.read(num_items)
    labels = numpy.frombuffer(buf, dtype=numpy.uint8)
    if one_hot:
      return dense_to_one_hot(labels, num_classes)
    return labels

def load_dataset(directory, images_file, labels_file):
  """Download and parse MNIST dataset."""

  images_file = download(directory, images_file)
  labels_file = download(directory, labels_file)

  with tf.gfile.Open(images_file, 'rb') as f:
    images = extract_images(f)

  with tf.gfile.Open(labels_file, 'rb') as f:
    labels = extract_labels(f)

  return images, labels

directory = params['download_data_location']
validation_size=5000
train_images, train_labels = load_dataset(directory, 'train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
test_images, test_labels = load_dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte') 
validation_images = train_images[:validation_size]
validation_labels = train_labels[:validation_size]
train_images = train_images[validation_size:]
train_labels = train_labels[validation_size:]

name = "train.tfrecords"
filename = os.path.join(params['tfrecord_location'], name)
tfrecord_writer = tf.python_io.TFRecordWriter(filename)

def _int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

train_images.shape[0]

num_examples = train_images.shape[0]
images = train_images
labels = train_labels

rows = images.shape[1]
cols = images.shape[2]
depth = images.shape[3]

for index in range(num_examples):
  # 1. Convert your data into tf.train.Feature
  image_raw = images[index].tostring()
  feature = {
    'height': _int64_feature(rows),
    'width': _int64_feature(cols),
    'depth': _int64_feature(depth),
    'label': _int64_feature(int(labels[index])),
    'image_raw': _bytes_feature(image_raw)
  }
  # 2. Create a tf.train.Features
  features = tf.train.Features(feature=feature)
  # 3. Createan example protocol
  example = tf.train.Example(features=features)
  # 4. Serialize the Example to string
  example_to_string = example.SerializeToString()
  # 5. Write to TFRecord
  tfrecord_writer.write(example_to_string)

def convert_and_save_to(images, labels , name, params):
  """Converts a TF dataset to tfrecords."""
  num_examples = images.shape[0]

  rows = images.shape[1]
  cols = images.shape[2]
  depth = images.shape[3]

  filename = os.path.join(params['tfrecord_location'], name + '.tfrecords')
  print('Writing', filename)
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
      image_raw = images[index].tostring()
      feature={
              'label': _int64_feature(int(labels[index])),
              'image_raw': _bytes_feature(image_raw)
              }
      features=tf.train.Features(feature=feature)
      example = tf.train.Example(features=features)
      writer.write(example.SerializeToString())

# Convert to Examples and write the result to TFRecord files.
convert_and_save_to(train_images, train_labels, 'train', params)
convert_and_save_to(test_images, test_labels, 'test', params)



暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM