
Use the tf.data API to load tfrecord data and train the model, but the result does not change

Environment:

  • ubuntu 16.04
  • python 3.6
  • tensorflow-gpu 1.10.0
  • Cuda 10.1

Code:

import os
import tensorflow as tf
import numpy as np
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Image Parameters
N_CLASSES = 2  # CHANGE HERE, total number of classes
IMG_HEIGHT = 128  # CHANGE HERE, the image height to be resized to
IMG_WIDTH = 128  # CHANGE HERE, the image width to be resized to
CHANNELS = 3  # The 3 color channels, change to 1 if grayscale
n_classes = N_CLASSES  # total number of classes (cats vs. dogs)
dropout = 0.75
num_steps = 20000
display_step = 100
learning_rate = 0.01
BATCHSIZE=32

def _parse_function(record):
    keys_to_features = {
        'img_raw': tf.FixedLenFeature((), tf.string),
        'label': tf.FixedLenFeature((), tf.int64)
    }
    parsed = tf.parse_single_example(record, keys_to_features)
    image = tf.decode_raw(parsed['img_raw'], tf.uint8)
    image = tf.reshape(image, [IMG_HEIGHT, IMG_WIDTH, 3])
    image = tf.cast(image, tf.float32)
    label = tf.cast(parsed['label'], tf.int32)
    return image, label

# dataset pipeline
dataset = tf.data.TFRecordDataset("./01_cats_vs_dogs/train_dogs_cat.tfrecord")
dataset = dataset.map(_parse_function)
dataset = dataset.repeat()
dataset = dataset.batch(batch_size=BATCHSIZE)
dataset = dataset.prefetch(BATCHSIZE)

# Create an iterator over the dataset
iterator = dataset.make_one_shot_iterator()
X, Y = iterator.get_next()

# Neural Net Input (images, labels)
print(X.shape)

def conv_net(x, n_classes, dropout, reuse, is_training):
    # Define a scope for reusing the variables
    with tf.variable_scope('ConvNet', reuse=reuse):
        # Convolution Layer with 32 filters and a kernel size of 5
        # x = tf.reshape(x, shape=[-1, 64, 64, 3])
        # Convolution Layer with 32 filters and a kernel size of 5
        conv1 = tf.layers.conv2d(x, 64, 3, activation=tf.nn.relu)
        conv1_1 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
        # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
        pool1 = tf.layers.max_pooling2d(conv1_1, 2, 2)

        # Convolution Layer with 32 filters and a kernel size of 5
        conv2_1 = tf.layers.conv2d(pool1, 128, 3, activation=tf.nn.relu)
        conv2_2 = tf.layers.conv2d(conv2_1, 128, 3, activation=tf.nn.relu)
        # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
        pool2 = tf.layers.max_pooling2d(conv2_2, 2, 2)

        conv3_1 = tf.layers.conv2d(pool2, 512, 3, activation=tf.nn.relu)
        conv3_2 = tf.layers.conv2d(conv3_1, 512, 3, activation=tf.nn.relu)
        conv3_3 = tf.layers.conv2d(conv3_2, 512, 3, activation=tf.nn.relu)
        conv3_4 = tf.layers.conv2d(conv3_3, 512, 3, activation=tf.nn.relu)
        pool3 = tf.layers.max_pooling2d(conv3_4, 2, 2)

        conv4_1 = tf.layers.conv2d(pool3, 512, 3, activation=tf.nn.relu)
        conv4_2 = tf.layers.conv2d(conv4_1, 512, 3, activation=tf.nn.relu)
        conv4_3 = tf.layers.conv2d(conv4_2, 512, 3, activation=tf.nn.relu)
        conv4_4 = tf.layers.conv2d(conv4_3, 512, 3, activation=tf.nn.relu)
        pool4 = tf.layers.max_pooling2d(conv4_4, 2, 2)

        # Flatten the data to a 1-D vector for the fully connected layer
        fc1 = tf.contrib.layers.flatten(pool4)

        # Fully connected layer (in contrib folder for now)
        fc1 = tf.layers.dense(fc1, 4096)
        # Apply Dropout (if is_training is False, dropout is not applied)
        fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)

        fc2 = tf.layers.dense(fc1, 2048)
        fc2 = tf.layers.dropout(fc2, rate=dropout, training=is_training)
        # Output layer, class prediction
        out = tf.layers.dense(fc2, n_classes)
        # Because 'softmax_cross_entropy_with_logits' already applies softmax,
        # we only apply softmax to the testing network
        out = tf.nn.softmax(out) if not is_training else out
        # out = tf.nn.softmax(out)
    return out





logits_train = conv_net(X, N_CLASSES, dropout, reuse=False, is_training=True)
logits_test = conv_net(X, N_CLASSES, dropout, reuse=True, is_training=False)

loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_train, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.cast(Y, tf.int64))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    # sess.run(iterator.initializer)
    sess.run(init)

    # Training cycle
    for step in range(1, num_steps + 1):
        sess.run(train_op)
        if step % display_step == 0 or step == 1:
            # Run optimization and calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy])
            print("Step " + str(step) + ", Minibatch Loss= " + "{:.4f}".format(loss) + ", Training Accuracy= " +
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

    saver.save(sess, './model1/my_tf_model.ckpt')


Dataset:

I converted the image data to a tfrecord file. The dataset contains two classes, {'dog', 'cat'}, and the tfrecord file itself has been verified to be fine.
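
For reference, here is a minimal sketch of how such a tfrecord could have been written so that it matches the feature keys ('img_raw', 'label') expected by _parse_function above. The dummy image and label are placeholders, not the asker's actual conversion script:

import numpy as np
import tensorflow as tf

def write_example(writer, image, label):
    # keys must match _parse_function: raw uint8 image bytes + int64 label
    feature = {
        'img_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())

with tf.python_io.TFRecordWriter('./01_cats_vs_dogs/train_dogs_cat.tfrecord') as writer:
    # dummy 128x128x3 uint8 image with label 0, just to illustrate the record layout
    write_example(writer, np.zeros((128, 128, 3), dtype=np.uint8), 0)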

Result:

The output is shown below. When I train the model, the accuracy does not seem to improve, and I don't know where the problem is:

Step 100, Minibatch Loss= 328.0390, Training Accuracy= 0.375
Step 200, Minibatch Loss= 20.1806, Training Accuracy= 0.469
Step 300, Minibatch Loss= 8.0567, Training Accuracy= 0.594
Step 400, Minibatch Loss= 7.8446, Training Accuracy= 0.469
Step 500, Minibatch Loss= 8.1242, Training Accuracy= 0.562
Step 600, Minibatch Loss= 11.3462, Training Accuracy= 0.500
Step 700, Minibatch Loss= 6.3456, Training Accuracy= 0.656
Step 800, Minibatch Loss= 8.6048, Training Accuracy= 0.406
Step 900, Minibatch Loss= 5.7220, Training Accuracy= 0.500
Step 1000, Minibatch Loss= 6.6008, Training Accuracy= 0.594
Step 1100, Minibatch Loss= 10.1282, Training Accuracy= 0.469
Step 1200, Minibatch Loss= 9.9202, Training Accuracy= 0.375
Step 1300, Minibatch Loss= 7.2488, Training Accuracy= 0.562
Step 1400, Minibatch Loss= 5.7681, Training Accuracy= 0.406
Step 1500, Minibatch Loss= 6.8479, Training Accuracy= 0.719
Step 1600, Minibatch Loss= 4.2005, Training Accuracy= 0.562
Step 1700, Minibatch Loss= 6.7389, Training Accuracy= 0.531
Step 1800, Minibatch Loss= 5.1379, Training Accuracy= 0.250
Step 1900, Minibatch Loss= 5.5253, Training Accuracy= 0.562
Step 2000, Minibatch Loss= 10.0953, Training Accuracy= 0.562
Step 2100, Minibatch Loss= 4.0466, Training Accuracy= 0.531
Step 2200, Minibatch Loss= 7.2034, Training Accuracy= 0.562

TLDR;

  1. Change the learning rate to a smaller value, e.g. 1e-4 (a concrete snippet follows this list)
  2. Reduce the number of filters in the Conv2D layers and the number of units in the Dense layers
  3. Reduce the dropout rate, e.g. to 0.2
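
Concretely, keeping everything else in the question's script unchanged, the hyperparameter changes amount to something like this (the reduced layer sizes are shown in the model configuration at the end of this answer):

learning_rate = 1e-4   # was 0.01
dropout = 0.2          # was 0.75; passed to tf.layers.dropout as the fraction to drop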

Output with lr=1e-4, dropout=0.2, and a simpler model (the configuration is at the end of this answer):

Step 1, Minibatch Loss= 20.6069, Training Accuracy= 0.625
Step 100, Minibatch Loss= 0.6134, Training Accuracy= 0.656
Step 200, Minibatch Loss= 0.6814, Training Accuracy= 0.625
Step 300, Minibatch Loss= 0.6467, Training Accuracy= 0.688
Step 400, Minibatch Loss= 0.6255, Training Accuracy= 0.625
Step 500, Minibatch Loss= 0.7261, Training Accuracy= 0.500
Step 600, Minibatch Loss= 0.6132, Training Accuracy= 0.656
Step 700, Minibatch Loss= 0.5459, Training Accuracy= 0.719
Step 800, Minibatch Loss= 0.6878, Training Accuracy= 0.688
Step 900, Minibatch Loss= 0.6291, Training Accuracy= 0.625
Step 1000, Minibatch Loss= 0.5999, Training Accuracy= 0.750
Step 1100, Minibatch Loss= 0.5825, Training Accuracy= 0.656
Step 1200, Minibatch Loss= 0.4984, Training Accuracy= 0.844
Step 1300, Minibatch Loss= 0.6453, Training Accuracy= 0.656
Step 1400, Minibatch Loss= 0.7097, Training Accuracy= 0.562
Step 1500, Minibatch Loss= 0.4389, Training Accuracy= 0.750
Step 1600, Minibatch Loss= 0.5230, Training Accuracy= 0.719
Step 1700, Minibatch Loss= 0.6794, Training Accuracy= 0.625
Step 1800, Minibatch Loss= 0.4587, Training Accuracy= 0.781
Step 1900, Minibatch Loss= 0.4308, Training Accuracy= 0.875
Step 2000, Minibatch Loss= 0.4679, Training Accuracy= 0.812
Step 2100, Minibatch Loss= 0.3197, Training Accuracy= 0.875
Step 2200, Minibatch Loss= 0.4301, Training Accuracy= 0.844
Step 2300, Minibatch Loss= 0.2784, Training Accuracy= 0.875
Step 2400, Minibatch Loss= 0.4588, Training Accuracy= 0.781
Step 2500, Minibatch Loss= 0.6086, Training Accuracy= 0.688
Step 2600, Minibatch Loss= 0.5364, Training Accuracy= 0.750
Step 2700, Minibatch Loss= 0.2958, Training Accuracy= 0.906

At step 4900, the accuracy reaches 96.9%:

Step 4900, Minibatch Loss= 0.1735, Training Accuracy= 0.969

Why?

  1. A high learning rate reduces the loss quickly at the beginning, but then makes it hard for training to converge.
  2. Your dropout is a bit too high: rate=0.75 means 75% of the features are dropped during training (illustrated right after this list).
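
To see what rate=0.75 actually does, here is a small self-contained check (the all-ones input is only for illustration); tf.layers.dropout interprets rate as the fraction of units to drop, not to keep:

import tensorflow as tf

x = tf.ones([1, 1000])
y = tf.layers.dropout(x, rate=0.75, training=True)

with tf.Session() as sess:
    out = sess.run(y)
    # roughly 75% of the entries are zeroed; survivors are scaled by 1/(1-rate) = 4
    print('fraction zeroed:', (out == 0).mean())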

In addition, you should split the dataset into separate training and test sets.
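
A minimal sketch of one way to do that with the existing pipeline, using a take/skip split over the shuffled raw records; NUM_TEST and the shuffle buffer size are assumptions about the dataset size, and writing separate train/validation tfrecord files would be the more robust choice:

raw = tf.data.TFRecordDataset('./01_cats_vs_dogs/train_dogs_cat.tfrecord')
# fixed seed and no reshuffling, so take/skip see the same order and stay disjoint
raw = raw.shuffle(buffer_size=25000, seed=0, reshuffle_each_iteration=False)

NUM_TEST = 5000  # assumed size of the held-out set
test_dataset = raw.take(NUM_TEST).map(_parse_function).batch(BATCHSIZE)
train_dataset = (raw.skip(NUM_TEST)
                    .map(_parse_function)
                    .repeat()
                    .batch(BATCHSIZE)
                    .prefetch(BATCHSIZE))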

Reference:

https://www.quora.com/Why-does-my-convolutional-neural-network-always-produce-the-same-outputs

Model configuration:

    conv1 = tf.layers.conv2d(x, 16, 3, activation=tf.nn.relu)
    conv1_1 = tf.layers.conv2d(conv1, 16, 3, activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1_1, 2, 2)

    conv2_1 = tf.layers.conv2d(pool1, 32, 3, activation=tf.nn.relu)
    conv2_2 = tf.layers.conv2d(conv2_1, 32, 3, activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(conv2_2, 2, 2)

    conv3_1 = tf.layers.conv2d(pool2, 64, 3, activation=tf.nn.relu)
    conv3_2 = tf.layers.conv2d(conv3_1, 64, 3, activation=tf.nn.relu)
    conv3_3 = tf.layers.conv2d(conv3_2, 128, 3, activation=tf.nn.relu)
    conv3_4 = tf.layers.conv2d(conv3_3, 128, 3, activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(conv3_4, 2, 2)

    conv4_1 = tf.layers.conv2d(pool3, 512, 3, activation=tf.nn.relu)
    conv4_2 = tf.layers.conv2d(conv4_1, 512, 3, activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(conv4_2, 2, 2)

    fc1 = tf.contrib.layers.flatten(pool4)
    fc1 = tf.layers.dense(fc1, 512)
    fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)
    fc2 = tf.layers.dense(fc1, 128)
    fc2 = tf.layers.dropout(fc2, rate=dropout, training=is_training)
    out = tf.layers.dense(fc2, n_classes)
    out = tf.nn.softmax(out) if not is_training else out
