繁体   English   中英

Keras 3D Convolution:检查模型输入时出错:预期covolution3d_input_1有5个维度,但得到阵列形状(1,90,100,100)

[英]Keras 3D Convolution: Error when checking model input: expected covolution3d_input_1 to have 5 dimensions, but got array shape (1, 90, 100, 100)

该系统的目标是对发音单词的视频输入进行分类。 每个样本都是一组90,100x100,灰度(1色通道帧,尺寸为(1, 90, 100, 100) 。以前,训练数据直接加载到内存并训练,但有效,但不是为了解决这个问题,系统被修改为预处理并将训练数据保存到HDF5文件中,然后使用生成器将训练数据拟合到模型中以进行按需加载。但是,由于此修改,现在会生成以下错误:

例外:检查模型输入时出错:预期convolution3d_input_1有5个维度,但得到的形状为数组(1,90,100,100)

这是系统的代码:

from keras import backend as K
from keras.callbacks import Callback
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution3D
from keras.layers.convolutional import MaxPooling3D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.utils.io_utils import HDF5Matrix
from pprint import pprint
from sklearn.utils import shuffle
K.set_image_dim_ordering("th")

import cv2
import h5py
import json
import os
import sys
import numpy as np

class OpticalSpeechRecognizer(object):
    def __init__(self, rows, columns, frames_per_sequence):
        self.rows = rows
        self.columns = columns
        self.frames_per_sequence = frames_per_sequence
        self.osr = None

    def train_osr_model(self, training_save_fn):
        """ Train the optical speech recognizer
        """
        print "\nTraining OSR"
        validation_ratio = 0.3
        training_sequence_generator = self.generate_training_sequences(training_save_fn)
        validation_sequence_generator = self.generate_training_sequences(training_save_fn, validation_ratio=validation_ratio)
        training_save_file = h5py.File(training_save_fn, "r")
        sample_count = training_save_file.attrs["sample_count"]
        pbi = PrintBatchInfo()
        self.osr.fit_generator(generator=training_sequence_generator,
                               validation_data=validation_sequence_generator,
                               samples_per_epoch=sample_count,
                               nb_val_samples=int(round(validation_ratio*sample_count)),
                               nb_epoch=10,
                               verbose=2,
                               callbacks=[pbi],
                               class_weight=None,
                               nb_worker=1)

    def generate_osr_model(self, training_save_fn):
        """ Builds the optical speech recognizer model
        """
        print "".join(["Generating OSR model\n",
                       "-"*40])
        training_save_file = h5py.File(training_save_fn, "r")
        osr = Sequential()
        print " - Adding convolution layers"
        osr.add(Convolution3D(nb_filter=32,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              input_shape=(1, self.frames_per_sequence, self.rows, self.columns),
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=32,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Convolution3D(nb_filter=64,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=64,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Convolution3D(nb_filter=128,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=128,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Flatten())
        osr.add(Dropout(0.2))
        print " - Adding fully connected layers"
        osr.add(Dense(output_dim=128,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=64,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=32,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=len(training_save_file.attrs["training_classes"].split(",")),
                      init="normal",
                      activation="softmax"))
        print " - Compiling model"
        sgd = SGD(lr=0.01,
                  decay=1e-6,
                  momentum=0.9,
                  nesterov=True)
        osr.compile(loss="categorical_crossentropy",
                    optimizer=sgd,
                    metrics=["accuracy"])
        self.osr = osr
        print " * OSR MODEL GENERATED * "

    def generate_training_sequences(self, training_save_fn, validation_ratio=0):
        while True:
            training_save_file = h5py.File(training_save_fn, "r")
            sample_count = int(training_save_file.attrs["sample_count"])
            # generate sequences for validation
            if validation_ratio:
                validation_sample_count = int(round(validation_ratio*sample_count))
                validation_sample_idxs = np.random.randint(low=0, high=sample_count, size=validation_sample_count)
                for idx in validation_sample_idxs:
                    X = training_save_file["X"][idx]
                    Y = training_save_file["Y"][idx]
                    yield (X, Y)
            # generate sequences for training
            else:
                for idx in range(0, sample_count):
                    X = training_save_file["X"][idx]
                    Y = training_save_file["Y"][idx]
                    yield (X, Y)

    def process_training_data(self, config_file, training_save_fn):
        """ Preprocesses training data and saves them into an HDF5 file
        """
        # load training metadata from config file
        training_metadata = {}
        training_classes = []
        with open(config_file) as training_config:
            training_metadata = json.load(training_config)
            training_classes = sorted(list(training_metadata.keys()))

            print "".join(["\n",
                           "Found {0} training classes!\n".format(len(training_classes)),
                           "-"*40])
            for class_label, training_class in enumerate(training_classes):
                print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class])
            print ""

        # count number of samples
        sample_count = 0
        sample_count_by_class = [0]*len(training_classes)
        for class_label, training_class in enumerate(training_classes):
            # get training class sequeunce paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # update sample count
            sample_count += len(training_class_sequence_paths)
            sample_count_by_class[class_label] = len(training_class_sequence_paths)

        print "".join(["\n",
                       "Found {0} training samples!\n".format(sample_count),
                       "-"*40])
        for class_label, training_class in enumerate(training_classes):
            print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label])
        print ""

        # initialize HDF5 save file, but clear older duplicate first if it exists
        try:
            print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(training_save_fn)
            os.remove(training_save_fn)
        except OSError:
            pass
        training_save_file = h5py.File(training_save_fn, "w")
        training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes))
        training_save_file.attrs["sample_count"] = sample_count
        x_training_dataset = training_save_file.create_dataset("X", 
                                                              shape=(sample_count, 1, self.frames_per_sequence, self.rows, self.columns),
                                                              dtype="f")
        y_training_dataset = training_save_file.create_dataset("Y",
                                                               shape=(sample_count, len(training_classes)),
                                                               dtype="i")

        # iterate through each class data
        sample_idx = 0
        for class_label, training_class in enumerate(training_classes):
            # get training class sequeunce paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # iterate through each sequence
            for idx, training_class_sequence_path in enumerate(training_class_sequence_paths):
                sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r"
                                 .format(training_class, idx+1, len(training_class_sequence_paths)))
                sys.stdout.flush()

                # append grayscale, normalized sample frames
                frames = self.process_frames(training_class_sequence_path)
                x_training_dataset[sample_idx] = [frames]

                # append one-hot encoded sample label
                label = [0]*len(training_classes)
                label[class_label] = 1
                y_training_dataset[sample_idx] = label

                # update sample index
                sample_idx += 1

            print "\n"

        training_save_file.close()

        print "Training data processed and saved to {0}".format(training_save_fn)

    def process_frames(self, video_file_path):
        """ Splits frames, resizes frames, converts RGB frames to greyscale, and normalizes frames
        """
        video = cv2.VideoCapture(video_file_path)
        success, frame = video.read()

        frames = []
        success = True

        # resize, convert to grayscale, normalize, and collect valid frames 
        while success:
          success, frame = video.read()
          if success:
            frame = cv2.resize(frame, (self.rows, self.columns))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = frame.astype('float32') / 255.0
            frames.append(frame)

        # pre-pad short sequences and equalize frame lengths
        if len(frames) < self.frames_per_sequence:
            frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames
        frames = frames[0:self.frames_per_sequence]

        return frames

class PrintBatchInfo(Callback):
    def on_batch_end(self, epoch, logs={}):
        print logs

if __name__ == "__main__":
    osr = OpticalSpeechRecognizer(100, 100, 90)
    osr.process_training_data("training_config.json", "training_data.h5")
    osr.generate_osr_model("training_data.h5")
    osr.train_osr_model("training_data.h5")

让我感到困惑的是,报告的输入维度是预期的输入维度,但它抱怨缺少第五维度。 对于每次迭代,生成器是否应该生成一批样本而不是单个样本以生成5维输出?

如果您要返回一个简单示例,则需要确保输出为5维且形状为: (batch_size, channels, frames, height, width) 这仅仅是因为每一层的维度都应该是固定的。 使这项工作最简单的方法是:

X = training_save_file["X"][[idx]]

使用此修复程序,您的输出应与预期形状匹配。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM