Keras 3D Convolution: Error when checking model input: expected convolution3d_input_1 to have 5 dimensions, but got array with shape (1, 90, 100, 100)

The goal of the system is to classify video input by which word was pronounced. Each sample is a set of 90 grayscale (single-colour-channel) frames of size 100x100, giving a sample shape of (1, 90, 100, 100). Previously, training data was loaded directly into memory and trained on, which worked but was not efficient and would have been impossible later on with more training samples. To work around this, the system was modified to preprocess and save training data into an HDF5 file, then fit the model with a generator that loads training data on demand. However, the following error is now generated as a result of this modification:

Exception: Error when checking model input: expected convolution3d_input_1 to have 5 dimensions, but got array with shape (1, 90, 100, 100)
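
For reference, here is the shape of a single stored sample versus the 5-dimensional batch shape the model expects (a minimal numpy sketch; the zero array is purely illustrative):

import numpy as np

# a single sample as stored in the HDF5 file: (channels, frames, rows, columns)
sample = np.zeros((1, 90, 100, 100), dtype="float32")

# what the model actually checks against: a leading batch axis, i.e.
# (batch_size, channels, frames, rows, columns)
batch = np.expand_dims(sample, axis=0)
print batch.shape  # (1, 1, 90, 100, 100)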

Here is the code for the system:

from keras import backend as K
from keras.callbacks import Callback
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution3D
from keras.layers.convolutional import MaxPooling3D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.utils.io_utils import HDF5Matrix
from pprint import pprint
from sklearn.utils import shuffle
K.set_image_dim_ordering("th")

import cv2
import h5py
import json
import os
import sys
import numpy as np

class OpticalSpeechRecognizer(object):
    def __init__(self, rows, columns, frames_per_sequence):
        self.rows = rows
        self.columns = columns
        self.frames_per_sequence = frames_per_sequence
        self.osr = None

    def train_osr_model(self, training_save_fn):
        """ Train the optical speech recognizer
        """
        print "\nTraining OSR"
        validation_ratio = 0.3
        training_sequence_generator = self.generate_training_sequences(training_save_fn)
        validation_sequence_generator = self.generate_training_sequences(training_save_fn, validation_ratio=validation_ratio)
        training_save_file = h5py.File(training_save_fn, "r")
        sample_count = training_save_file.attrs["sample_count"]
        pbi = PrintBatchInfo()
        self.osr.fit_generator(generator=training_sequence_generator,
                               validation_data=validation_sequence_generator,
                               samples_per_epoch=sample_count,
                               nb_val_samples=int(round(validation_ratio*sample_count)),
                               nb_epoch=10,
                               verbose=2,
                               callbacks=[pbi],
                               class_weight=None,
                               nb_worker=1)

    def generate_osr_model(self, training_save_fn):
        """ Builds the optical speech recognizer model
        """
        print "".join(["Generating OSR model\n",
                       "-"*40])
        training_save_file = h5py.File(training_save_fn, "r")
        osr = Sequential()
        print " - Adding convolution layers"
        osr.add(Convolution3D(nb_filter=32,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              input_shape=(1, self.frames_per_sequence, self.rows, self.columns),
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=32,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Convolution3D(nb_filter=64,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=64,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Convolution3D(nb_filter=128,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Convolution3D(nb_filter=128,
                              kernel_dim1=3,
                              kernel_dim2=3,
                              kernel_dim3=3,
                              border_mode="same",
                              activation="relu"))
        osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
        osr.add(Flatten())
        osr.add(Dropout(0.2))
        print " - Adding fully connected layers"
        osr.add(Dense(output_dim=128,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=64,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=32,
                      init="normal",
                      activation="relu"))
        osr.add(Dropout(0.2))
        osr.add(Dense(output_dim=len(training_save_file.attrs["training_classes"].split(",")),
                      init="normal",
                      activation="softmax"))
        print " - Compiling model"
        sgd = SGD(lr=0.01,
                  decay=1e-6,
                  momentum=0.9,
                  nesterov=True)
        osr.compile(loss="categorical_crossentropy",
                    optimizer=sgd,
                    metrics=["accuracy"])
        self.osr = osr
        print " * OSR MODEL GENERATED * "

    def generate_training_sequences(self, training_save_fn, validation_ratio=0):
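        """ Generates training sequences from the HDF5 save file on demand
        """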
        while True:
            training_save_file = h5py.File(training_save_fn, "r")
            sample_count = int(training_save_file.attrs["sample_count"])
            # generate sequences for validation
            if validation_ratio:
                validation_sample_count = int(round(validation_ratio*sample_count))
                validation_sample_idxs = np.random.randint(low=0, high=sample_count, size=validation_sample_count)
                for idx in validation_sample_idxs:
                    X = training_save_file["X"][idx]
                    Y = training_save_file["Y"][idx]
                    yield (X, Y)
            # generate sequences for training
            else:
                for idx in range(0, sample_count):
                    X = training_save_file["X"][idx]
                    Y = training_save_file["Y"][idx]
                    yield (X, Y)

    def process_training_data(self, config_file, training_save_fn):
        """ Preprocesses training data and saves them into an HDF5 file
        """
        # load training metadata from config file
        training_metadata = {}
        training_classes = []
        with open(config_file) as training_config:
            training_metadata = json.load(training_config)
            training_classes = sorted(list(training_metadata.keys()))

            print "".join(["\n",
                           "Found {0} training classes!\n".format(len(training_classes)),
                           "-"*40])
            for class_label, training_class in enumerate(training_classes):
                print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class])
            print ""

        # count number of samples
        sample_count = 0
        sample_count_by_class = [0]*len(training_classes)
        for class_label, training_class in enumerate(training_classes):
            # get training class sequence paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # update sample count
            sample_count += len(training_class_sequence_paths)
            sample_count_by_class[class_label] = len(training_class_sequence_paths)

        print "".join(["\n",
                       "Found {0} training samples!\n".format(sample_count),
                       "-"*40])
        for class_label, training_class in enumerate(training_classes):
            print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label])
        print ""

        # initialize HDF5 save file, but clear older duplicate first if it exists
        if os.path.isfile(training_save_fn):
            print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(training_save_fn)
            os.remove(training_save_fn)
        training_save_file = h5py.File(training_save_fn, "w")
        training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes))
        training_save_file.attrs["sample_count"] = sample_count
        x_training_dataset = training_save_file.create_dataset("X", 
                                                              shape=(sample_count, 1, self.frames_per_sequence, self.rows, self.columns),
                                                              dtype="f")
        y_training_dataset = training_save_file.create_dataset("Y",
                                                               shape=(sample_count, len(training_classes)),
                                                               dtype="i")

        # iterate through each class data
        sample_idx = 0
        for class_label, training_class in enumerate(training_classes):
            # get training class sequence paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # iterate through each sequence
            for idx, training_class_sequence_path in enumerate(training_class_sequence_paths):
                sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r"
                                 .format(training_class, idx+1, len(training_class_sequence_paths)))
                sys.stdout.flush()

                # append grayscale, normalized sample frames
                frames = self.process_frames(training_class_sequence_path)
                x_training_dataset[sample_idx] = [frames]

                # append one-hot encoded sample label
                label = [0]*len(training_classes)
                label[class_label] = 1
                y_training_dataset[sample_idx] = label

                # update sample index
                sample_idx += 1

            print "\n"

        training_save_file.close()

        print "Training data processed and saved to {0}".format(training_save_fn)

    def process_frames(self, video_file_path):
        """ Splits frames, resizes frames, converts RGB frames to greyscale, and normalizes frames
        """
        video = cv2.VideoCapture(video_file_path)

        frames = []
        success = True

        # resize, convert to grayscale, normalize, and collect valid frames
        while success:
            success, frame = video.read()
            if success:
                # cv2.resize expects the target size as (width, height), i.e. (columns, rows)
                frame = cv2.resize(frame, (self.columns, self.rows))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frame = frame.astype("float32") / 255.0
                frames.append(frame)

        # pre-pad short sequences and equalize frame lengths
        if len(frames) < self.frames_per_sequence:
            frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames
        frames = frames[0:self.frames_per_sequence]

        return frames

class PrintBatchInfo(Callback):
    def on_batch_end(self, batch, logs={}):
        print logs

if __name__ == "__main__":
    osr = OpticalSpeechRecognizer(100, 100, 90)
    osr.process_training_data("training_config.json", "training_data.h5")
    osr.generate_osr_model("training_data.h5")
    osr.train_osr_model("training_data.h5")

What confuses me is that the reported input dimensions are the expected input dimensions, but it is complaining about a missing 5th dimension. Should the generator yield a batch of samples instead of a single sample for every iteration to generate a 5-dimensional output?

Even when you are returning a single sample, you need to make sure that the generator's output is 5-dimensional, with shape (batch_size, channels, frames, height, width). This is simply because the dimensionality of every layer's input must be fixed. The simplest way to make this work is:

X = training_save_file["X"][[idx]]

With this fix, the generator's output should match the expected shape.
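
The label likely needs the same treatment, so that each yielded target has shape (1, nb_classes) rather than (nb_classes,):

Y = training_save_file["Y"][[idx]]

Beyond single-sample batches, here is one possible sketch of a generator that yields proper mini-batches instead, assuming the same HDF5 layout and a hypothetical batch_size parameter (the validation split is omitted for brevity). Note that Keras 1's samples_per_epoch counts samples, not batches, so the fit_generator call does not need to change:

def generate_training_sequences(self, training_save_fn, batch_size=8):
    """ Yields (X, Y) mini-batches with shapes (batch, 1, frames, rows, columns) and (batch, nb_classes) """
    while True:
        training_save_file = h5py.File(training_save_fn, "r")
        sample_count = int(training_save_file.attrs["sample_count"])
        for start in range(0, sample_count, batch_size):
            # index list is already in increasing order, as h5py fancy indexing requires
            batch_idxs = range(start, min(start + batch_size, sample_count))
            X = training_save_file["X"][batch_idxs]  # (batch, 1, frames, rows, columns)
            Y = training_save_file["Y"][batch_idxs]  # (batch, nb_classes)
            yield (X, Y)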
