
How to implement a CNN-LSTM using Keras

I am attempting to implement a CNN-LSTM that classifies mel-spectrogram images representing the speech of people with Parkinson's Disease/Healthy Controls. I am trying to combine a pre-existing model (DenseNet-169) with an LSTM model, however I am running into the following error: ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1]. Can anyone advise where I'm going wrong?

import librosa
import os
import glob
import IPython.display as ipd
from pathlib import Path
import timeit
import time, sys

%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display

import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import cv2
import seaborn as sns

# version 1 works without problems
%tensorflow_version 1.x
import tensorflow

from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, BatchNormalization, Activation, GaussianNoise, LSTM
from sklearn.metrics import accuracy_score

DATA_DIR = Path('/content/drive/MyDrive/PhD_Project_Experiments/Spontaneous_Dialogue_PD_Dataset') 
diagnosis = [x.name for x in DATA_DIR.glob('*') if x.is_dir()]
diagnosis

def create_paths_ds(paths: Path, label: str) -> list:
    EXTENSION_TYPE = '.wav'
    return [(x, label) for x in paths.glob('*' + EXTENSION_TYPE)]

from collections import Counter

categories_to_use = [
    'Parkinsons_Disease',
    'Healthy_Control',
]

NUM_CLASSES = len(categories_to_use)

print(f'Number of classes: {NUM_CLASSES}')

paths_all_labels = []
for cat in categories_to_use:
    paths_all_labels += create_paths_ds(DATA_DIR / cat, cat)
 
X_train, X_test = train_test_split(paths_all_labels,test_size=0.1, stratify = [paths_all_labels[y][1] for y in range(len(paths_all_labels))] ) #fix stratified sampling for test data
X_train, X_val = train_test_split(X_train, test_size=0.2, stratify = [X_train[y][1] for y in range(len(X_train))] ) 

for i in categories_to_use:
  print('Number of train samples for '+i+': '+ str([X_train[y][1] for y in range(len(X_train))].count(i))) #checks whether train samples are equally divided
  print('Number of test samples for '+i+': '+ str([X_test[y][1] for y in range(len(X_test))].count(i))) #checks whether test samples are equally divided
  print('Number of validation samples for '+i+': '+ str([X_val[y][1] for y in range(len(X_val))].count(i))) #checks whether val samples are equally divided

print(f'Train length: {len(X_train)}')
print(f'Validation length: {len(X_val)}')
print(f'Test length: {len(X_test)}')

def load_and_preprocess_lstm(dataset, SAMPLE_SIZE = 30):
    IMG_SIZE = (216,128) 
    progress=0

    data = []
    labels = []
    for (path, label) in dataset:
        audio, sr = librosa.load(path)
        dur = librosa.get_duration(audio, sr = sr)
        sampleNum = int(dur / SAMPLE_SIZE)
        offset = (dur % SAMPLE_SIZE) / 2
        for i in range(sampleNum):
            audio, sr = librosa.load(path, offset=offset + i * SAMPLE_SIZE, duration=SAMPLE_SIZE)
            sample = librosa.feature.melspectrogram(audio, sr=sr)
            # print(sample.shape)
            sample = cv2.resize(sample, dsize=IMG_SIZE)
            sample = np.expand_dims(sample,-1)
            print(sample.shape)
            data += [(sample, label)]
            labels += [label]
        progress +=1
        print('\r Progress: '+str(round(100*progress/len(dataset))) + '%', end='')
    return data, labels

def retrieve_samples(sample_size, model_type):

    if model_type == 'cnn':
  
        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_cnn(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_cnn(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_cnn(X_val,sample_size)
        print('\n')

    elif model_type == 'lstm':

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    elif model_type == "cnnlstm":

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    print("shape: " + str(X_train_samples[0][0].shape))
    print("number of training samples: "+ str(len(X_train_samples)))
    print("number of validation samples: "+ str(len(X_val_samples)))
    print("number of test samples: "+ str(len(X_test_samples)))


    return X_train_samples, X_test_samples, X_val_samples

def create_cnn_lstm_model(input_shape):

    model = Sequential()
    cnn = tensorflow.keras.applications.DenseNet169(include_top=True, weights=None, input_tensor=None, input_shape=input_shape, pooling=None, classes=2)
    # define LSTM model
    model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = True, input_shape = input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = False))
    model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))

    # Compile
    model.compile(loss=tensorflow.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    return model

def create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples):
    #Prepare samples to work for training the model
    labelizer = LabelEncoder()

    #prepare training data and labels
    x_train = np.array([x[0] for x in X_train_samples])
    y_train = np.array([x[1] for x in X_train_samples])
    y_train = labelizer.fit_transform(y_train) 
    y_train = to_categorical(y_train)

    #prepare validation data and labels
    x_val = np.array([x[0] for x in X_val_samples])
    y_val = np.array([x[1] for x in X_val_samples])
    y_val = labelizer.transform(y_val)
    y_val = to_categorical(y_val)

    #prepare test data and labels
    x_test = np.array([x[0] for x in X_test_samples])
    y_test = np.array([x[1] for x in X_test_samples])
    y_test = labelizer.transform(y_test)
    y_test = to_categorical(y_test)

    return x_train, y_train, x_val, y_val, x_test, y_test, labelizer


#Main loop for testing multiple sample sizes

#choose model type: 'cnn', 'lstm' or 'cnnlstm'
model_type = 'cnnlstm'

n_epochs = 20
patience = 20
es = EarlyStopping(patience=patience)
fragment_sizes = [5,10]
start = timeit.default_timer()

ModelData = pd.DataFrame(columns = ['Model Type','Fragment size (s)', 'Time to Compute (s)',  'Early Stopping epoch', 'Training accuracy', 'Validation accuracy', 'Test Accuracy']) #create a DataFrame for storing the results 

conf_matrix_data = []

for i in fragment_sizes:

    start_per_size = timeit.default_timer()

    print(f'\n---------- Model trained on fragments of size: {i} seconds ----------------')
    X_train_samples, X_test_samples, X_val_samples = retrieve_samples(i,model_type)
    x_train, y_train, x_val, y_val, x_test, y_test, labelizer = create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples)

    if model_type == 'cnn':
        model = create_cnn_model(X_train_samples[0][0].shape)
    elif model_type == 'lstm':
        model = create_lstm_model(X_train_samples[0][0].shape)
    elif model_type == 'cnnlstm':
        model = create_cnn_lstm_model(X_train_samples[0][0].shape)


    history = model.fit(x_train, y_train, 
              batch_size = 8, 
              epochs=n_epochs,
              verbose=1, 
              callbacks=[es],
              validation_data=(x_val, y_val))
    print('Finished training')


    early_stopping_epoch = len(history.history['accuracy'])
    training_accuracy = history.history['accuracy'][early_stopping_epoch-1-patience]
    validation_accuracy = history.history['val_accuracy'][early_stopping_epoch-1-patience]

    plot_data(history, i)

    predictions = model.predict(x_test)
    score = accuracy_score(labelizer.inverse_transform(y_test.argmax(axis=1)), labelizer.inverse_transform(predictions.argmax(axis=1)))

    print('Fragment size = ' + str(i) + ' seconds')
    print('Accuracy on test samples: ' + str(score))
    
    conf_matrix_data += [(predictions, y_test, i)]

    stop_per_size = timeit.default_timer()
    time_to_compute = round(stop_per_size - start_per_size)

    print ('Time to compute: '+str(time_to_compute))

    ModelData.loc[len(ModelData)] = [model_type, i, time_to_compute, early_stopping_epoch, training_accuracy, validation_accuracy, score] #store this settings configuration, early stopping epoch and accuracies in the dataframe

stop = timeit.default_timer()
print ('\ntime to compute: '+str(stop-start))

I believe the input_shape is (128, 216, 1).

The issue here is that you don't have a time axis to time-distribute your CNN (DenseNet169) layer over.

In this step -

model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=(128,216,1)))

you are passing the 128-dimension axis as the time axis. That means each CNN (DenseNet169) application is left with an input shape of (216,1), which is not an image, and it therefore throws an error because it expects 3D tensors (images), not 2D tensors.

Your per-sample input shape needs to be a 4D tensor, something like (10, 128, 216, 1), so that the 10 becomes the time axis (for time distributing) and (128, 216, 1) becomes the image input for the CNN (DenseNet169).
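For a fixed number of frames per sample, a minimal sketch could look like the following (dummy data; the 10-frame TIME_STEPS and the small LSTM/Dense head are assumptions, not your exact setup):

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model

TIME_STEPS = 10  # assumed number of spectrogram frames per sample

#Dummy batch shaped (batch, time, height, width, channels)
x = np.random.random((2, TIME_STEPS, 128, 216, 1)).astype('float32')

cnn = tf.keras.applications.DenseNet169(include_top=True, weights=None,
                                        input_shape=(128, 216, 1), classes=2)

inp = layers.Input((TIME_STEPS, 128, 216, 1))
h = layers.TimeDistributed(cnn)(inp)  # -> (batch, TIME_STEPS, 2)
h = layers.LSTM(8)(h)                 # consumes the time axis
out = layers.Dense(2, activation='softmax')(h)
model = Model(inp, out)

print(model(x).shape)  # (2, 2)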


A solution with ragged tensors and a time-distributed layer

IIUC, your data contains n audio files, each file containing a variable number of mel-spectrogram images.

  1. You need to use tf.RaggedTensor to be able to work with variable tensor shapes as inputs to the model.
  2. This requires an explicit definition of an Input layer where you set ragged=True.
  3. This allows you to pass each audio file as a single sample, with a variable number of images, each of which will be time-distributed.
  4. You will have to use None as the time-distributed axis shape while defining the model.

1. Creating a dummy dataset

Let's start with a sample dataset -

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, utils, applications


#Assuming there are 5 audio files
num_audio = 5

data = []

#Create a random number of mel-spectrograms for each audio file
for i in range(num_audio):
    n_images = np.random.randint(4,10)
    data.append(np.random.random((n_images,128,216,1)))
    
print([i.shape for i in data])
[(5, 128, 216, 1), 
 (5, 128, 216, 1), 
 (9, 128, 216, 1), 
 (6, 128, 216, 1), 
 (4, 128, 216, 1)]

So, your data should look something like this. Here, I have a dummy dataset with 5 audio files; the first one has 5 images of shape (128,216,1), while the last one has 4 images of the same shape.

2. Converting them to ragged tensors

Next, let's convert and store these as ragged tensors. Ragged tensors allow variable-length objects to be stored - in this case, a variable number of images per audio file. You can read more about them in the TensorFlow ragged tensor guide.
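As a tiny standalone illustration of the idea (hypothetical values, separate from the pipeline below):

rt = tf.ragged.constant([[1, 2, 3], [4], [5, 6]])
print(rt.shape)          # (3, None) - the second axis is variable-length
print(rt.row_lengths())  # [3 1 2]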


#Convert each set of images (for each audio file) to a tensor, then stack them into a ragged tensor
tensors = [tf.convert_to_tensor(i) for i in data]
X_train = tf.ragged.stack(tensors).to_tensor()

#Create a dummy y_train, one label per audio file
y_train = tf.convert_to_tensor(np.random.randint(0, 2, (5, 2)))
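A quick sanity check on the shapes (the padded size depends on the random dummy data above, so yours may differ):

ragged = tf.ragged.stack(tensors)
print(ragged.shape)   # (5, None, 128, 216, 1) - variable number of images per file
print(X_train.shape)  # e.g. (5, 9, 128, 216, 1) after .to_tensor() zero-pads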

3. Create a model

I am using the functional API since I find it more readable and it works better with an explicit input layer, but you can use input layers in the Sequential API as well. Feel free to convert it to your preference.

Notice that I am using (None,128,216,1) as the input shape. With the implicit first axis for batches, this creates a 5-axis input - (batch of audio files, images per file, h, w, channels).

I have a dummy LSTM layer to showcase how the architecture works; feel free to stack more layers. Also note that your DenseNet169 returns only 2 features (classes=2 with include_top=True). Therefore, the TimeDistributed layer returns a (None, None, 2)-shaped tensor, where the first None is the number of audio files and the second None is the number of images (the time axis). Choose your next layers accordingly - 512 LSTM cells may be too much :)

#Create model
inp = layers.Input((None,128,216,1), ragged=True)

cnn = applications.DenseNet169(include_top=True, 
                               weights=None, 
                               input_tensor=None, 
                               input_shape=(128,216,1), #<----- input shape for the CNN is just the image
                               pooling=None, classes=2)


#Feel free to modify these layers!
x = layers.TimeDistributed(cnn)(inp)
x = layers.LSTM(8)(x)
out = layers.Dense(2)(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics='accuracy')

utils.plot_model(model, show_shapes=True, show_layer_names=False)

[plot_model diagram: InputLayer -> TimeDistributed(DenseNet169) -> LSTM -> Dense]

4. Train!

The next step is simply to train. Feel free to add your own parameters.

model.fit(X_train, y_train, epochs=2)
Epoch 1/2
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_train_function.<locals>.train_function at 0x7f8e55b4fe50> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
1/1 [==============================] - 37s 37s/step - loss: 3.4057 - accuracy: 0.4000
Epoch 2/2
1/1 [==============================] - 16s 16s/step - loss: 3.3544 - accuracy: 0.4000
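Once trained, inference on a new file with a different number of frames works the same way. A hypothetical example with 7 frames:

#Hypothetical new audio file with 7 mel-spectrogram frames
new_audio = np.random.random((1, 7, 128, 216, 1)).astype('float32')
preds = model.predict(tf.RaggedTensor.from_tensor(new_audio))
print(preds.shape)  # (1, 2)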

Hope that helps.
